Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  2472
1 file changed, 1347 insertions, 1125 deletions
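For orientation before the raw diff: among many other changes, these hunks convert the mmu_gather API from a pointer handed back by tlb_gather_mmu(mm, fullmm) to an on-stack structure that is initialised with an explicit address range and torn down with tlb_finish_mmu(). The sketch below only mirrors the new zap_page_range_single() caller pattern that appears later in this diff; the wrapper name example_zap_range() is invented for illustration, and the helpers it calls (tlb_gather_mmu(), unmap_single_vma(), tlb_finish_mmu() and friends) are the ones this patch itself defines or updates in mm/memory.c.

/* Illustrative sketch of the new on-stack mmu_gather calling convention. */
static void example_zap_range(struct vm_area_struct *vma,
			      unsigned long address, unsigned long size)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;			/* lives on the caller's stack now */
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);	/* takes the range, returns void */
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, NULL);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);	/* flush TLBs, free batched pages */
}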
diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed1..8b44f765b64 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,7 +47,7 @@  #include <linux/pagemap.h>  #include <linux/ksm.h>  #include <linux/rmap.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/delayacct.h>  #include <linux/init.h>  #include <linux/writeback.h> @@ -57,6 +57,10 @@  #include <linux/swapops.h>  #include <linux/elf.h>  #include <linux/gfp.h> +#include <linux/migrate.h> +#include <linux/string.h> +#include <linux/dma-debug.h> +#include <linux/debugfs.h>  #include <asm/io.h>  #include <asm/pgalloc.h> @@ -67,6 +71,10 @@  #include "internal.h" +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. +#endif +  #ifndef CONFIG_NEED_MULTIPLE_NODES  /* use the per-pgdat data instead for discontigmem - mbligh */  unsigned long max_mapnr; @@ -76,7 +84,6 @@ EXPORT_SYMBOL(max_mapnr);  EXPORT_SYMBOL(mem_map);  #endif -unsigned long num_physpages;  /*   * A number of key systems in x86 including ioremap() rely on the assumption   * that high_memory defines the upper bound on direct map memory, then end @@ -86,7 +93,6 @@ unsigned long num_physpages;   */  void * high_memory; -EXPORT_SYMBOL(num_physpages);  EXPORT_SYMBOL(high_memory);  /* @@ -125,17 +131,17 @@ core_initcall(init_zero_pfn);  #if defined(SPLIT_RSS_COUNTING) -static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +void sync_mm_rss(struct mm_struct *mm)  {  	int i;  	for (i = 0; i < NR_MM_COUNTERS; i++) { -		if (task->rss_stat.count[i]) { -			add_mm_counter(mm, i, task->rss_stat.count[i]); -			task->rss_stat.count[i] = 0; +		if (current->rss_stat.count[i]) { +			add_mm_counter(mm, i, current->rss_stat.count[i]); +			current->rss_stat.count[i] = 0;  		}  	} -	task->rss_stat.events = 0; +	current->rss_stat.events = 0;  }  static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) @@ -157,66 +163,226 @@ static void check_sync_rss_stat(struct task_struct *task)  	if (unlikely(task != current))  		return;  	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) -		__sync_task_rss_stat(task, task->mm); +		sync_mm_rss(task->mm);  } +#else /* SPLIT_RSS_COUNTING */ -unsigned long get_mm_counter(struct mm_struct *mm, int member) +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task)  { -	long val = 0; +} -	/* -	 * Don't use task->mm here...for avoiding to use task_get_mm().. -	 * The caller must guarantee task->mm is not invalid. -	 */ -	val = atomic_long_read(&mm->rss_stat.count[member]); -	/* -	 * counter is updated in asynchronous manner and may go to minus. -	 * But it's never be expected number for users. 
-	 */ -	if (val < 0) +#endif /* SPLIT_RSS_COUNTING */ + +#ifdef HAVE_GENERIC_MMU_GATHER + +static int tlb_next_batch(struct mmu_gather *tlb) +{ +	struct mmu_gather_batch *batch; + +	batch = tlb->active; +	if (batch->next) { +		tlb->active = batch->next; +		return 1; +	} + +	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)  		return 0; -	return (unsigned long)val; + +	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); +	if (!batch) +		return 0; + +	tlb->batch_count++; +	batch->next = NULL; +	batch->nr   = 0; +	batch->max  = MAX_GATHER_BATCH; + +	tlb->active->next = batch; +	tlb->active = batch; + +	return 1; +} + +/* tlb_gather_mmu + *	Called to initialize an (on-stack) mmu_gather structure for page-table + *	tear-down from @mm. The @fullmm argument is used when @mm is without + *	users and we're going to destroy the full address space (exit/execve). + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +{ +	tlb->mm = mm; + +	/* Is it from 0 to ~0? */ +	tlb->fullmm     = !(start | (end+1)); +	tlb->need_flush_all = 0; +	tlb->start	= start; +	tlb->end	= end; +	tlb->need_flush = 0; +	tlb->local.next = NULL; +	tlb->local.nr   = 0; +	tlb->local.max  = ARRAY_SIZE(tlb->__pages); +	tlb->active     = &tlb->local; +	tlb->batch_count = 0; + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE +	tlb->batch = NULL; +#endif  } -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)  { -	__sync_task_rss_stat(task, mm); +	tlb->need_flush = 0; +	tlb_flush(tlb); +#ifdef CONFIG_HAVE_RCU_TABLE_FREE +	tlb_table_flush(tlb); +#endif  } -#else -#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) -#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ +	struct mmu_gather_batch *batch; -static void check_sync_rss_stat(struct task_struct *task) +	for (batch = &tlb->local; batch; batch = batch->next) { +		free_pages_and_swap_cache(batch->pages, batch->nr); +		batch->nr = 0; +	} +	tlb->active = &tlb->local; +} + +void tlb_flush_mmu(struct mmu_gather *tlb)  { +	if (!tlb->need_flush) +		return; +	tlb_flush_mmu_tlbonly(tlb); +	tlb_flush_mmu_free(tlb);  } -#endif +/* tlb_finish_mmu + *	Called at the end of the shootdown operation to free up any resources + *	that were required. + */ +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ +	struct mmu_gather_batch *batch, *next; + +	tlb_flush_mmu(tlb); + +	/* keep the page table cache within bounds */ +	check_pgt_cache(); + +	for (batch = tlb->local.next; batch; batch = next) { +		next = batch->next; +		free_pages((unsigned long)batch, 0); +	} +	tlb->local.next = NULL; +} + +/* __tlb_remove_page + *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while + *	handling the additional races in SMP caused by other CPUs caching valid + *	mappings in their TLBs. Returns the number of free page slots left. + *	When out of page slots we must call tlb_flush_mmu(). 
+ */ +int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ +	struct mmu_gather_batch *batch; + +	VM_BUG_ON(!tlb->need_flush); + +	batch = tlb->active; +	batch->pages[batch->nr++] = page; +	if (batch->nr == batch->max) { +		if (!tlb_next_batch(tlb)) +			return 0; +		batch = tlb->active; +	} +	VM_BUG_ON_PAGE(batch->nr > batch->max, page); + +	return batch->max - batch->nr; +} + +#endif /* HAVE_GENERIC_MMU_GATHER */ + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE  /* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none.  Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. + * See the comment near struct mmu_table_batch.   */ -void pgd_clear_bad(pgd_t *pgd) +static void tlb_remove_table_smp_sync(void *arg)  { -	pgd_ERROR(*pgd); -	pgd_clear(pgd); +	/* Simply deliver the interrupt */  } -void pud_clear_bad(pud_t *pud) +static void tlb_remove_table_one(void *table)  { -	pud_ERROR(*pud); -	pud_clear(pud); +	/* +	 * This isn't an RCU grace period and hence the page-tables cannot be +	 * assumed to be actually RCU-freed. +	 * +	 * It is however sufficient for software page-table walkers that rely on +	 * IRQ disabling. See the comment near struct mmu_table_batch. +	 */ +	smp_call_function(tlb_remove_table_smp_sync, NULL, 1); +	__tlb_remove_table(table);  } -void pmd_clear_bad(pmd_t *pmd) +static void tlb_remove_table_rcu(struct rcu_head *head)  { -	pmd_ERROR(*pmd); -	pmd_clear(pmd); +	struct mmu_table_batch *batch; +	int i; + +	batch = container_of(head, struct mmu_table_batch, rcu); + +	for (i = 0; i < batch->nr; i++) +		__tlb_remove_table(batch->tables[i]); + +	free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ +	struct mmu_table_batch **batch = &tlb->batch; + +	if (*batch) { +		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); +		*batch = NULL; +	}  } +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ +	struct mmu_table_batch **batch = &tlb->batch; + +	tlb->need_flush = 1; + +	/* +	 * When there's less then two users of this mm there cannot be a +	 * concurrent page-table walk. +	 */ +	if (atomic_read(&tlb->mm->mm_users) < 2) { +		__tlb_remove_table(table); +		return; +	} + +	if (*batch == NULL) { +		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); +		if (*batch == NULL) { +			tlb_remove_table_one(table); +			return; +		} +		(*batch)->nr = 0; +	} +	(*batch)->tables[(*batch)->nr++] = table; +	if ((*batch)->nr == MAX_TABLE_BATCH) +		tlb_table_flush(tlb); +} + +#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +  /*   * Note: this doesn't free the actual pages themselves. That   * has been handled earlier when unmapping all the memory regions. @@ -227,7 +393,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,  	pgtable_t token = pmd_pgtable(*pmd);  	pmd_clear(pmd);  	pte_free_tlb(tlb, token, addr); -	tlb->mm->nr_ptes--; +	atomic_long_dec(&tlb->mm->nr_ptes);  }  static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -298,8 +464,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,  /*   * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held.   
*/  void free_pgd_range(struct mmu_gather *tlb,  			unsigned long addr, unsigned long end, @@ -394,9 +558,12 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,  	}  } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, +		pmd_t *pmd, unsigned long address)  { +	spinlock_t *ptl;  	pgtable_t new = pte_alloc_one(mm, address); +	int wait_split_huge_page;  	if (!new)  		return -ENOMEM; @@ -415,15 +582,19 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)  	 */  	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ -	spin_lock(&mm->page_table_lock); -	if (!pmd_present(*pmd)) {	/* Has another populated it ? */ -		mm->nr_ptes++; +	ptl = pmd_lock(mm, pmd); +	wait_split_huge_page = 0; +	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */ +		atomic_long_inc(&mm->nr_ptes);  		pmd_populate(mm, pmd, new);  		new = NULL; -	} -	spin_unlock(&mm->page_table_lock); +	} else if (unlikely(pmd_trans_splitting(*pmd))) +		wait_split_huge_page = 1; +	spin_unlock(ptl);  	if (new)  		pte_free(mm, new); +	if (wait_split_huge_page) +		wait_split_huge_page(vma->anon_vma, pmd);  	return 0;  } @@ -436,10 +607,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)  	smp_wmb(); /* See comment in __pte_alloc */  	spin_lock(&init_mm.page_table_lock); -	if (!pmd_present(*pmd)) {	/* Has another populated it ? */ +	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */  		pmd_populate_kernel(&init_mm, pmd, new);  		new = NULL; -	} +	} else +		VM_BUG_ON(pmd_trans_splitting(*pmd));  	spin_unlock(&init_mm.page_table_lock);  	if (new)  		pte_free_kernel(&init_mm, new); @@ -456,7 +628,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)  	int i;  	if (current->mm == mm) -		sync_mm_rss(current, mm); +		sync_mm_rss(mm);  	for (i = 0; i < NR_MM_COUNTERS; i++)  		if (rss[i])  			add_mm_counter(mm, i, rss[i]); @@ -509,7 +681,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,  		current->comm,  		(long long)pte_val(pte), (long long)pmd_val(*pmd));  	if (page) -		dump_page(page); +		dump_page(page, "bad pte");  	printk(KERN_ALERT  		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",  		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); @@ -517,34 +689,15 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,  	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y  	 */  	if (vma->vm_ops) -		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", -				(unsigned long)vma->vm_ops->fault); -	if (vma->vm_file && vma->vm_file->f_op) -		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", -				(unsigned long)vma->vm_file->f_op->mmap); +		printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", +		       vma->vm_ops->fault); +	if (vma->vm_file) +		printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", +		       vma->vm_file->f_op->mmap);  	dump_stack(); -	add_taint(TAINT_BAD_PAGE); +	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);  } -static inline int is_cow_mapping(unsigned int flags) -{ -	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - -#ifndef is_zero_pfn -static inline int is_zero_pfn(unsigned long pfn) -{ -	return pfn == zero_pfn; -} -#endif - -#ifndef my_zero_pfn -static inline unsigned long my_zero_pfn(unsigned long addr) -{ -	return zero_pfn; -} -#endif -  /*   * vm_normal_page -- This function gets the "struct page" associated with a pte.   
* @@ -598,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,  	unsigned long pfn = pte_pfn(pte);  	if (HAVE_PTE_SPECIAL) { -		if (likely(!pte_special(pte))) +		if (likely(!pte_special(pte) || pte_numa(pte)))  			goto check_pfn;  		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))  			return NULL; @@ -624,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,  		}  	} -	if (is_zero_pfn(pfn)) -		return NULL;  check_pfn:  	if (unlikely(pfn > highest_memmap_pfn)) {  		print_bad_pte(vma, addr, pte, NULL);  		return NULL;  	} +	if (is_zero_pfn(pfn)) +		return NULL; +  	/*  	 * NOTE! We still have PageReserved() pages in the page tables.  	 * eg. VDSO mappings can cause them to exist. @@ -673,15 +827,26 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,  			}  			if (likely(!non_swap_entry(entry)))  				rss[MM_SWAPENTS]++; -			else if (is_write_migration_entry(entry) && -					is_cow_mapping(vm_flags)) { -				/* -				 * COW mappings require pages in both parent -				 * and child to be set to read. -				 */ -				make_migration_entry_read(&entry); -				pte = swp_entry_to_pte(entry); -				set_pte_at(src_mm, addr, src_pte, pte); +			else if (is_migration_entry(entry)) { +				page = migration_entry_to_page(entry); + +				if (PageAnon(page)) +					rss[MM_ANONPAGES]++; +				else +					rss[MM_FILEPAGES]++; + +				if (is_write_migration_entry(entry) && +				    is_cow_mapping(vm_flags)) { +					/* +					 * COW mappings require pages in both +					 * parent and child to be set to read. +					 */ +					make_migration_entry_read(&entry); +					pte = swp_entry_to_pte(entry); +					if (pte_swp_soft_dirty(*src_pte)) +						pte = pte_swp_mksoft_dirty(pte); +					set_pte_at(src_mm, addr, src_pte, pte); +				}  			}  		}  		goto out_set_pte; @@ -719,9 +884,9 @@ out_set_pte:  	return 0;  } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, -		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, -		unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, +		   unsigned long addr, unsigned long end)  {  	pte_t *orig_src_pte, *orig_dst_pte;  	pte_t *src_pte, *dst_pte; @@ -795,6 +960,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src  	src_pmd = pmd_offset(src_pud, addr);  	do {  		next = pmd_addr_end(addr, end); +		if (pmd_trans_huge(*src_pmd)) { +			int err; +			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); +			err = copy_huge_pmd(dst_mm, src_mm, +					    dst_pmd, src_pmd, addr, vma); +			if (err == -ENOMEM) +				return -ENOMEM; +			if (!err) +				continue; +			/* fall through */ +		}  		if (pmd_none_or_clear_bad(src_pmd))  			continue;  		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -833,6 +1009,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,  	unsigned long next;  	unsigned long addr = vma->vm_start;  	unsigned long end = vma->vm_end; +	unsigned long mmun_start;	/* For mmu_notifiers */ +	unsigned long mmun_end;		/* For mmu_notifiers */ +	bool is_cow;  	int ret;  	/* @@ -841,7 +1020,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,  	 * readonly mappings. The tradeoff is that copy_page_range is more  	 * efficient than faulting.  	 
*/ -	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { +	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | +			       VM_PFNMAP | VM_MIXEDMAP))) {  		if (!vma->anon_vma)  			return 0;  	} @@ -849,12 +1029,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,  	if (is_vm_hugetlb_page(vma))  		return copy_hugetlb_page_range(dst_mm, src_mm, vma); -	if (unlikely(is_pfn_mapping(vma))) { +	if (unlikely(vma->vm_flags & VM_PFNMAP)) {  		/*  		 * We do not free on error cases below as remove_vma  		 * gets called on error from higher level routine  		 */ -		ret = track_pfn_vma_copy(vma); +		ret = track_pfn_copy(vma);  		if (ret)  			return ret;  	} @@ -865,8 +1045,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,  	 * parent mm. And a permission downgrade will only happen if  	 * is_cow_mapping() returns true.  	 */ -	if (is_cow_mapping(vma->vm_flags)) -		mmu_notifier_invalidate_range_start(src_mm, addr, end); +	is_cow = is_cow_mapping(vma->vm_flags); +	mmun_start = addr; +	mmun_end   = end; +	if (is_cow) +		mmu_notifier_invalidate_range_start(src_mm, mmun_start, +						    mmun_end);  	ret = 0;  	dst_pgd = pgd_offset(dst_mm, addr); @@ -882,35 +1066,34 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,  		}  	} while (dst_pgd++, src_pgd++, addr = next, addr != end); -	if (is_cow_mapping(vma->vm_flags)) -		mmu_notifier_invalidate_range_end(src_mm, -						  vma->vm_start, end); +	if (is_cow) +		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);  	return ret;  }  static unsigned long zap_pte_range(struct mmu_gather *tlb,  				struct vm_area_struct *vma, pmd_t *pmd,  				unsigned long addr, unsigned long end, -				long *zap_work, struct zap_details *details) +				struct zap_details *details)  {  	struct mm_struct *mm = tlb->mm; -	pte_t *pte; -	spinlock_t *ptl; +	int force_flush = 0;  	int rss[NR_MM_COUNTERS]; +	spinlock_t *ptl; +	pte_t *start_pte; +	pte_t *pte; +again:  	init_rss_vec(rss); - -	pte = pte_offset_map_lock(mm, pmd, addr, &ptl); +	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); +	pte = start_pte;  	arch_enter_lazy_mmu_mode();  	do {  		pte_t ptent = *pte;  		if (pte_none(ptent)) { -			(*zap_work)--;  			continue;  		} -		(*zap_work) -= PAGE_SIZE; -  		if (pte_present(ptent)) {  			struct page *page; @@ -940,23 +1123,31 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,  				continue;  			if (unlikely(details) && details->nonlinear_vma  			    && linear_page_index(details->nonlinear_vma, -						addr) != page->index) -				set_pte_at(mm, addr, pte, -					   pgoff_to_pte(page->index)); +						addr) != page->index) { +				pte_t ptfile = pgoff_to_pte(page->index); +				if (pte_soft_dirty(ptent)) +					pte_file_mksoft_dirty(ptfile); +				set_pte_at(mm, addr, pte, ptfile); +			}  			if (PageAnon(page))  				rss[MM_ANONPAGES]--;  			else { -				if (pte_dirty(ptent)) +				if (pte_dirty(ptent)) { +					force_flush = 1;  					set_page_dirty(page); +				}  				if (pte_young(ptent) && -				    likely(!VM_SequentialReadHint(vma))) +				    likely(!(vma->vm_flags & VM_SEQ_READ)))  					mark_page_accessed(page);  				rss[MM_FILEPAGES]--;  			}  			page_remove_rmap(page);  			if (unlikely(page_mapcount(page) < 0))  				print_bad_pte(vma, addr, ptent, page); -			tlb_remove_page(tlb, page); +			if (unlikely(!__tlb_remove_page(tlb, page))) { +				force_flush = 1; +				break; +			}  			continue;  		}  		/* @@ -973,15 +1164,55 @@ static unsigned long zap_pte_range(struct mmu_gather 
*tlb,  			if (!non_swap_entry(entry))  				rss[MM_SWAPENTS]--; +			else if (is_migration_entry(entry)) { +				struct page *page; + +				page = migration_entry_to_page(entry); + +				if (PageAnon(page)) +					rss[MM_ANONPAGES]--; +				else +					rss[MM_FILEPAGES]--; +			}  			if (unlikely(!free_swap_and_cache(entry)))  				print_bad_pte(vma, addr, ptent, NULL);  		}  		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); -	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); +	} while (pte++, addr += PAGE_SIZE, addr != end);  	add_mm_rss_vec(mm, rss);  	arch_leave_lazy_mmu_mode(); -	pte_unmap_unlock(pte - 1, ptl); + +	/* Do the actual TLB flush before dropping ptl */ +	if (force_flush) { +		unsigned long old_end; + +		/* +		 * Flush the TLB just for the previous segment, +		 * then update the range to be the remaining +		 * TLB range. +		 */ +		old_end = tlb->end; +		tlb->end = addr; +		tlb_flush_mmu_tlbonly(tlb); +		tlb->start = addr; +		tlb->end = old_end; +	} +	pte_unmap_unlock(start_pte, ptl); + +	/* +	 * If we forced a TLB flush (either due to running out of +	 * batch buffers or because we needed to flush dirty TLB +	 * entries before releasing the ptl), free the batched +	 * memory too. Restart if we didn't do everything. +	 */ +	if (force_flush) { +		force_flush = 0; +		tlb_flush_mmu_free(tlb); + +		if (addr != end) +			goto again; +	}  	return addr;  } @@ -989,7 +1220,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,  static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,  				struct vm_area_struct *vma, pud_t *pud,  				unsigned long addr, unsigned long end, -				long *zap_work, struct zap_details *details) +				struct zap_details *details)  {  	pmd_t *pmd;  	unsigned long next; @@ -997,13 +1228,35 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,  	pmd = pmd_offset(pud, addr);  	do {  		next = pmd_addr_end(addr, end); -		if (pmd_none_or_clear_bad(pmd)) { -			(*zap_work)--; -			continue; +		if (pmd_trans_huge(*pmd)) { +			if (next - addr != HPAGE_PMD_SIZE) { +#ifdef CONFIG_DEBUG_VM +				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { +					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", +						__func__, addr, end, +						vma->vm_start, +						vma->vm_end); +					BUG(); +				} +#endif +				split_huge_page_pmd(vma, addr, pmd); +			} else if (zap_huge_pmd(tlb, vma, pmd, addr)) +				goto next; +			/* fall through */  		} -		next = zap_pte_range(tlb, vma, pmd, addr, next, -						zap_work, details); -	} while (pmd++, addr = next, (addr != end && *zap_work > 0)); +		/* +		 * Here there can be other concurrent MADV_DONTNEED or +		 * trans huge page faults running, and if the pmd is +		 * none or trans huge it can change under us. This is +		 * because MADV_DONTNEED holds the mmap_sem in read +		 * mode. 
+		 */ +		if (pmd_none_or_trans_huge_or_clear_bad(pmd)) +			goto next; +		next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: +		cond_resched(); +	} while (pmd++, addr = next, addr != end);  	return addr;  } @@ -1011,7 +1264,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,  static inline unsigned long zap_pud_range(struct mmu_gather *tlb,  				struct vm_area_struct *vma, pgd_t *pgd,  				unsigned long addr, unsigned long end, -				long *zap_work, struct zap_details *details) +				struct zap_details *details)  {  	pud_t *pud;  	unsigned long next; @@ -1019,21 +1272,18 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,  	pud = pud_offset(pgd, addr);  	do {  		next = pud_addr_end(addr, end); -		if (pud_none_or_clear_bad(pud)) { -			(*zap_work)--; +		if (pud_none_or_clear_bad(pud))  			continue; -		} -		next = zap_pmd_range(tlb, vma, pud, addr, next, -						zap_work, details); -	} while (pud++, addr = next, (addr != end && *zap_work > 0)); +		next = zap_pmd_range(tlb, vma, pud, addr, next, details); +	} while (pud++, addr = next, addr != end);  	return addr;  } -static unsigned long unmap_page_range(struct mmu_gather *tlb, -				struct vm_area_struct *vma, -				unsigned long addr, unsigned long end, -				long *zap_work, struct zap_details *details) +static void unmap_page_range(struct mmu_gather *tlb, +			     struct vm_area_struct *vma, +			     unsigned long addr, unsigned long end, +			     struct zap_details *details)  {  	pgd_t *pgd;  	unsigned long next; @@ -1047,43 +1297,67 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,  	pgd = pgd_offset(vma->vm_mm, addr);  	do {  		next = pgd_addr_end(addr, end); -		if (pgd_none_or_clear_bad(pgd)) { -			(*zap_work)--; +		if (pgd_none_or_clear_bad(pgd))  			continue; -		} -		next = zap_pud_range(tlb, vma, pgd, addr, next, -						zap_work, details); -	} while (pgd++, addr = next, (addr != end && *zap_work > 0)); +		next = zap_pud_range(tlb, vma, pgd, addr, next, details); +	} while (pgd++, addr = next, addr != end);  	tlb_end_vma(tlb, vma);  	mem_cgroup_uncharge_end(); - -	return addr;  } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE) -#endif + +static void unmap_single_vma(struct mmu_gather *tlb, +		struct vm_area_struct *vma, unsigned long start_addr, +		unsigned long end_addr, +		struct zap_details *details) +{ +	unsigned long start = max(vma->vm_start, start_addr); +	unsigned long end; + +	if (start >= vma->vm_end) +		return; +	end = min(vma->vm_end, end_addr); +	if (end <= vma->vm_start) +		return; + +	if (vma->vm_file) +		uprobe_munmap(vma, start, end); + +	if (unlikely(vma->vm_flags & VM_PFNMAP)) +		untrack_pfn(vma, 0, 0); + +	if (start != end) { +		if (unlikely(is_vm_hugetlb_page(vma))) { +			/* +			 * It is undesirable to test vma->vm_file as it +			 * should be non-null for valid hugetlb area. +			 * However, vm_file will be NULL in the error +			 * cleanup path of mmap_region. When +			 * hugetlbfs ->mmap method fails, +			 * mmap_region() nullifies vma->vm_file +			 * before calling this function to clean up. +			 * Since no pte has actually been setup, it is +			 * safe to do nothing in this case. 
+			 */ +			if (vma->vm_file) { +				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); +				__unmap_hugepage_range_final(tlb, vma, start, end, NULL); +				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); +			} +		} else +			unmap_page_range(tlb, vma, start, end, details); +	} +}  /**   * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather + * @tlb: address of the caller's struct mmu_gather   * @vma: the starting vma   * @start_addr: virtual address at which to start unmapping   * @end_addr: virtual address at which to end unmapping - * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here - * @details: details of nonlinear truncation or shared cache invalidation - * - * Returns the end address of the unmapping (restart addr if interrupted).   *   * Unmap all pages in the vma list.   * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to - * return the ending mmu_gather to the caller. - *   * Only addresses between `start' and `end' will be unmapped.   *   * The VMA list must be sorted in ascending virtual address order. @@ -1093,113 +1367,67 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()   * drops the lock and schedules.   */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, +void unmap_vmas(struct mmu_gather *tlb,  		struct vm_area_struct *vma, unsigned long start_addr, -		unsigned long end_addr, unsigned long *nr_accounted, -		struct zap_details *details) +		unsigned long end_addr)  { -	long zap_work = ZAP_BLOCK_SIZE; -	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */ -	int tlb_start_valid = 0; -	unsigned long start = start_addr; -	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; -	int fullmm = (*tlbp)->fullmm;  	struct mm_struct *mm = vma->vm_mm;  	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); -	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { -		unsigned long end; - -		start = max(vma->vm_start, start_addr); -		if (start >= vma->vm_end) -			continue; -		end = min(vma->vm_end, end_addr); -		if (end <= vma->vm_start) -			continue; - -		if (vma->vm_flags & VM_ACCOUNT) -			*nr_accounted += (end - start) >> PAGE_SHIFT; - -		if (unlikely(is_pfn_mapping(vma))) -			untrack_pfn_vma(vma, 0, 0); - -		while (start != end) { -			if (!tlb_start_valid) { -				tlb_start = start; -				tlb_start_valid = 1; -			} - -			if (unlikely(is_vm_hugetlb_page(vma))) { -				/* -				 * It is undesirable to test vma->vm_file as it -				 * should be non-null for valid hugetlb area. -				 * However, vm_file will be NULL in the error -				 * cleanup path of do_mmap_pgoff. When -				 * hugetlbfs ->mmap method fails, -				 * do_mmap_pgoff() nullifies vma->vm_file -				 * before calling this function to clean up. -				 * Since no pte has actually been setup, it is -				 * safe to do nothing in this case. 
-				 */ -				if (vma->vm_file) { -					unmap_hugepage_range(vma, start, end, NULL); -					zap_work -= (end - start) / -					pages_per_huge_page(hstate_vma(vma)); -				} - -				start = end; -			} else -				start = unmap_page_range(*tlbp, vma, -						start, end, &zap_work, details); - -			if (zap_work > 0) { -				BUG_ON(start != end); -				break; -			} - -			tlb_finish_mmu(*tlbp, tlb_start, start); - -			if (need_resched() || -				(i_mmap_lock && spin_needbreak(i_mmap_lock))) { -				if (i_mmap_lock) { -					*tlbp = NULL; -					goto out; -				} -				cond_resched(); -			} - -			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); -			tlb_start_valid = 0; -			zap_work = ZAP_BLOCK_SIZE; -		} -	} -out: +	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) +		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);  	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); -	return start;	/* which is now the end (or restart) address */  }  /**   * zap_page_range - remove user pages in a given range   * @vma: vm_area_struct holding the applicable pages + * @start: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of nonlinear truncation or shared cache invalidation + * + * Caller must protect the VMA list + */ +void zap_page_range(struct vm_area_struct *vma, unsigned long start, +		unsigned long size, struct zap_details *details) +{ +	struct mm_struct *mm = vma->vm_mm; +	struct mmu_gather tlb; +	unsigned long end = start + size; + +	lru_add_drain(); +	tlb_gather_mmu(&tlb, mm, start, end); +	update_hiwater_rss(mm); +	mmu_notifier_invalidate_range_start(mm, start, end); +	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) +		unmap_single_vma(&tlb, vma, start, end, details); +	mmu_notifier_invalidate_range_end(mm, start, end); +	tlb_finish_mmu(&tlb, start, end); +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages   * @address: starting address of pages to zap   * @size: number of bytes to zap   * @details: details of nonlinear truncation or shared cache invalidation + * + * The range must fit into one VMA.   
*/ -unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, +static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,  		unsigned long size, struct zap_details *details)  {  	struct mm_struct *mm = vma->vm_mm; -	struct mmu_gather *tlb; +	struct mmu_gather tlb;  	unsigned long end = address + size; -	unsigned long nr_accounted = 0;  	lru_add_drain(); -	tlb = tlb_gather_mmu(mm, 0); +	tlb_gather_mmu(&tlb, mm, address, end);  	update_hiwater_rss(mm); -	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); -	if (tlb) -		tlb_finish_mmu(tlb, address, end); -	return end; +	mmu_notifier_invalidate_range_start(mm, address, end); +	unmap_single_vma(&tlb, vma, address, end, details); +	mmu_notifier_invalidate_range_end(mm, address, end); +	tlb_finish_mmu(&tlb, address, end);  }  /** @@ -1220,377 +1448,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,  	if (address < vma->vm_start || address + size > vma->vm_end ||  	    		!(vma->vm_flags & VM_PFNMAP))  		return -1; -	zap_page_range(vma, address, size, NULL); +	zap_page_range_single(vma, address, size, NULL);  	return 0;  }  EXPORT_SYMBOL_GPL(zap_vma_ptes); -/** - * follow_page - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, -			unsigned int flags) -{ -	pgd_t *pgd; -	pud_t *pud; -	pmd_t *pmd; -	pte_t *ptep, pte; -	spinlock_t *ptl; -	struct page *page; -	struct mm_struct *mm = vma->vm_mm; - -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE); -	if (!IS_ERR(page)) { -		BUG_ON(flags & FOLL_GET); -		goto out; -	} - -	page = NULL; -	pgd = pgd_offset(mm, address); -	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) -		goto no_page_table; - -	pud = pud_offset(pgd, address); -	if (pud_none(*pud)) -		goto no_page_table; -	if (pud_huge(*pud)) { -		BUG_ON(flags & FOLL_GET); -		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); -		goto out; -	} -	if (unlikely(pud_bad(*pud))) -		goto no_page_table; - -	pmd = pmd_offset(pud, address); -	if (pmd_none(*pmd)) -		goto no_page_table; -	if (pmd_huge(*pmd)) { -		BUG_ON(flags & FOLL_GET); -		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); -		goto out; -	} -	if (unlikely(pmd_bad(*pmd))) -		goto no_page_table; - -	ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - -	pte = *ptep; -	if (!pte_present(pte)) -		goto no_page; -	if ((flags & FOLL_WRITE) && !pte_write(pte)) -		goto unlock; - -	page = vm_normal_page(vma, address, pte); -	if (unlikely(!page)) { -		if ((flags & FOLL_DUMP) || -		    !is_zero_pfn(pte_pfn(pte))) -			goto bad_page; -		page = pte_page(pte); -	} - -	if (flags & FOLL_GET) -		get_page(page); -	if (flags & FOLL_TOUCH) { -		if ((flags & FOLL_WRITE) && -		    !pte_dirty(pte) && !PageDirty(page)) -			set_page_dirty(page); -		/* -		 * pte_mkyoung() would be more correct here, but atomic care -		 * is needed to avoid losing the dirty bit: it is easier to use -		 * mark_page_accessed(). 
-		 */ -		mark_page_accessed(page); -	} -unlock: -	pte_unmap_unlock(ptep, ptl); -out: -	return page; - -bad_page: -	pte_unmap_unlock(ptep, ptl); -	return ERR_PTR(-EFAULT); - -no_page: -	pte_unmap_unlock(ptep, ptl); -	if (!pte_none(pte)) -		return page; - -no_page_table: -	/* -	 * When core dumping an enormous anonymous area that nobody -	 * has touched so far, we don't want to allocate unnecessary pages or -	 * page tables.  Return error instead of NULL to skip handle_mm_fault, -	 * then get_dump_page() will return NULL to leave a hole in the dump. -	 * But we can only make this optimization where a hole would surely -	 * be zero-filled if handle_mm_fault() actually did handle it. -	 */ -	if ((flags & FOLL_DUMP) && -	    (!vma->vm_ops || !vma->vm_ops->fault)) -		return ERR_PTR(-EFAULT); -	return page; -} - -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, -		     unsigned long start, int nr_pages, unsigned int gup_flags, -		     struct page **pages, struct vm_area_struct **vmas) -{ -	int i; -	unsigned long vm_flags; - -	if (nr_pages <= 0) -		return 0; - -	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - -	/*  -	 * Require read or write permissions. -	 * If FOLL_FORCE is set, we only require the "MAY" flags. -	 */ -	vm_flags  = (gup_flags & FOLL_WRITE) ? -			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); -	vm_flags &= (gup_flags & FOLL_FORCE) ? -			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); -	i = 0; - -	do { -		struct vm_area_struct *vma; - -		vma = find_extend_vma(mm, start); -		if (!vma && in_gate_area(tsk, start)) { -			unsigned long pg = start & PAGE_MASK; -			struct vm_area_struct *gate_vma = get_gate_vma(tsk); -			pgd_t *pgd; -			pud_t *pud; -			pmd_t *pmd; -			pte_t *pte; - -			/* user gate pages are read-only */ -			if (gup_flags & FOLL_WRITE) -				return i ? : -EFAULT; -			if (pg > TASK_SIZE) -				pgd = pgd_offset_k(pg); -			else -				pgd = pgd_offset_gate(mm, pg); -			BUG_ON(pgd_none(*pgd)); -			pud = pud_offset(pgd, pg); -			BUG_ON(pud_none(*pud)); -			pmd = pmd_offset(pud, pg); -			if (pmd_none(*pmd)) -				return i ? : -EFAULT; -			pte = pte_offset_map(pmd, pg); -			if (pte_none(*pte)) { -				pte_unmap(pte); -				return i ? : -EFAULT; -			} -			if (pages) { -				struct page *page; - -				page = vm_normal_page(gate_vma, start, *pte); -				if (!page) { -					if (!(gup_flags & FOLL_DUMP) && -					     is_zero_pfn(pte_pfn(*pte))) -						page = pte_page(*pte); -					else { -						pte_unmap(pte); -						return i ? : -EFAULT; -					} -				} -				pages[i] = page; -				get_page(page); -			} -			pte_unmap(pte); -			if (vmas) -				vmas[i] = gate_vma; -			i++; -			start += PAGE_SIZE; -			nr_pages--; -			continue; -		} - -		if (!vma || -		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) || -		    !(vm_flags & vma->vm_flags)) -			return i ? : -EFAULT; - -		if (is_vm_hugetlb_page(vma)) { -			i = follow_hugetlb_page(mm, vma, pages, vmas, -					&start, &nr_pages, i, gup_flags); -			continue; -		} - -		do { -			struct page *page; -			unsigned int foll_flags = gup_flags; - -			/* -			 * If we have a pending SIGKILL, don't keep faulting -			 * pages and potentially allocating memory. -			 */ -			if (unlikely(fatal_signal_pending(current))) -				return i ? i : -ERESTARTSYS; - -			cond_resched(); -			while (!(page = follow_page(vma, start, foll_flags))) { -				int ret; - -				ret = handle_mm_fault(mm, vma, start, -					(foll_flags & FOLL_WRITE) ? -					FAULT_FLAG_WRITE : 0); - -				if (ret & VM_FAULT_ERROR) { -					if (ret & VM_FAULT_OOM) -						return i ? 
i : -ENOMEM; -					if (ret & -					    (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| -					     VM_FAULT_SIGBUS)) -						return i ? i : -EFAULT; -					BUG(); -				} -				if (ret & VM_FAULT_MAJOR) -					tsk->maj_flt++; -				else -					tsk->min_flt++; - -				/* -				 * The VM_FAULT_WRITE bit tells us that -				 * do_wp_page has broken COW when necessary, -				 * even if maybe_mkwrite decided not to set -				 * pte_write. We can thus safely do subsequent -				 * page lookups as if they were reads. But only -				 * do so when looping for pte_write is futile: -				 * in some cases userspace may also be wanting -				 * to write to the gotten user page, which a -				 * read fault here might prevent (a readonly -				 * page might get reCOWed by userspace write). -				 */ -				if ((ret & VM_FAULT_WRITE) && -				    !(vma->vm_flags & VM_WRITE)) -					foll_flags &= ~FOLL_WRITE; - -				cond_resched(); -			} -			if (IS_ERR(page)) -				return i ? i : PTR_ERR(page); -			if (pages) { -				pages[i] = page; - -				flush_anon_page(vma, page, start); -				flush_dcache_page(page); -			} -			if (vmas) -				vmas[i] = vma; -			i++; -			start += PAGE_SIZE; -			nr_pages--; -		} while (nr_pages && start < vma->vm_end); -	} while (nr_pages); -	return i; -} - -/** - * get_user_pages() - pin user pages in memory - * @tsk:	task_struct of target task - * @mm:		mm_struct of target mm - * @start:	starting user address - * @nr_pages:	number of pages from start to pin - * @write:	whether pages will be written to by the caller - * @force:	whether to force write access even if user mapping is - *		readonly. This will result in the page being COWed even - *		in MAP_SHARED mappings. You do not want this. - * @pages:	array that receives pointers to the pages pinned. - *		Should be at least nr_pages long. Or NULL, if caller - *		only intends to ensure the pages are faulted in. - * @vmas:	array of pointers to vmas corresponding to each page. - *		Or NULL if the caller does not require them. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. 
- * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, -		unsigned long start, int nr_pages, int write, int force, -		struct page **pages, struct vm_area_struct **vmas) -{ -	int flags = FOLL_TOUCH; - -	if (pages) -		flags |= FOLL_GET; -	if (write) -		flags |= FOLL_WRITE; -	if (force) -		flags |= FOLL_FORCE; - -	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); -} -EXPORT_SYMBOL(get_user_pages); - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ -	struct vm_area_struct *vma; -	struct page *page; - -	if (__get_user_pages(current, current->mm, addr, 1, -			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) -		return NULL; -	flush_cache_page(vma, addr, page_to_pfn(page)); -	return page; -} -#endif /* CONFIG_ELF_CORE */ -  pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,  			spinlock_t **ptl)  { @@ -1598,8 +1460,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,  	pud_t * pud = pud_alloc(mm, pgd, addr);  	if (pud) {  		pmd_t * pmd = pmd_alloc(mm, pud, addr); -		if (pmd) +		if (pmd) { +			VM_BUG_ON(pmd_trans_huge(*pmd));  			return pte_alloc_map_lock(mm, pmd, addr, ptl); +		}  	}  	return NULL;  } @@ -1667,6 +1531,11 @@ out:   * ask for a shared writable mapping!   *   * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler.   */  int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,  			struct page *page) @@ -1675,7 +1544,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,  		return -EFAULT;  	if (!page_count(page))  		return -EINVAL; -	vma->vm_flags |= VM_INSERTPAGE; +	if (!(vma->vm_flags & VM_MIXEDMAP)) { +		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); +		BUG_ON(vma->vm_flags & VM_PFNMAP); +		vma->vm_flags |= VM_MIXEDMAP; +	}  	return insert_page(vma, addr, page, vma->vm_page_prot);  }  EXPORT_SYMBOL(vm_insert_page); @@ -1714,7 +1587,7 @@ out:   * @addr: target user address of this page   * @pfn: source kernel pfn   * - * Similar to vm_inert_page, this allows drivers to insert individual pages + * Similar to vm_insert_page, this allows drivers to insert individual pages   * they've allocated into a user vma. Same comments apply.   
*   * This function should only be called from a vm_ops->fault handler, and @@ -1744,14 +1617,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,  	if (addr < vma->vm_start || addr >= vma->vm_end)  		return -EFAULT; -	if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) +	if (track_pfn_insert(vma, &pgprot, pfn))  		return -EINVAL;  	ret = insert_pfn(vma, addr, pfn, pgprot); -	if (ret) -		untrack_pfn_vma(vma, pfn, PAGE_SIZE); -  	return ret;  }  EXPORT_SYMBOL(vm_insert_pfn); @@ -1818,6 +1688,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,  	pmd = pmd_alloc(mm, pud, addr);  	if (!pmd)  		return -ENOMEM; +	VM_BUG_ON(pmd_trans_huge(*pmd));  	do {  		next = pmd_addr_end(addr, end);  		if (remap_pte_range(mm, pmd, addr, next, @@ -1871,37 +1742,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,  	 * rest of the world about it:  	 *   VM_IO tells people not to look at these pages  	 *	(accesses can have side effects). -	 *   VM_RESERVED is specified all over the place, because -	 *	in 2.4 it kept swapout's vma scan off this vma; but -	 *	in 2.6 the LRU scan won't even find its pages, so this -	 *	flag means no more than count its pages in reserved_vm, -	 * 	and omit it from core dump, even when VM_IO turned off.  	 *   VM_PFNMAP tells the core MM that the base pages are just  	 *	raw PFN mappings, and do not have a "struct page" associated  	 *	with them. +	 *   VM_DONTEXPAND +	 *      Disable vma merging and expanding with mremap(). +	 *   VM_DONTDUMP +	 *      Omit vma from core dump, even when VM_IO turned off.  	 *  	 * There's a horrible special case to handle copy-on-write  	 * behaviour that some programs depend on. We mark the "original"  	 * un-COW'ed pages by matching them up with "vma->vm_pgoff". +	 * See vm_normal_page() for details.  	 */ -	if (addr == vma->vm_start && end == vma->vm_end) { +	if (is_cow_mapping(vma->vm_flags)) { +		if (addr != vma->vm_start || end != vma->vm_end) +			return -EINVAL;  		vma->vm_pgoff = pfn; -		vma->vm_flags |= VM_PFN_AT_MMAP; -	} else if (is_cow_mapping(vma->vm_flags)) -		return -EINVAL; - -	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; +	} -	err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); -	if (err) { -		/* -		 * To indicate that track_pfn related cleanup is not -		 * needed from higher level routine calling unmap_vmas -		 */ -		vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); -		vma->vm_flags &= ~VM_PFN_AT_MMAP; +	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); +	if (err)  		return -EINVAL; -	} + +	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;  	BUG_ON(addr >= end);  	pfn -= addr >> PAGE_SHIFT; @@ -1916,12 +1780,59 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,  	} while (pgd++, addr = next, addr != end);  	if (err) -		untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); +		untrack_pfn(vma, pfn, PAGE_ALIGN(size));  	return err;  }  EXPORT_SYMBOL(remap_pfn_range); +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. 
+ */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ +	unsigned long vm_len, pfn, pages; + +	/* Check that the physical memory area passed in looks valid */ +	if (start + len < start) +		return -EINVAL; +	/* +	 * You *really* shouldn't map things that aren't page-aligned, +	 * but we've historically allowed it because IO memory might +	 * just have smaller alignment. +	 */ +	len += start & ~PAGE_MASK; +	pfn = start >> PAGE_SHIFT; +	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; +	if (pfn + pages < pfn) +		return -EINVAL; + +	/* We start the mapping 'vm_pgoff' pages into the area */ +	if (vma->vm_pgoff > pages) +		return -EINVAL; +	pfn += vma->vm_pgoff; +	pages -= vma->vm_pgoff; + +	/* Can we fit all of the mapping? */ +	vm_len = vma->vm_end - vma->vm_start; +	if (vm_len >> PAGE_SHIFT > pages) +		return -EINVAL; + +	/* Ok, let it rip */ +	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); +  static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,  				     unsigned long addr, unsigned long end,  				     pte_fn_t fn, void *data) @@ -2027,10 +1938,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);   * handle_pte_fault chooses page fault handler according to an entry   * which was read non-atomically.  Before making any commitment, on   * those architectures or configurations (e.g. i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_file_page + * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault   * must check under lock before unmapping the pte and proceeding   * (but do_wp_page is only called after already making such a check; - * and do_anonymous_page and do_no_page can safely check later on). + * and do_anonymous_page can safely check later on).   */  static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,  				pte_t *page_table, pte_t orig_pte) @@ -2048,21 +1959,10 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,  	return same;  } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when - * servicing faults for write access.  In the normal case, do always want - * pte_mkwrite.  But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ -	if (likely(vma->vm_flags & VM_WRITE)) -		pte = pte_mkwrite(pte); -	return pte; -} -  static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)  { +	debug_dma_assert_idle(src); +  	/*  	 * If the source page was a PFN mapping, we don't have  	 * a "struct page" for it. We do a best-effort copy by @@ -2070,7 +1970,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo  	 * fails, we just zero-fill it. Live with it.  	 
*/  	if (unlikely(!src)) { -		void *kaddr = kmap_atomic(dst, KM_USER0); +		void *kaddr = kmap_atomic(dst);  		void __user *uaddr = (void __user *)(va & PAGE_MASK);  		/* @@ -2081,13 +1981,45 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo  		 */  		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))  			clear_page(kaddr); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		flush_dcache_page(dst);  	} else  		copy_user_highpage(dst, src, va, vma);  }  /* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, +	       unsigned long address) +{ +	struct vm_fault vmf; +	int ret; + +	vmf.virtual_address = (void __user *)(address & PAGE_MASK); +	vmf.pgoff = page->index; +	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; +	vmf.page = page; + +	ret = vma->vm_ops->page_mkwrite(vma, &vmf); +	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) +		return ret; +	if (unlikely(!(ret & VM_FAULT_LOCKED))) { +		lock_page(page); +		if (!page->mapping) { +			unlock_page(page); +			return 0; /* retry */ +		} +		ret |= VM_FAULT_LOCKED; +	} else +		VM_BUG_ON_PAGE(!PageLocked(page), page); +	return ret; +} + +/*   * This routine handles present pages, when users try to write   * to a shared page. It is done by copying the page to a new address   * and decrementing the shared-page counter for the old page. @@ -2110,11 +2042,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,  		spinlock_t *ptl, pte_t orig_pte)  	__releases(ptl)  { -	struct page *old_page, *new_page; +	struct page *old_page, *new_page = NULL;  	pte_t entry; -	int reuse = 0, ret = 0; +	int ret = 0;  	int page_mkwrite = 0;  	struct page *dirty_page = NULL; +	unsigned long mmun_start = 0;	/* For mmu_notifiers */ +	unsigned long mmun_end = 0;	/* For mmu_notifiers */  	old_page = vm_normal_page(vma, address, orig_pte);  	if (!old_page) { @@ -2144,19 +2078,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,  							 &ptl);  			if (!pte_same(*page_table, orig_pte)) {  				unlock_page(old_page); -				page_cache_release(old_page);  				goto unlock;  			}  			page_cache_release(old_page);  		} -		reuse = reuse_swap_page(old_page); -		if (reuse) +		if (reuse_swap_page(old_page)) {  			/*  			 * The page is all ours.  Move it to our anon_vma so  			 * the rmap code will not search our parent or siblings.  			 * Protected against the rmap code by the page lock.  			 */  			page_move_anon_rmap(old_page, vma, address); +			unlock_page(old_page); +			goto reuse; +		}  		unlock_page(old_page);  	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==  					(VM_WRITE|VM_SHARED))) { @@ -2166,42 +2101,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,  		 * get_user_pages(.write=1, .force=1).  		 */  		if (vma->vm_ops && vma->vm_ops->page_mkwrite) { -			struct vm_fault vmf;  			int tmp; - -			vmf.virtual_address = (void __user *)(address & -								PAGE_MASK); -			vmf.pgoff = old_page->index; -			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; -			vmf.page = old_page; - -			/* -			 * Notify the address space that the page is about to -			 * become writable so that it can prohibit this or wait -			 * for the page to get into an appropriate state. 
-			 * -			 * We do this without the lock held, so that it can -			 * sleep if it needs to. -			 */  			page_cache_get(old_page);  			pte_unmap_unlock(page_table, ptl); - -			tmp = vma->vm_ops->page_mkwrite(vma, &vmf); -			if (unlikely(tmp & -					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { -				ret = tmp; -				goto unwritable_page; +			tmp = do_page_mkwrite(vma, old_page, address); +			if (unlikely(!tmp || (tmp & +					(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { +				page_cache_release(old_page); +				return tmp;  			} -			if (unlikely(!(tmp & VM_FAULT_LOCKED))) { -				lock_page(old_page); -				if (!old_page->mapping) { -					ret = 0; /* retry the fault */ -					unlock_page(old_page); -					goto unwritable_page; -				} -			} else -				VM_BUG_ON(!PageLocked(old_page)); -  			/*  			 * Since we dropped the lock we need to revalidate  			 * the PTE as someone else may have changed it.  If @@ -2212,7 +2120,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,  							 &ptl);  			if (!pte_same(*page_table, orig_pte)) {  				unlock_page(old_page); -				page_cache_release(old_page);  				goto unlock;  			} @@ -2220,18 +2127,59 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,  		}  		dirty_page = old_page;  		get_page(dirty_page); -		reuse = 1; -	} -	if (reuse) {  reuse: +		/* +		 * Clear the pages cpupid information as the existing +		 * information potentially belongs to a now completely +		 * unrelated process. +		 */ +		if (old_page) +			page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); +  		flush_cache_page(vma, address, pte_pfn(orig_pte));  		entry = pte_mkyoung(orig_pte);  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);  		if (ptep_set_access_flags(vma, address, page_table, entry,1))  			update_mmu_cache(vma, address, page_table); +		pte_unmap_unlock(page_table, ptl);  		ret |= VM_FAULT_WRITE; -		goto unlock; + +		if (!dirty_page) +			return ret; + +		/* +		 * Yes, Virginia, this is actually required to prevent a race +		 * with clear_page_dirty_for_io() from clearing the page dirty +		 * bit after it clear all dirty ptes, but before a racing +		 * do_wp_page installs a dirty pte. +		 * +		 * do_shared_fault is protected similarly. +		 */ +		if (!page_mkwrite) { +			wait_on_page_locked(dirty_page); +			set_page_dirty_balance(dirty_page); +			/* file_update_time outside page_lock */ +			if (vma->vm_file) +				file_update_time(vma->vm_file); +		} +		put_page(dirty_page); +		if (page_mkwrite) { +			struct address_space *mapping = dirty_page->mapping; + +			set_page_dirty(dirty_page); +			unlock_page(dirty_page); +			page_cache_release(dirty_page); +			if (mapping)	{ +				/* +				 * Some device drivers do not set page.mapping +				 * but still dirty their pages +				 */ +				balance_dirty_pages_ratelimited(mapping); +			} +		} + +		return ret;  	}  	/* @@ -2256,19 +2204,13 @@ gotten:  	}  	__SetPageUptodate(new_page); -	/* -	 * Don't let another task, with possibly unlocked vma, -	 * keep the mlocked page. 
-	 */ -	if ((vma->vm_flags & VM_LOCKED) && old_page) { -		lock_page(old_page);	/* for LRU manipulation */ -		clear_page_mlock(old_page); -		unlock_page(old_page); -	} - -	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) +	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))  		goto oom_free_new; +	mmun_start  = address & PAGE_MASK; +	mmun_end    = mmun_start + PAGE_SIZE; +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); +  	/*  	 * Re-check the pte - we dropped the lock  	 */ @@ -2333,169 +2275,49 @@ gotten:  	if (new_page)  		page_cache_release(new_page); -	if (old_page) -		page_cache_release(old_page);  unlock:  	pte_unmap_unlock(page_table, ptl); -	if (dirty_page) { +	if (mmun_end > mmun_start) +		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +	if (old_page) {  		/* -		 * Yes, Virginia, this is actually required to prevent a race -		 * with clear_page_dirty_for_io() from clearing the page dirty -		 * bit after it clear all dirty ptes, but before a racing -		 * do_wp_page installs a dirty pte. -		 * -		 * do_no_page is protected similarly. +		 * Don't let another task, with possibly unlocked vma, +		 * keep the mlocked page.  		 */ -		if (!page_mkwrite) { -			wait_on_page_locked(dirty_page); -			set_page_dirty_balance(dirty_page, page_mkwrite); -		} -		put_page(dirty_page); -		if (page_mkwrite) { -			struct address_space *mapping = dirty_page->mapping; - -			set_page_dirty(dirty_page); -			unlock_page(dirty_page); -			page_cache_release(dirty_page); -			if (mapping)	{ -				/* -				 * Some device drivers do not set page.mapping -				 * but still dirty their pages -				 */ -				balance_dirty_pages_ratelimited(mapping); -			} +		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { +			lock_page(old_page);	/* LRU manipulation */ +			munlock_vma_page(old_page); +			unlock_page(old_page);  		} - -		/* file_update_time outside page_lock */ -		if (vma->vm_file) -			file_update_time(vma->vm_file); +		page_cache_release(old_page);  	}  	return ret;  oom_free_new:  	page_cache_release(new_page);  oom: -	if (old_page) { -		if (page_mkwrite) { -			unlock_page(old_page); -			page_cache_release(old_page); -		} +	if (old_page)  		page_cache_release(old_page); -	}  	return VM_FAULT_OOM; - -unwritable_page: -	page_cache_release(old_page); -	return ret; -} - -/* - * Helper functions for unmap_mapping_range(). - * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ - * - * We have to restart searching the prio_tree whenever we drop the lock, - * since the iterator is only valid while the lock is held, and anyway - * a later vma might be split and reinserted earlier while lock dropped. - * - * The list of nonlinear vmas could be handled more efficiently, using - * a placeholder, but handle it in the same way until a need is shown. - * It is important to search the prio_tree before nonlinear list: a vma - * may become nonlinear and be shifted from prio_tree to nonlinear list - * while the lock is dropped; but never shifted from list to prio_tree. - * - * In order to make forward progress despite restarting the search, - * vm_truncate_count is used to mark a vma as now dealt with, so we can - * quickly skip it next time around.  Since the prio_tree search only - * shows us those vmas affected by unmapping the range in question, we - * can't efficiently keep all vmas in step with mapping->truncate_count: - * so instead reset them all whenever it wraps back to 0 (then go to 1). 
- * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. - * - * In order to make forward progress despite repeatedly restarting some - * large vma, note the restart_addr from unmap_vmas when it breaks out: - * and restart from that address when we reach that vma again.  It might - * have been split or merged, shrunk or extended, but never shifted: so - * restart_addr remains valid so long as it remains in the vma's range. - * unmap_mapping_range forces truncate_count to leap over page-aligned - * values so we can save vma's restart_addr in its truncate_count field. - */ -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) - -static void reset_vma_truncate_counts(struct address_space *mapping) -{ -	struct vm_area_struct *vma; -	struct prio_tree_iter iter; - -	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) -		vma->vm_truncate_count = 0; -	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) -		vma->vm_truncate_count = 0;  } -static int unmap_mapping_range_vma(struct vm_area_struct *vma, +static void unmap_mapping_range_vma(struct vm_area_struct *vma,  		unsigned long start_addr, unsigned long end_addr,  		struct zap_details *details)  { -	unsigned long restart_addr; -	int need_break; - -	/* -	 * files that support invalidating or truncating portions of the -	 * file from under mmaped areas must have their ->fault function -	 * return a locked page (and set VM_FAULT_LOCKED in the return). -	 * This provides synchronisation against concurrent unmapping here. -	 */ - -again: -	restart_addr = vma->vm_truncate_count; -	if (is_restart_addr(restart_addr) && start_addr < restart_addr) { -		start_addr = restart_addr; -		if (start_addr >= end_addr) { -			/* Top of vma has been split off since last time */ -			vma->vm_truncate_count = details->truncate_count; -			return 0; -		} -	} - -	restart_addr = zap_page_range(vma, start_addr, -					end_addr - start_addr, details); -	need_break = need_resched() || spin_needbreak(details->i_mmap_lock); - -	if (restart_addr >= end_addr) { -		/* We have now completed this vma: mark it so */ -		vma->vm_truncate_count = details->truncate_count; -		if (!need_break) -			return 0; -	} else { -		/* Note restart_addr in vma's truncate_count field */ -		vma->vm_truncate_count = restart_addr; -		if (!need_break) -			goto again; -	} - -	spin_unlock(details->i_mmap_lock); -	cond_resched(); -	spin_lock(details->i_mmap_lock); -	return -EINTR; +	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);  } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root,  					    struct zap_details *details)  {  	struct vm_area_struct *vma; -	struct prio_tree_iter iter;  	pgoff_t vba, vea, zba, zea; -restart: -	vma_prio_tree_foreach(vma, &iter, root, +	vma_interval_tree_foreach(vma, root,  			details->first_index, details->last_index) { -		/* Skip quickly over those we have already dealt with */ -		if (vma->vm_truncate_count == details->truncate_count) -			continue;  		vba = vma->vm_pgoff; -		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; +		vea = vba + vma_pages(vma) - 1;  		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */  		zba = details->first_index;  		if (zba < vba) @@ -2504,11 +2326,10 @@ restart:  		if (zea > vea)  			zea = vea; -		if (unmap_mapping_range_vma(vma, +		unmap_mapping_range_vma(vma,  			((zba - vba) << PAGE_SHIFT) + vma->vm_start,  			((zea - vba + 1) << 
PAGE_SHIFT) + vma->vm_start, -				details) < 0) -			goto restart; +				details);  	}  } @@ -2523,15 +2344,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,  	 * across *all* the pages in each nonlinear VMA, not just the pages  	 * whose virtual address lies outside the file truncation point.  	 */ -restart: -	list_for_each_entry(vma, head, shared.vm_set.list) { -		/* Skip quickly over those we have already dealt with */ -		if (vma->vm_truncate_count == details->truncate_count) -			continue; +	list_for_each_entry(vma, head, shared.nonlinear) {  		details->nonlinear_vma = vma; -		if (unmap_mapping_range_vma(vma, vma->vm_start, -					vma->vm_end, details) < 0) -			goto restart; +		unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);  	}  } @@ -2570,51 +2385,17 @@ void unmap_mapping_range(struct address_space *mapping,  	details.last_index = hba + hlen - 1;  	if (details.last_index < details.first_index)  		details.last_index = ULONG_MAX; -	details.i_mmap_lock = &mapping->i_mmap_lock; -	spin_lock(&mapping->i_mmap_lock); - -	/* Protect against endless unmapping loops */ -	mapping->truncate_count++; -	if (unlikely(is_restart_addr(mapping->truncate_count))) { -		if (mapping->truncate_count == 0) -			reset_vma_truncate_counts(mapping); -		mapping->truncate_count++; -	} -	details.truncate_count = mapping->truncate_count; -	if (unlikely(!prio_tree_empty(&mapping->i_mmap))) +	mutex_lock(&mapping->i_mmap_mutex); +	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))  		unmap_mapping_range_tree(&mapping->i_mmap, &details);  	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))  		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); -	spin_unlock(&mapping->i_mmap_lock); +	mutex_unlock(&mapping->i_mmap_mutex);  }  EXPORT_SYMBOL(unmap_mapping_range); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) -{ -	struct address_space *mapping = inode->i_mapping; - -	/* -	 * If the underlying filesystem is not going to provide -	 * a way to truncate a range of blocks (punch a hole) - -	 * we should return failure right now. -	 */ -	if (!inode->i_op->truncate_range) -		return -ENOSYS; - -	mutex_lock(&inode->i_mutex); -	down_write(&inode->i_alloc_sem); -	unmap_mapping_range(mapping, offset, (end - offset), 1); -	truncate_inode_pages_range(mapping, offset, end); -	unmap_mapping_range(mapping, offset, (end - offset), 1); -	inode->i_op->truncate_range(inode, offset, end); -	up_write(&inode->i_alloc_sem); -	mutex_unlock(&inode->i_mutex); - -	return 0; -} -  /*   * We enter with non-exclusive mmap_sem (to exclude vma changes,   * but allow concurrent faults), and pte mapped but not yet locked. 
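With the truncate_count/restart machinery removed, unmap_mapping_range() above becomes a single pass over the interval tree under i_mmap_mutex, with each vma handled by zap_page_range_single(). The caller-side contract is unchanged; the sketch below shows how it is typically driven when a file is shrunk, mirroring the unmap/truncate/unmap sequence used by truncate_pagecache(). The helper name example_shrink_file() is hypothetical and not part of this diff.

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Hypothetical caller sketch: drop every user mapping of the truncated
 * range, evict the page cache, then unmap again to catch pages that were
 * faulted back in (or COWed) while the cache was being dropped.
 */
static void example_shrink_file(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	unmap_mapping_range(mapping, holebegin, 0, 1);	/* even_cows == 1 */
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}

Passing even_cows == 1 also zaps private COWed copies of the affected range, which is what truncation wants; pagecache-invalidation paths pass 0 so private data survives.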
@@ -2625,11 +2406,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  		unsigned int flags, pte_t orig_pte)  {  	spinlock_t *ptl; -	struct page *page, *swapcache = NULL; +	struct page *page, *swapcache;  	swp_entry_t entry;  	pte_t pte;  	int locked; -	struct mem_cgroup *ptr = NULL; +	struct mem_cgroup *ptr;  	int exclusive = 0;  	int ret = 0; @@ -2651,7 +2432,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  	delayacct_set_flag(DELAYACCT_PF_SWAPIN);  	page = lookup_swap_cache(entry);  	if (!page) { -		grab_swap_token(mm); /* Contend for token _before_ read-in */  		page = swapin_readahead(entry,  					GFP_HIGHUSER_MOVABLE, vma, address);  		if (!page) { @@ -2669,6 +2449,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  		/* Had to read the page from swap area: Major fault */  		ret = VM_FAULT_MAJOR;  		count_vm_event(PGMAJFAULT); +		mem_cgroup_count_vm_event(mm, PGMAJFAULT);  	} else if (PageHWPoison(page)) {  		/*  		 * hwpoisoned dirty swapcache pages are kept for killing @@ -2676,10 +2457,13 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  		 */  		ret = VM_FAULT_HWPOISON;  		delayacct_clear_flag(DELAYACCT_PF_SWAPIN); +		swapcache = page;  		goto out_release;  	} +	swapcache = page;  	locked = lock_page_or_retry(page, mm, flags); +  	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);  	if (!locked) {  		ret |= VM_FAULT_RETRY; @@ -2695,16 +2479,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))  		goto out_page; -	if (ksm_might_need_to_copy(page, vma, address)) { -		swapcache = page; -		page = ksm_does_need_to_copy(page, vma, address); - -		if (unlikely(!page)) { -			ret = VM_FAULT_OOM; -			page = swapcache; -			swapcache = NULL; -			goto out_page; -		} +	page = ksm_might_need_to_copy(page, vma, address); +	if (unlikely(!page)) { +		ret = VM_FAULT_OOM; +		page = swapcache; +		goto out_page;  	}  	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { @@ -2748,8 +2527,13 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  		exclusive = 1;  	}  	flush_icache_page(vma, page); +	if (pte_swp_soft_dirty(orig_pte)) +		pte = pte_mksoft_dirty(pte);  	set_pte_at(mm, address, page_table, pte); -	do_page_add_anon_rmap(page, vma, address, exclusive); +	if (page == swapcache) +		do_page_add_anon_rmap(page, vma, address, exclusive); +	else /* ksm created a completely new copy */ +		page_add_new_anon_rmap(page, vma, address);  	/* It's better to call commit-charge after rmap is established */  	mem_cgroup_commit_charge_swapin(page, ptr); @@ -2757,7 +2541,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,  	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))  		try_to_free_swap(page);  	unlock_page(page); -	if (swapcache) { +	if (page != swapcache) {  		/*  		 * Hold the lock to avoid the swap entry to be reused  		 * until we take the PT lock for the pte_same() check @@ -2790,7 +2574,7 @@ out_page:  	unlock_page(page);  out_release:  	page_cache_release(page); -	if (swapcache) { +	if (page != swapcache) {  		unlock_page(swapcache);  		page_cache_release(swapcache);  	} @@ -2817,7 +2601,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo  		if (prev && prev->vm_end == address)  			return prev->vm_flags & VM_GROWSDOWN ? 
0 : -ENOMEM; -		expand_stack(vma, address - PAGE_SIZE); +		expand_downwards(vma, address - PAGE_SIZE);  	}  	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {  		struct vm_area_struct *next = vma->vm_next; @@ -2866,9 +2650,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,  	page = alloc_zeroed_user_highpage_movable(vma, address);  	if (!page)  		goto oom; +	/* +	 * The memory barrier inside __SetPageUptodate makes sure that +	 * preceeding stores to the page contents become visible before +	 * the set_pte_at() write. +	 */  	__SetPageUptodate(page); -	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) +	if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))  		goto oom_free_page;  	entry = mk_pte(page, vma->vm_page_prot); @@ -2899,33 +2688,11 @@ oom:  	return VM_FAULT_OOM;  } -/* - * __do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, -		unsigned long address, pmd_t *pmd, -		pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int __do_fault(struct vm_area_struct *vma, unsigned long address, +		pgoff_t pgoff, unsigned int flags, struct page **page)  { -	pte_t *page_table; -	spinlock_t *ptl; -	struct page *page; -	pte_t entry; -	int anon = 0; -	int charged = 0; -	struct page *dirty_page = NULL;  	struct vm_fault vmf;  	int ret; -	int page_mkwrite = 0;  	vmf.virtual_address = (void __user *)(address & PAGE_MASK);  	vmf.pgoff = pgoff; @@ -2933,161 +2700,319 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	vmf.page = NULL;  	ret = vma->vm_ops->fault(vma, &vmf); -	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | -			    VM_FAULT_RETRY))) +	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))  		return ret;  	if (unlikely(PageHWPoison(vmf.page))) {  		if (ret & VM_FAULT_LOCKED)  			unlock_page(vmf.page); +		page_cache_release(vmf.page);  		return VM_FAULT_HWPOISON;  	} -	/* -	 * For consistency in subsequent calls, make the faulted page always -	 * locked. -	 */  	if (unlikely(!(ret & VM_FAULT_LOCKED)))  		lock_page(vmf.page);  	else -		VM_BUG_ON(!PageLocked(vmf.page)); +		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + +	*page = vmf.page; +	return ret; +} + +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. 
+ */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, +		struct page *page, pte_t *pte, bool write, bool anon) +{ +	pte_t entry; + +	flush_icache_page(vma, page); +	entry = mk_pte(page, vma->vm_page_prot); +	if (write) +		entry = maybe_mkwrite(pte_mkdirty(entry), vma); +	else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) +		pte_mksoft_dirty(entry); +	if (anon) { +		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); +		page_add_new_anon_rmap(page, vma, address); +	} else { +		inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); +		page_add_file_rmap(page); +	} +	set_pte_at(vma->vm_mm, address, pte, entry); + +	/* no need to invalidate: a not-present page won't be cached */ +	update_mmu_cache(vma, address, pte); +} + +static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); + +static inline unsigned long fault_around_pages(void) +{ +	return fault_around_bytes >> PAGE_SHIFT; +} + +static inline unsigned long fault_around_mask(void) +{ +	return ~(fault_around_bytes - 1) & PAGE_MASK; +} + +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) +{ +	*val = fault_around_bytes; +	return 0; +} + +/* + * fault_around_pages() and fault_around_mask() expects fault_around_bytes + * rounded down to nearest page order. It's what do_fault_around() expects to + * see. + */ +static int fault_around_bytes_set(void *data, u64 val) +{ +	if (val / PAGE_SIZE > PTRS_PER_PTE) +		return -EINVAL; +	if (val > PAGE_SIZE) +		fault_around_bytes = rounddown_pow_of_two(val); +	else +		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ +	return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, +		fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ +	void *ret; + +	ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, +			&fault_around_bytes_fops); +	if (!ret) +		pr_warn("Failed to create fault_around_bytes in debugfs"); +	return 0; +} +late_initcall(fault_around_debugfs); +#endif + +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. + * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order).  This way it's + * easier to guarantee that we don't cross page table boundaries. + */ +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, +		pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ +	unsigned long start_addr; +	pgoff_t max_pgoff; +	struct vm_fault vmf; +	int off; + +	start_addr = max(address & fault_around_mask(), vma->vm_start); +	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); +	pte -= off; +	pgoff -= off;  	/* -	 * Should we do an early C-O-W break? 
+	 *  max_pgoff is either end of page table or end of vma +	 *  or fault_around_pages() from pgoff, depending what is nearest.  	 */ -	page = vmf.page; -	if (flags & FAULT_FLAG_WRITE) { -		if (!(vma->vm_flags & VM_SHARED)) { -			anon = 1; -			if (unlikely(anon_vma_prepare(vma))) { -				ret = VM_FAULT_OOM; -				goto out; -			} -			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, -						vma, address); -			if (!page) { -				ret = VM_FAULT_OOM; -				goto out; -			} -			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { -				ret = VM_FAULT_OOM; -				page_cache_release(page); -				goto out; -			} -			charged = 1; -			/* -			 * Don't let another task, with possibly unlocked vma, -			 * keep the mlocked page. -			 */ -			if (vma->vm_flags & VM_LOCKED) -				clear_page_mlock(vmf.page); -			copy_user_highpage(page, vmf.page, address, vma); -			__SetPageUptodate(page); -		} else { -			/* -			 * If the page will be shareable, see if the backing -			 * address space wants to know that the page is about -			 * to become writable -			 */ -			if (vma->vm_ops->page_mkwrite) { -				int tmp; - -				unlock_page(page); -				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; -				tmp = vma->vm_ops->page_mkwrite(vma, &vmf); -				if (unlikely(tmp & -					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { -					ret = tmp; -					goto unwritable_page; -				} -				if (unlikely(!(tmp & VM_FAULT_LOCKED))) { -					lock_page(page); -					if (!page->mapping) { -						ret = 0; /* retry the fault */ -						unlock_page(page); -						goto unwritable_page; -					} -				} else -					VM_BUG_ON(!PageLocked(page)); -				page_mkwrite = 1; -			} -		} - +	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + +		PTRS_PER_PTE - 1; +	max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, +			pgoff + fault_around_pages() - 1); + +	/* Check if it makes any sense to call ->map_pages */ +	while (!pte_none(*pte)) { +		if (++pgoff > max_pgoff) +			return; +		start_addr += PAGE_SIZE; +		if (start_addr >= vma->vm_end) +			return; +		pte++;  	} -	page_table = pte_offset_map_lock(mm, pmd, address, &ptl); +	vmf.virtual_address = (void __user *) start_addr; +	vmf.pte = pte; +	vmf.pgoff = pgoff; +	vmf.max_pgoff = max_pgoff; +	vmf.flags = flags; +	vma->vm_ops->map_pages(vma, &vmf); +} + +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, +		unsigned long address, pmd_t *pmd, +		pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ +	struct page *fault_page; +	spinlock_t *ptl; +	pte_t *pte; +	int ret = 0;  	/* -	 * This silly early PAGE_DIRTY setting removes a race -	 * due to the bad i386 page protection. But it's valid -	 * for other architectures too. -	 * -	 * Note that if FAULT_FLAG_WRITE is set, we either now have -	 * an exclusive copy of the page, or this is a shared mapping, -	 * so we can make it writable and dirty to avoid having to -	 * handle that later. +	 * Let's call ->map_pages() first and use ->fault() as fallback +	 * if page by the offset is not ready to be mapped (cold cache or +	 * something).  	 */ -	/* Only go through if we didn't race with anybody else... 
*/ -	if (likely(pte_same(*page_table, orig_pte))) { -		flush_icache_page(vma, page); -		entry = mk_pte(page, vma->vm_page_prot); -		if (flags & FAULT_FLAG_WRITE) -			entry = maybe_mkwrite(pte_mkdirty(entry), vma); -		if (anon) { -			inc_mm_counter_fast(mm, MM_ANONPAGES); -			page_add_new_anon_rmap(page, vma, address); -		} else { -			inc_mm_counter_fast(mm, MM_FILEPAGES); -			page_add_file_rmap(page); -			if (flags & FAULT_FLAG_WRITE) { -				dirty_page = page; -				get_page(dirty_page); -			} -		} -		set_pte_at(mm, address, page_table, entry); +	if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && +	    fault_around_pages() > 1) { +		pte = pte_offset_map_lock(mm, pmd, address, &ptl); +		do_fault_around(vma, address, pte, pgoff, flags); +		if (!pte_same(*pte, orig_pte)) +			goto unlock_out; +		pte_unmap_unlock(pte, ptl); +	} -		/* no need to invalidate: a not-present page won't be cached */ -		update_mmu_cache(vma, address, page_table); -	} else { -		if (charged) -			mem_cgroup_uncharge_page(page); -		if (anon) -			page_cache_release(page); -		else -			anon = 1; /* no anon but release faulted_page */ +	ret = __do_fault(vma, address, pgoff, flags, &fault_page); +	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) +		return ret; + +	pte = pte_offset_map_lock(mm, pmd, address, &ptl); +	if (unlikely(!pte_same(*pte, orig_pte))) { +		pte_unmap_unlock(pte, ptl); +		unlock_page(fault_page); +		page_cache_release(fault_page); +		return ret;  	} +	do_set_pte(vma, address, fault_page, pte, false, false); +	unlock_page(fault_page); +unlock_out: +	pte_unmap_unlock(pte, ptl); +	return ret; +} -	pte_unmap_unlock(page_table, ptl); +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, +		unsigned long address, pmd_t *pmd, +		pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ +	struct page *fault_page, *new_page; +	spinlock_t *ptl; +	pte_t *pte; +	int ret; -out: -	if (dirty_page) { -		struct address_space *mapping = page->mapping; +	if (unlikely(anon_vma_prepare(vma))) +		return VM_FAULT_OOM; -		if (set_page_dirty(dirty_page)) -			page_mkwrite = 1; -		unlock_page(dirty_page); -		put_page(dirty_page); -		if (page_mkwrite && mapping) { -			/* -			 * Some device drivers do not set page.mapping but still -			 * dirty their pages -			 */ -			balance_dirty_pages_ratelimited(mapping); -		} +	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); +	if (!new_page) +		return VM_FAULT_OOM; -		/* file_update_time outside page_lock */ -		if (vma->vm_file) -			file_update_time(vma->vm_file); -	} else { -		unlock_page(vmf.page); -		if (anon) -			page_cache_release(vmf.page); +	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { +		page_cache_release(new_page); +		return VM_FAULT_OOM;  	} +	ret = __do_fault(vma, address, pgoff, flags, &fault_page); +	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) +		goto uncharge_out; + +	copy_user_highpage(new_page, fault_page, address, vma); +	__SetPageUptodate(new_page); + +	pte = pte_offset_map_lock(mm, pmd, address, &ptl); +	if (unlikely(!pte_same(*pte, orig_pte))) { +		pte_unmap_unlock(pte, ptl); +		unlock_page(fault_page); +		page_cache_release(fault_page); +		goto uncharge_out; +	} +	do_set_pte(vma, address, new_page, pte, true, true); +	pte_unmap_unlock(pte, ptl); +	unlock_page(fault_page); +	page_cache_release(fault_page); +	return ret; +uncharge_out: +	mem_cgroup_uncharge_page(new_page); +	page_cache_release(new_page);  	return ret; +} + +static int do_shared_fault(struct mm_struct 
*mm, struct vm_area_struct *vma, +		unsigned long address, pmd_t *pmd, +		pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ +	struct page *fault_page; +	struct address_space *mapping; +	spinlock_t *ptl; +	pte_t *pte; +	int dirtied = 0; +	int ret, tmp; + +	ret = __do_fault(vma, address, pgoff, flags, &fault_page); +	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) +		return ret; + +	/* +	 * Check if the backing address space wants to know that the page is +	 * about to become writable +	 */ +	if (vma->vm_ops->page_mkwrite) { +		unlock_page(fault_page); +		tmp = do_page_mkwrite(vma, fault_page, address); +		if (unlikely(!tmp || +				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { +			page_cache_release(fault_page); +			return tmp; +		} +	} + +	pte = pte_offset_map_lock(mm, pmd, address, &ptl); +	if (unlikely(!pte_same(*pte, orig_pte))) { +		pte_unmap_unlock(pte, ptl); +		unlock_page(fault_page); +		page_cache_release(fault_page); +		return ret; +	} +	do_set_pte(vma, address, fault_page, pte, true, false); +	pte_unmap_unlock(pte, ptl); + +	if (set_page_dirty(fault_page)) +		dirtied = 1; +	mapping = fault_page->mapping; +	unlock_page(fault_page); +	if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { +		/* +		 * Some device drivers do not set page.mapping but still +		 * dirty their pages +		 */ +		balance_dirty_pages_ratelimited(mapping); +	} + +	/* file_update_time outside page_lock */ +	if (vma->vm_file && !vma->vm_ops->page_mkwrite) +		file_update_time(vma->vm_file); -unwritable_page: -	page_cache_release(page);  	return ret;  } @@ -3099,7 +3024,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,  			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;  	pte_unmap(page_table); -	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +	if (!(flags & FAULT_FLAG_WRITE)) +		return do_read_fault(mm, vma, address, pmd, pgoff, flags, +				orig_pte); +	if (!(vma->vm_flags & VM_SHARED)) +		return do_cow_fault(mm, vma, address, pmd, pgoff, flags, +				orig_pte); +	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);  }  /* @@ -3131,7 +3062,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	}  	pgoff = pte_to_pgoff(orig_pte); -	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +	if (!(flags & FAULT_FLAG_WRITE)) +		return do_read_fault(mm, vma, address, pmd, pgoff, flags, +				orig_pte); +	if (!(vma->vm_flags & VM_SHARED)) +		return do_cow_fault(mm, vma, address, pmd, pgoff, flags, +				orig_pte); +	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +				unsigned long addr, int page_nid, +				int *flags) +{ +	get_page(page); + +	count_vm_numa_event(NUMA_HINT_FAULTS); +	if (page_nid == numa_node_id()) { +		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); +		*flags |= TNF_FAULT_LOCAL; +	} + +	return mpol_misplaced(page, vma, addr); +} + +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ +	struct page *page = NULL; +	spinlock_t *ptl; +	int page_nid = -1; +	int last_cpupid; +	int target_nid; +	bool migrated = false; +	int flags = 0; + +	/* +	* The "pte" at this point cannot be used safely without +	* validation through pte_unmap_same(). It's of NUMA type but +	* the pfn may be screwed if the read is non atomic. 
+	* +	* ptep_modify_prot_start is not called as this is clearing +	* the _PAGE_NUMA bit and it is not really expected that there +	* would be concurrent hardware modifications to the PTE. +	*/ +	ptl = pte_lockptr(mm, pmd); +	spin_lock(ptl); +	if (unlikely(!pte_same(*ptep, pte))) { +		pte_unmap_unlock(ptep, ptl); +		goto out; +	} + +	pte = pte_mknonnuma(pte); +	set_pte_at(mm, addr, ptep, pte); +	update_mmu_cache(vma, addr, ptep); + +	page = vm_normal_page(vma, addr, pte); +	if (!page) { +		pte_unmap_unlock(ptep, ptl); +		return 0; +	} +	BUG_ON(is_zero_pfn(page_to_pfn(page))); + +	/* +	 * Avoid grouping on DSO/COW pages in specific and RO pages +	 * in general, RO pages shouldn't hurt as much anyway since +	 * they can be in shared cache state. +	 */ +	if (!pte_write(pte)) +		flags |= TNF_NO_GROUP; + +	/* +	 * Flag if the page is shared between multiple address spaces. This +	 * is later used when determining whether to group tasks together +	 */ +	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) +		flags |= TNF_SHARED; + +	last_cpupid = page_cpupid_last(page); +	page_nid = page_to_nid(page); +	target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); +	pte_unmap_unlock(ptep, ptl); +	if (target_nid == -1) { +		put_page(page); +		goto out; +	} + +	/* Migrate to the requested node */ +	migrated = migrate_misplaced_page(page, vma, target_nid); +	if (migrated) { +		page_nid = target_nid; +		flags |= TNF_MIGRATED; +	} + +out: +	if (page_nid != -1) +		task_numa_fault(last_cpupid, page_nid, 1, flags); +	return 0;  }  /* @@ -3147,9 +3174,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,   * but allow concurrent faults), and pte mapped but not yet locked.   * We return with mmap_sem still held, but pte unmapped and unlocked.   */ -static inline int handle_pte_fault(struct mm_struct *mm, -		struct vm_area_struct *vma, unsigned long address, -		pte_t *pte, pmd_t *pmd, unsigned int flags) +static int handle_pte_fault(struct mm_struct *mm, +		     struct vm_area_struct *vma, unsigned long address, +		     pte_t *pte, pmd_t *pmd, unsigned int flags)  {  	pte_t entry;  	spinlock_t *ptl; @@ -3172,6 +3199,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,  					pte, pmd, flags, entry);  	} +	if (pte_numa(entry)) +		return do_numa_page(mm, vma, address, entry, pte, pmd); +  	ptl = pte_lockptr(mm, pmd);  	spin_lock(ptl);  	if (unlikely(!pte_same(*pte, entry))) @@ -3203,21 +3233,14 @@ unlock:  /*   * By the time we get here, we already hold the mm semaphore   */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, -		unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +			     unsigned long address, unsigned int flags)  {  	pgd_t *pgd;  	pud_t *pud;  	pmd_t *pmd;  	pte_t *pte; -	__set_current_state(TASK_RUNNING); - -	count_vm_event(PGFAULT); - -	/* do counter updates before entering really critical section. 
*/ -	check_sync_rss_stat(current); -  	if (unlikely(is_vm_hugetlb_page(vma)))  		return hugetlb_fault(mm, vma, address, flags); @@ -3228,13 +3251,105 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	pmd = pmd_alloc(mm, pud, address);  	if (!pmd)  		return VM_FAULT_OOM; -	pte = pte_alloc_map(mm, pmd, address); -	if (!pte) +	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { +		int ret = VM_FAULT_FALLBACK; +		if (!vma->vm_ops) +			ret = do_huge_pmd_anonymous_page(mm, vma, address, +					pmd, flags); +		if (!(ret & VM_FAULT_FALLBACK)) +			return ret; +	} else { +		pmd_t orig_pmd = *pmd; +		int ret; + +		barrier(); +		if (pmd_trans_huge(orig_pmd)) { +			unsigned int dirty = flags & FAULT_FLAG_WRITE; + +			/* +			 * If the pmd is splitting, return and retry the +			 * the fault.  Alternative: wait until the split +			 * is done, and goto retry. +			 */ +			if (pmd_trans_splitting(orig_pmd)) +				return 0; + +			if (pmd_numa(orig_pmd)) +				return do_huge_pmd_numa_page(mm, vma, address, +							     orig_pmd, pmd); + +			if (dirty && !pmd_write(orig_pmd)) { +				ret = do_huge_pmd_wp_page(mm, vma, address, pmd, +							  orig_pmd); +				if (!(ret & VM_FAULT_FALLBACK)) +					return ret; +			} else { +				huge_pmd_set_accessed(mm, vma, address, pmd, +						      orig_pmd, dirty); +				return 0; +			} +		} +	} + +	/* +	 * Use __pte_alloc instead of pte_alloc_map, because we can't +	 * run pte_offset_map on the pmd, if an huge pmd could +	 * materialize from under us from a different thread. +	 */ +	if (unlikely(pmd_none(*pmd)) && +	    unlikely(__pte_alloc(mm, vma, pmd, address)))  		return VM_FAULT_OOM; +	/* if an huge pmd materialized from under us just retry later */ +	if (unlikely(pmd_trans_huge(*pmd))) +		return 0; +	/* +	 * A regular pmd is established and it can't morph into a huge pmd +	 * from under us anymore at this point because we hold the mmap_sem +	 * read mode and khugepaged takes it in write mode. So now it's +	 * safe to run pte_offset_map(). +	 */ +	pte = pte_offset_map(pmd, address);  	return handle_pte_fault(mm, vma, address, pte, pmd, flags);  } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +		    unsigned long address, unsigned int flags) +{ +	int ret; + +	__set_current_state(TASK_RUNNING); + +	count_vm_event(PGFAULT); +	mem_cgroup_count_vm_event(mm, PGFAULT); + +	/* do counter updates before entering really critical section. */ +	check_sync_rss_stat(current); + +	/* +	 * Enable the memcg OOM handling for faults triggered in user +	 * space.  Kernel faults are handled more gracefully. +	 */ +	if (flags & FAULT_FLAG_USER) +		mem_cgroup_oom_enable(); + +	ret = __handle_mm_fault(mm, vma, address, flags); + +	if (flags & FAULT_FLAG_USER) { +		mem_cgroup_oom_disable(); +                /* +                 * The task may have entered a memcg OOM situation but +                 * if the allocation error was handled gracefully (no +                 * VM_FAULT_OOM), there is no need to kill anything. +                 * Just clean up the OOM state peacefully. +                 */ +                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) +                        mem_cgroup_oom_synchronize(false); +	} + +	return ret; +} +  #ifndef __PAGETABLE_PUD_FOLDED  /*   * Allocate page upper directory. 
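handle_mm_fault() is now a thin wrapper that does the PGFAULT accounting and the memcg OOM enable/disable bracketing around __handle_mm_fault(), which deals with the huge-pmd cases before falling back to handle_pte_fault(). The sketch below shows how an architecture fault handler typically drives this entry point; example_do_user_fault() is illustrative only (real handlers also handle stack expansion, retries and signal delivery), but the flags and return bits are the ones used in this file.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/errno.h>

/*
 * Illustrative arch-fault-handler sketch.  FAULT_FLAG_ALLOW_RETRY is
 * deliberately left out so that mmap_sem is still held when
 * handle_mm_fault() returns.
 */
static int example_do_user_fault(struct mm_struct *mm, unsigned long address,
				 bool is_write)
{
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_USER | FAULT_FLAG_KILLABLE;
	int fault;

	if (is_write)
		flags |= FAULT_FLAG_WRITE;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return -EFAULT;		/* real handlers raise SIGSEGV here */
	}

	fault = handle_mm_fault(mm, vma, address, flags);
	up_read(&mm->mmap_sem);

	if (fault & VM_FAULT_OOM)
		return -ENOMEM;		/* or pagefault_out_of_memory() */
	if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON))
		return -EFAULT;
	return 0;
}

The FAULT_FLAG_USER bit is what makes the new mem_cgroup_oom_enable()/mem_cgroup_oom_disable() bracketing in handle_mm_fault() take effect.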
@@ -3288,25 +3403,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)  }  #endif /* __PAGETABLE_PMD_FOLDED */ -int make_pages_present(unsigned long addr, unsigned long end) -{ -	int ret, len, write; -	struct vm_area_struct * vma; - -	vma = find_vma(current->mm, addr); -	if (!vma) -		return -ENOMEM; -	write = (vma->vm_flags & VM_WRITE) != 0; -	BUG_ON(addr >= end); -	BUG_ON(end > vma->vm_end); -	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; -	ret = get_user_pages(current, current->mm, addr, -			len, write, 0, NULL, NULL); -	if (ret < 0) -		return ret; -	return ret == len ? 0 : -EFAULT; -} -  #if !defined(__HAVE_ARCH_GATE_AREA)  #if defined(AT_SYSINFO_EHDR) @@ -3319,19 +3415,13 @@ static int __init gate_vma_init(void)  	gate_vma.vm_end = FIXADDR_USER_END;  	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;  	gate_vma.vm_page_prot = __P101; -	/* -	 * Make sure the vDSO gets into every core dump. -	 * Dumping its contents makes post-mortem fully interpretable later -	 * without matching up the same kernel and hardware config to see -	 * what PC values meant. -	 */ -	gate_vma.vm_flags |= VM_ALWAYSDUMP; +  	return 0;  }  __initcall(gate_vma_init);  #endif -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm)  {  #ifdef AT_SYSINFO_EHDR  	return &gate_vma; @@ -3340,7 +3430,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)  #endif  } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr)  {  #ifdef AT_SYSINFO_EHDR  	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) @@ -3368,6 +3458,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,  		goto out;  	pmd = pmd_offset(pud, address); +	VM_BUG_ON(pmd_trans_huge(*pmd));  	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))  		goto out; @@ -3477,23 +3568,19 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,  	return len;  } +EXPORT_SYMBOL_GPL(generic_access_phys);  #endif  /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space as given in mm.  If non-NULL, use the + * given task for page fault accounting.   
*/ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +		unsigned long addr, void *buf, int len, int write)  { -	struct mm_struct *mm;  	struct vm_area_struct *vma;  	void *old_buf = buf; -	mm = get_task_mm(tsk); -	if (!mm) -		return 0; -  	down_read(&mm->mmap_sem);  	/* ignore errors, just check how much was successfully transferred */  	while (len) { @@ -3510,7 +3597,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in  			 */  #ifdef CONFIG_HAVE_IOREMAP_PROT  			vma = find_vma(mm, addr); -			if (!vma) +			if (!vma || vma->vm_start > addr)  				break;  			if (vma->vm_ops && vma->vm_ops->access)  				ret = vma->vm_ops->access(vma, addr, buf, @@ -3542,11 +3629,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in  		addr += bytes;  	}  	up_read(&mm->mmap_sem); -	mmput(mm);  	return buf - old_buf;  } +/** + * access_remote_vm - access another process' address space + * @mm:		the mm_struct of the target address space + * @addr:	start address to access + * @buf:	source or destination buffer + * @len:	number of bytes to transfer + * @write:	whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, +		void *buf, int len, int write) +{ +	return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, +		void *buf, int len, int write) +{ +	struct mm_struct *mm; +	int ret; + +	mm = get_task_mm(tsk); +	if (!mm) +		return 0; + +	ret = __access_remote_vm(tsk, mm, addr, buf, len, write); +	mmput(mm); + +	return ret; +} +  /*   * Print the name of a VMA.   */ @@ -3568,24 +3691,21 @@ void print_vma_addr(char *prefix, unsigned long ip)  		struct file *f = vma->vm_file;  		char *buf = (char *)__get_free_page(GFP_KERNEL);  		if (buf) { -			char *p, *s; +			char *p;  			p = d_path(&f->f_path, buf, PAGE_SIZE);  			if (IS_ERR(p))  				p = "?"; -			s = strrchr(p, '/'); -			if (s) -				p = s+1; -			printk("%s%s[%lx+%lx]", prefix, p, +			printk("%s%s[%lx+%lx]", prefix, kbasename(p),  					vma->vm_start,  					vma->vm_end - vma->vm_start);  			free_page((unsigned long)buf);  		}  	} -	up_read(&current->mm->mmap_sem); +	up_read(&mm->mmap_sem);  } -#ifdef CONFIG_PROVE_LOCKING +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)  void might_fault(void)  {  	/* @@ -3597,14 +3717,116 @@ void might_fault(void)  	if (segment_eq(get_fs(), KERNEL_DS))  		return; -	might_sleep();  	/*  	 * it would be nicer only to annotate paths which are not under  	 * pagefault_disable, however that requires a larger audit and  	 * providing helpers like get_user_atomic.  	 
*/ -	if (!in_atomic() && current->mm) +	if (in_atomic()) +		return; + +	__might_sleep(__FILE__, __LINE__, 0); + +	if (current->mm)  		might_lock_read(&current->mm->mmap_sem);  }  EXPORT_SYMBOL(might_fault);  #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, +				unsigned long addr, +				unsigned int pages_per_huge_page) +{ +	int i; +	struct page *p = page; + +	might_sleep(); +	for (i = 0; i < pages_per_huge_page; +	     i++, p = mem_map_next(p, page, i)) { +		cond_resched(); +		clear_user_highpage(p, addr + i * PAGE_SIZE); +	} +} +void clear_huge_page(struct page *page, +		     unsigned long addr, unsigned int pages_per_huge_page) +{ +	int i; + +	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { +		clear_gigantic_page(page, addr, pages_per_huge_page); +		return; +	} + +	might_sleep(); +	for (i = 0; i < pages_per_huge_page; i++) { +		cond_resched(); +		clear_user_highpage(page + i, addr + i * PAGE_SIZE); +	} +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, +				    unsigned long addr, +				    struct vm_area_struct *vma, +				    unsigned int pages_per_huge_page) +{ +	int i; +	struct page *dst_base = dst; +	struct page *src_base = src; + +	for (i = 0; i < pages_per_huge_page; ) { +		cond_resched(); +		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + +		i++; +		dst = mem_map_next(dst, dst_base, i); +		src = mem_map_next(src, src_base, i); +	} +} + +void copy_user_huge_page(struct page *dst, struct page *src, +			 unsigned long addr, struct vm_area_struct *vma, +			 unsigned int pages_per_huge_page) +{ +	int i; + +	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { +		copy_user_gigantic_page(dst, src, addr, vma, +					pages_per_huge_page); +		return; +	} + +	might_sleep(); +	for (i = 0; i < pages_per_huge_page; i++) { +		cond_resched(); +		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); +	} +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ +	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, +			SLAB_PANIC, NULL); +} + +bool ptlock_alloc(struct page *page) +{ +	spinlock_t *ptl; + +	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); +	if (!ptl) +		return false; +	page->ptl = ptl; +	return true; +} + +void ptlock_free(struct page *page) +{ +	kmem_cache_free(page_ptl_cachep, page->ptl); +} +#endif
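The split-PTE-lock helpers at the end are only built when spinlock_t is too large to be embedded in struct page (ALLOC_SPLIT_PTLOCKS, for example with lockdep enabled); in that configuration page->ptl points at a lock from the "page->ptl" kmem_cache. The sketch below shows how a page-table page would pair these helpers over its lifetime, assuming that configuration; the ctor/dtor names are hypothetical, not the kernel's own.

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/spinlock.h>

/*
 * Hypothetical constructor/destructor pairing for a pte page-table page
 * under USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS: the lock lives in
 * the slab cache set up by ptlock_cache_init() above.
 */
static bool example_pte_page_ctor(struct page *page)
{
	if (!ptlock_alloc(page))	/* allocates and installs page->ptl */
		return false;
	spin_lock_init(page->ptl);
	return true;
}

static void example_pte_page_dtor(struct page *page)
{
	ptlock_free(page);		/* returns page->ptl to the cache */
}

When the spinlock does fit in struct page, none of this is compiled in and the lock is embedded directly, so the allocation and free steps disappear.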
