Diffstat (limited to 'mm/mmap.c')
-rw-r--r--	mm/mmap.c	1817
1 files changed, 1218 insertions, 599 deletions
diff --git a/mm/mmap.c b/mm/mmap.c index b179abb1474..129b847d30c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,9 +6,13 @@   * Address space accounting code	<alan@lxorguk.ukuu.org.uk>   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/backing-dev.h>  #include <linux/mm.h> +#include <linux/vmacache.h>  #include <linux/shm.h>  #include <linux/mman.h>  #include <linux/pagemap.h> @@ -22,13 +26,20 @@  #include <linux/security.h>  #include <linux/hugetlb.h>  #include <linux/profile.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/mount.h>  #include <linux/mempolicy.h>  #include <linux/rmap.h>  #include <linux/mmu_notifier.h>  #include <linux/perf_event.h>  #include <linux/audit.h> +#include <linux/khugepaged.h> +#include <linux/uprobes.h> +#include <linux/rbtree_augmented.h> +#include <linux/sched/sysctl.h> +#include <linux/notifier.h> +#include <linux/memory.h> +#include <linux/printk.h>  #include <asm/uaccess.h>  #include <asm/cacheflush.h> @@ -49,12 +60,6 @@ static void unmap_region(struct mm_struct *mm,  		struct vm_area_struct *vma, struct vm_area_struct *prev,  		unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB -  /* description of effects of mapping type and prot in current implementation.   * this is due to the limited x86 page protection hardware.  The expected   * behavior is in parens: @@ -83,10 +88,31 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)  }  EXPORT_SYMBOL(vm_get_page_prot); -int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50;	/* default is 50% */ +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */ +int sysctl_overcommit_ratio __read_mostly = 50;	/* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly;  int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -struct percpu_counter vm_committed_as; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. 
+ */ +unsigned long vm_memory_committed(void) +{ +	return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed);  /*   * Check that a process has enough memory to allocate a new virtual @@ -106,7 +132,7 @@ struct percpu_counter vm_committed_as;   */  int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)  { -	unsigned long free, allowed; +	unsigned long free, allowed, reserve;  	vm_acct_memory(pages); @@ -117,10 +143,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)  		return 0;  	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { -		unsigned long n; +		free = global_page_state(NR_FREE_PAGES); +		free += global_page_state(NR_FILE_PAGES); -		free = global_page_state(NR_FILE_PAGES); -		free += nr_swap_pages; +		/* +		 * shmem pages shouldn't be counted as free in this +		 * case, they can't be purged, only swapped out, and +		 * that won't affect the overall amount of available +		 * memory in the system. +		 */ +		free -= global_page_state(NR_SHMEM); + +		free += get_nr_swap_pages();  		/*  		 * Any slabs which are created with the @@ -131,34 +165,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)  		free += global_page_state(NR_SLAB_RECLAIMABLE);  		/* -		 * Leave the last 3% for root -		 */ -		if (!cap_sys_admin) -			free -= free / 32; - -		if (free > pages) -			return 0; - -		/* -		 * nr_free_pages() is very expensive on large systems, -		 * only call if we're about to fail. -		 */ -		n = nr_free_pages(); - -		/*  		 * Leave reserved pages. The pages are not for anonymous pages.  		 */ -		if (n <= totalreserve_pages) +		if (free <= totalreserve_pages)  			goto error;  		else -			n -= totalreserve_pages; +			free -= totalreserve_pages;  		/* -		 * Leave the last 3% for root +		 * Reserve some for root  		 */  		if (!cap_sys_admin) -			n -= n / 32; -		free += n; +			free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);  		if (free > pages)  			return 0; @@ -166,19 +184,20 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)  		goto error;  	} -	allowed = (totalram_pages - hugetlb_total_pages()) -	       	* sysctl_overcommit_ratio / 100; +	allowed = vm_commit_limit();  	/* -	 * Leave the last 3% for root +	 * Reserve some for root  	 */  	if (!cap_sys_admin) -		allowed -= allowed / 32; -	allowed += total_swap_pages; +		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); -	/* Don't let a single process grow too big: -	   leave 3% of the size of this process for other processes */ -	if (mm) -		allowed -= mm->total_vm / 32; +	/* +	 * Don't let a single process grow so big a user can't recover +	 */ +	if (mm) { +		reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); +		allowed -= min(mm->total_vm / 32, reserve); +	}  	if (percpu_counter_read_positive(&vm_committed_as) < allowed)  		return 0; @@ -189,26 +208,26 @@ error:  }  /* - * Requires inode->i_mapping->i_mmap_lock + * Requires inode->i_mapping->i_mmap_mutex   */  static void __remove_shared_vm_struct(struct vm_area_struct *vma,  		struct file *file, struct address_space *mapping)  {  	if (vma->vm_flags & VM_DENYWRITE) -		atomic_inc(&file->f_path.dentry->d_inode->i_writecount); +		atomic_inc(&file_inode(file)->i_writecount);  	if (vma->vm_flags & VM_SHARED)  		mapping->i_mmap_writable--;  	flush_dcache_mmap_lock(mapping);  	if (unlikely(vma->vm_flags & VM_NONLINEAR)) -		list_del_init(&vma->shared.vm_set.list); +		list_del_init(&vma->shared.nonlinear);  	else -		
vma_prio_tree_remove(vma, &mapping->i_mmap); +		vma_interval_tree_remove(vma, &mapping->i_mmap);  	flush_dcache_mmap_unlock(mapping);  }  /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide   * vma from rmap and vmtruncate before freeing its page tables.   */  void unlink_file_vma(struct vm_area_struct *vma) @@ -217,9 +236,9 @@ void unlink_file_vma(struct vm_area_struct *vma)  	if (file) {  		struct address_space *mapping = file->f_mapping; -		spin_lock(&mapping->i_mmap_lock); +		mutex_lock(&mapping->i_mmap_mutex);  		__remove_shared_vm_struct(vma, file, mapping); -		spin_unlock(&mapping->i_mmap_lock); +		mutex_unlock(&mapping->i_mmap_mutex);  	}  } @@ -233,27 +252,35 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)  	might_sleep();  	if (vma->vm_ops && vma->vm_ops->close)  		vma->vm_ops->close(vma); -	if (vma->vm_file) { +	if (vma->vm_file)  		fput(vma->vm_file); -		if (vma->vm_flags & VM_EXECUTABLE) -			removed_exe_file_vma(vma->vm_mm); -	}  	mpol_put(vma_policy(vma));  	kmem_cache_free(vm_area_cachep, vma);  	return next;  } +static unsigned long do_brk(unsigned long addr, unsigned long len); +  SYSCALL_DEFINE1(brk, unsigned long, brk)  {  	unsigned long rlim, retval;  	unsigned long newbrk, oldbrk;  	struct mm_struct *mm = current->mm;  	unsigned long min_brk; +	bool populate;  	down_write(&mm->mmap_sem);  #ifdef CONFIG_COMPAT_BRK -	min_brk = mm->end_code; +	/* +	 * CONFIG_COMPAT_BRK can still be overridden by setting +	 * randomize_va_space to 2, which will still cause mm->start_brk +	 * to be arbitrarily shifted +	 */ +	if (current->brk_randomized) +		min_brk = mm->start_brk; +	else +		min_brk = mm->end_data;  #else  	min_brk = mm->start_brk;  #endif @@ -290,75 +317,217 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)  	/* Ok, looks good - let it rip. 
*/  	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)  		goto out; +  set_brk:  	mm->brk = brk; +	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; +	up_write(&mm->mmap_sem); +	if (populate) +		mm_populate(oldbrk, newbrk - oldbrk); +	return brk; +  out:  	retval = mm->brk;  	up_write(&mm->mmap_sem);  	return retval;  } -#ifdef DEBUG_MM_RB +static long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ +	unsigned long max, subtree_gap; +	max = vma->vm_start; +	if (vma->vm_prev) +		max -= vma->vm_prev->vm_end; +	if (vma->vm_rb.rb_left) { +		subtree_gap = rb_entry(vma->vm_rb.rb_left, +				struct vm_area_struct, vm_rb)->rb_subtree_gap; +		if (subtree_gap > max) +			max = subtree_gap; +	} +	if (vma->vm_rb.rb_right) { +		subtree_gap = rb_entry(vma->vm_rb.rb_right, +				struct vm_area_struct, vm_rb)->rb_subtree_gap; +		if (subtree_gap > max) +			max = subtree_gap; +	} +	return max; +} + +#ifdef CONFIG_DEBUG_VM_RB  static int browse_rb(struct rb_root *root)  { -	int i = 0, j; +	int i = 0, j, bug = 0;  	struct rb_node *nd, *pn = NULL;  	unsigned long prev = 0, pend = 0;  	for (nd = rb_first(root); nd; nd = rb_next(nd)) {  		struct vm_area_struct *vma;  		vma = rb_entry(nd, struct vm_area_struct, vm_rb); -		if (vma->vm_start < prev) -			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; -		if (vma->vm_start < pend) -			printk("vm_start %lx pend %lx\n", vma->vm_start, pend); -		if (vma->vm_start > vma->vm_end) -			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); +		if (vma->vm_start < prev) { +			pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); +			bug = 1; +		} +		if (vma->vm_start < pend) { +			pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); +			bug = 1; +		} +		if (vma->vm_start > vma->vm_end) { +			pr_info("vm_end %lx < vm_start %lx\n", +				vma->vm_end, vma->vm_start); +			bug = 1; +		} +		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { +			pr_info("free gap %lx, correct %lx\n", +			       vma->rb_subtree_gap, +			       vma_compute_subtree_gap(vma)); +			bug = 1; +		}  		i++;  		pn = nd;  		prev = vma->vm_start;  		pend = vma->vm_end;  	}  	j = 0; -	for (nd = pn; nd; nd = rb_prev(nd)) { +	for (nd = pn; nd; nd = rb_prev(nd))  		j++; +	if (i != j) { +		pr_info("backwards %d, forwards %d\n", j, i); +		bug = 1;  	} -	if (i != j) -		printk("backwards %d, forwards %d\n", j, i), i = 0; -	return i; +	return bug ? 
-1 : i;  } -void validate_mm(struct mm_struct *mm) +static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) +{ +	struct rb_node *nd; + +	for (nd = rb_first(root); nd; nd = rb_next(nd)) { +		struct vm_area_struct *vma; +		vma = rb_entry(nd, struct vm_area_struct, vm_rb); +		BUG_ON(vma != ignore && +		       vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); +	} +} + +static void validate_mm(struct mm_struct *mm)  {  	int bug = 0;  	int i = 0; -	struct vm_area_struct *tmp = mm->mmap; -	while (tmp) { -		tmp = tmp->vm_next; +	unsigned long highest_address = 0; +	struct vm_area_struct *vma = mm->mmap; +	while (vma) { +		struct anon_vma_chain *avc; +		vma_lock_anon_vma(vma); +		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) +			anon_vma_interval_tree_verify(avc); +		vma_unlock_anon_vma(vma); +		highest_address = vma->vm_end; +		vma = vma->vm_next;  		i++;  	} -	if (i != mm->map_count) -		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; +	if (i != mm->map_count) { +		pr_info("map_count %d vm_next %d\n", mm->map_count, i); +		bug = 1; +	} +	if (highest_address != mm->highest_vm_end) { +		pr_info("mm->highest_vm_end %lx, found %lx\n", +		       mm->highest_vm_end, highest_address); +		bug = 1; +	}  	i = browse_rb(&mm->mm_rb); -	if (i != mm->map_count) -		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; +	if (i != mm->map_count) { +		pr_info("map_count %d rb %d\n", mm->map_count, i); +		bug = 1; +	}  	BUG_ON(bug);  }  #else +#define validate_mm_rb(root, ignore) do { } while (0)  #define validate_mm(mm) do { } while (0)  #endif -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, -		struct vm_area_struct **pprev, struct rb_node ***rb_link, -		struct rb_node ** rb_parent) +RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, +		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap) + +/* + * Update augmented rbtree rb_subtree_gap values after vma->vm_start or + * vma->vm_prev->vm_end values changed, without modifying the vma's position + * in the rbtree. + */ +static void vma_gap_update(struct vm_area_struct *vma)  { -	struct vm_area_struct * vma; -	struct rb_node ** __rb_link, * __rb_parent, * rb_prev; +	/* +	 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback +	 * function that does exacltly what we want. +	 */ +	vma_gap_callbacks_propagate(&vma->vm_rb, NULL); +} + +static inline void vma_rb_insert(struct vm_area_struct *vma, +				 struct rb_root *root) +{ +	/* All rb_subtree_gap values must be consistent prior to insertion */ +	validate_mm_rb(root, NULL); + +	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +{ +	/* +	 * All rb_subtree_gap values must be consistent prior to erase, +	 * with the possible exception of the vma being erased. +	 */ +	validate_mm_rb(root, vma); + +	/* +	 * Note rb_erase_augmented is a fairly large inline function, +	 * so make sure we instantiate it only once with our desired +	 * augmented rbtree callbacks. +	 */ +	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). 
+ * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ +	struct anon_vma_chain *avc; + +	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) +		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ +	struct anon_vma_chain *avc; + +	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) +		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +static int find_vma_links(struct mm_struct *mm, unsigned long addr, +		unsigned long end, struct vm_area_struct **pprev, +		struct rb_node ***rb_link, struct rb_node **rb_parent) +{ +	struct rb_node **__rb_link, *__rb_parent, *rb_prev;  	__rb_link = &mm->mm_rb.rb_node;  	rb_prev = __rb_parent = NULL; -	vma = NULL;  	while (*__rb_link) {  		struct vm_area_struct *vma_tmp; @@ -367,9 +536,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,  		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);  		if (vma_tmp->vm_end > addr) { -			vma = vma_tmp; -			if (vma_tmp->vm_start <= addr) -				break; +			/* Fail if an existing vma overlaps the area */ +			if (vma_tmp->vm_start < end) +				return -ENOMEM;  			__rb_link = &__rb_parent->rb_left;  		} else {  			rb_prev = __rb_parent; @@ -382,37 +551,59 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,  		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);  	*rb_link = __rb_link;  	*rb_parent = __rb_parent; -	return vma; +	return 0;  } -static inline void -__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, -		struct vm_area_struct *prev, struct rb_node *rb_parent) +static unsigned long count_vma_pages_range(struct mm_struct *mm, +		unsigned long addr, unsigned long end)  { -	struct vm_area_struct *next; +	unsigned long nr_pages = 0; +	struct vm_area_struct *vma; -	vma->vm_prev = prev; -	if (prev) { -		next = prev->vm_next; -		prev->vm_next = vma; -	} else { -		mm->mmap = vma; -		if (rb_parent) -			next = rb_entry(rb_parent, -					struct vm_area_struct, vm_rb); -		else -			next = NULL; +	/* Find first overlaping mapping */ +	vma = find_vma_intersection(mm, addr, end); +	if (!vma) +		return 0; + +	nr_pages = (min(end, vma->vm_end) - +		max(addr, vma->vm_start)) >> PAGE_SHIFT; + +	/* Iterate over the rest of the overlaps */ +	for (vma = vma->vm_next; vma; vma = vma->vm_next) { +		unsigned long overlap_len; + +		if (vma->vm_start > end) +			break; + +		overlap_len = min(end, vma->vm_end) - vma->vm_start; +		nr_pages += overlap_len >> PAGE_SHIFT;  	} -	vma->vm_next = next; -	if (next) -		next->vm_prev = vma; + +	return nr_pages;  }  void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,  		struct rb_node **rb_link, struct rb_node *rb_parent)  { +	/* Update tracking information for the gap following the new vma. */ +	if (vma->vm_next) +		vma_gap_update(vma->vm_next); +	else +		mm->highest_vm_end = vma->vm_end; + +	/* +	 * vma->vm_prev wasn't known when we followed the rbtree to find the +	 * correct insertion point for that vma. As a result, we could not +	 * update the vma vm_rb parents rb_subtree_gap values on the way down. 
+	 * So, we first insert the vma with a zero rb_subtree_gap value +	 * (to be consistent with what we did on the way down), and then +	 * immediately update the gap to the correct value. Finally we +	 * rebalance the rbtree after all augmented values have been set. +	 */  	rb_link_node(&vma->vm_rb, rb_parent, rb_link); -	rb_insert_color(&vma->vm_rb, &mm->mm_rb); +	vma->rb_subtree_gap = 0; +	vma_gap_update(vma); +	vma_rb_insert(vma, &mm->mm_rb);  }  static void __vma_link_file(struct vm_area_struct *vma) @@ -424,7 +615,7 @@ static void __vma_link_file(struct vm_area_struct *vma)  		struct address_space *mapping = file->f_mapping;  		if (vma->vm_flags & VM_DENYWRITE) -			atomic_dec(&file->f_path.dentry->d_inode->i_writecount); +			atomic_dec(&file_inode(file)->i_writecount);  		if (vma->vm_flags & VM_SHARED)  			mapping->i_mmap_writable++; @@ -432,7 +623,7 @@ static void __vma_link_file(struct vm_area_struct *vma)  		if (unlikely(vma->vm_flags & VM_NONLINEAR))  			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);  		else -			vma_prio_tree_insert(vma, &mapping->i_mmap); +			vma_interval_tree_insert(vma, &mapping->i_mmap);  		flush_dcache_mmap_unlock(mapping);  	}  } @@ -452,36 +643,33 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,  {  	struct address_space *mapping = NULL; -	if (vma->vm_file) +	if (vma->vm_file) {  		mapping = vma->vm_file->f_mapping; - -	if (mapping) { -		spin_lock(&mapping->i_mmap_lock); -		vma->vm_truncate_count = mapping->truncate_count; +		mutex_lock(&mapping->i_mmap_mutex);  	}  	__vma_link(mm, vma, prev, rb_link, rb_parent);  	__vma_link_file(vma);  	if (mapping) -		spin_unlock(&mapping->i_mmap_lock); +		mutex_unlock(&mapping->i_mmap_mutex);  	mm->map_count++;  	validate_mm(mm);  }  /* - * Helper for vma_adjust in the split_vma insert case: - * insert vm structure into list and rbtree and anon_vma, - * but it has already been inserted into prio_tree earlier. + * Helper for vma_adjust() in the split_vma insert case: insert a vma into the + * mm's list and rbtree.  It has already been inserted into the interval tree.   
*/  static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)  { -	struct vm_area_struct *__vma, *prev; +	struct vm_area_struct *prev;  	struct rb_node **rb_link, *rb_parent; -	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); -	BUG_ON(__vma && __vma->vm_start < vma->vm_end); +	if (find_vma_links(mm, vma->vm_start, vma->vm_end, +			   &prev, &rb_link, &rb_parent)) +		BUG();  	__vma_link(mm, vma, prev, rb_link, rb_parent);  	mm->map_count++;  } @@ -490,14 +678,15 @@ static inline void  __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,  		struct vm_area_struct *prev)  { -	struct vm_area_struct *next = vma->vm_next; +	struct vm_area_struct *next; -	prev->vm_next = next; +	vma_rb_erase(vma, &mm->mm_rb); +	prev->vm_next = next = vma->vm_next;  	if (next)  		next->vm_prev = prev; -	rb_erase(&vma->vm_rb, &mm->mm_rb); -	if (mm->mmap_cache == vma) -		mm->mmap_cache = prev; + +	/* Kill the cache */ +	vmacache_invalidate(mm);  }  /* @@ -514,9 +703,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,  	struct vm_area_struct *next = vma->vm_next;  	struct vm_area_struct *importer = NULL;  	struct address_space *mapping = NULL; -	struct prio_tree_root *root = NULL; +	struct rb_root *root = NULL;  	struct anon_vma *anon_vma = NULL;  	struct file *file = vma->vm_file; +	bool start_changed = false, end_changed = false;  	long adjust_next = 0;  	int remove_next = 0; @@ -565,21 +755,19 @@ again:			remove_next = 1 + (end > next->vm_end);  	if (file) {  		mapping = file->f_mapping; -		if (!(vma->vm_flags & VM_NONLINEAR)) +		if (!(vma->vm_flags & VM_NONLINEAR)) {  			root = &mapping->i_mmap; -		spin_lock(&mapping->i_mmap_lock); -		if (importer && -		    vma->vm_truncate_count != next->vm_truncate_count) { -			/* -			 * unmap_mapping_range might be in progress: -			 * ensure that the expanding vma is rescanned. -			 */ -			importer->vm_truncate_count = 0; +			uprobe_munmap(vma, vma->vm_start, vma->vm_end); + +			if (adjust_next) +				uprobe_munmap(next, next->vm_start, +							next->vm_end);  		} + +		mutex_lock(&mapping->i_mmap_mutex);  		if (insert) { -			insert->vm_truncate_count = vma->vm_truncate_count;  			/* -			 * Put into prio_tree now, so instantiated pages +			 * Put into interval tree now, so instantiated pages  			 * are visible to arm/parisc __flush_dcache_page  			 * throughout; but we cannot insert into address  			 * space until vma start or end is updated. @@ -588,26 +776,35 @@ again:			remove_next = 1 + (end > next->vm_end);  		}  	} -	/* -	 * When changing only vma->vm_end, we don't really need anon_vma -	 * lock. This is a fairly rare case by itself, but the anon_vma -	 * lock may be shared between many sibling processes.  Skipping -	 * the lock for brk adjustments makes a difference sometimes. 
-	 */ -	if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { -		anon_vma = vma->anon_vma; -		anon_vma_lock(anon_vma); +	vma_adjust_trans_huge(vma, start, end, adjust_next); + +	anon_vma = vma->anon_vma; +	if (!anon_vma && adjust_next) +		anon_vma = next->anon_vma; +	if (anon_vma) { +		VM_BUG_ON(adjust_next && next->anon_vma && +			  anon_vma != next->anon_vma); +		anon_vma_lock_write(anon_vma); +		anon_vma_interval_tree_pre_update_vma(vma); +		if (adjust_next) +			anon_vma_interval_tree_pre_update_vma(next);  	}  	if (root) {  		flush_dcache_mmap_lock(mapping); -		vma_prio_tree_remove(vma, root); +		vma_interval_tree_remove(vma, root);  		if (adjust_next) -			vma_prio_tree_remove(next, root); +			vma_interval_tree_remove(next, root);  	} -	vma->vm_start = start; -	vma->vm_end = end; +	if (start != vma->vm_start) { +		vma->vm_start = start; +		start_changed = true; +	} +	if (end != vma->vm_end) { +		vma->vm_end = end; +		end_changed = true; +	}  	vma->vm_pgoff = pgoff;  	if (adjust_next) {  		next->vm_start += adjust_next << PAGE_SHIFT; @@ -616,8 +813,8 @@ again:			remove_next = 1 + (end > next->vm_end);  	if (root) {  		if (adjust_next) -			vma_prio_tree_insert(next, root); -		vma_prio_tree_insert(vma, root); +			vma_interval_tree_insert(next, root); +		vma_interval_tree_insert(vma, root);  		flush_dcache_mmap_unlock(mapping);  	} @@ -636,18 +833,37 @@ again:			remove_next = 1 + (end > next->vm_end);  		 * (it may either follow vma or precede it).  		 */  		__insert_vm_struct(mm, insert); +	} else { +		if (start_changed) +			vma_gap_update(vma); +		if (end_changed) { +			if (!next) +				mm->highest_vm_end = end; +			else if (!adjust_next) +				vma_gap_update(next); +		}  	} -	if (anon_vma) -		anon_vma_unlock(anon_vma); +	if (anon_vma) { +		anon_vma_interval_tree_post_update_vma(vma); +		if (adjust_next) +			anon_vma_interval_tree_post_update_vma(next); +		anon_vma_unlock_write(anon_vma); +	}  	if (mapping) -		spin_unlock(&mapping->i_mmap_lock); +		mutex_unlock(&mapping->i_mmap_mutex); + +	if (root) { +		uprobe_mmap(vma); + +		if (adjust_next) +			uprobe_mmap(next); +	}  	if (remove_next) {  		if (file) { +			uprobe_munmap(next, next->vm_start, next->vm_end);  			fput(file); -			if (next->vm_flags & VM_EXECUTABLE) -				removed_exe_file_vma(mm);  		}  		if (next->anon_vma)  			anon_vma_merge(vma, next); @@ -659,11 +875,16 @@ again:			remove_next = 1 + (end > next->vm_end);  		 * we must remove another next too. It would clutter  		 * up the code too much to do both in one go.  		 */ -		if (remove_next == 2) { -			next = vma->vm_next; +		next = vma->vm_next; +		if (remove_next == 2)  			goto again; -		} +		else if (next) +			vma_gap_update(next); +		else +			mm->highest_vm_end = end;  	} +	if (insert && file) +		uprobe_mmap(insert);  	validate_mm(mm); @@ -677,8 +898,15 @@ again:			remove_next = 1 + (end > next->vm_end);  static inline int is_mergeable_vma(struct vm_area_struct *vma,  			struct file *file, unsigned long vm_flags)  { -	/* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ -	if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) +	/* +	 * VM_SOFTDIRTY should not prevent from VMA merging, if we +	 * match the flags but dirty bit -- the caller should mark +	 * merged VMA as dirty. If dirty bit won't be excluded from +	 * comparison, we increase pressue on the memory system forcing +	 * the kernel to generate new VMAs when old one could be +	 * extended instead. 
+	 */ +	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)  		return 0;  	if (vma->vm_file != file)  		return 0; @@ -688,9 +916,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,  }  static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, -					struct anon_vma *anon_vma2) +					struct anon_vma *anon_vma2, +					struct vm_area_struct *vma)  { -	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); +	/* +	 * The list_is_singular() test is to avoid merging VMA cloned from +	 * parents. This can improve scalability caused by anon_vma lock. +	 */ +	if ((!anon_vma1 || !anon_vma2) && (!vma || +		list_is_singular(&vma->anon_vma_chain))) +		return 1; +	return anon_vma1 == anon_vma2;  }  /* @@ -709,7 +945,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,  	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)  {  	if (is_mergeable_vma(vma, file, vm_flags) && -	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {  		if (vma->vm_pgoff == vm_pgoff)  			return 1;  	} @@ -728,9 +964,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,  	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)  {  	if (is_mergeable_vma(vma, file, vm_flags) && -	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {  		pgoff_t vm_pglen; -		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +		vm_pglen = vma_pages(vma);  		if (vma->vm_pgoff + vm_pglen == vm_pgoff)  			return 1;  	} @@ -806,7 +1042,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,  				can_vma_merge_before(next, vm_flags,  					anon_vma, file, pgoff+pglen) &&  				is_mergeable_anon_vma(prev->anon_vma, -						      next->anon_vma)) { +						      next->anon_vma, NULL)) {  							/* cases 1, 6 */  			err = vma_adjust(prev, prev->vm_start,  				next->vm_end, prev->vm_pgoff, NULL); @@ -815,6 +1051,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,  				end, prev->vm_pgoff, NULL);  		if (err)  			return NULL; +		khugepaged_enter_vma_merge(prev);  		return prev;  	} @@ -833,6 +1070,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,  				next->vm_pgoff - pglen, NULL);  		if (err)  			return NULL; +		khugepaged_enter_vma_merge(area);  		return area;  	} @@ -857,7 +1095,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *  	return a->vm_end == b->vm_start &&  		mpol_equal(vma_policy(a), vma_policy(b)) &&  		a->vm_file == b->vm_file && -		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && +		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&  		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);  } @@ -915,14 +1153,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)  	if (anon_vma)  		return anon_vma;  try_prev: -	/* -	 * It is potentially slow to have to call find_vma_prev here. -	 * But it's only on the first write fault on the vma, not -	 * every time, and we could devise a way to avoid it later -	 * (e.g. stash info in next's anon_vma_node when assigning -	 * an anon_vma, or when trying vma_merge).  Another time. 
-	 */ -	BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); +	near = vma->vm_prev;  	if (!near)  		goto none; @@ -948,30 +1179,61 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,  	const unsigned long stack_flags  		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); +	mm->total_vm += pages; +  	if (file) {  		mm->shared_vm += pages;  		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)  			mm->exec_vm += pages;  	} else if (flags & stack_flags)  		mm->stack_vm += pages; -	if (flags & (VM_RESERVED|VM_IO)) -		mm->reserved_vm += pages;  }  #endif /* CONFIG_PROC_FS */  /* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ +	hint &= PAGE_MASK; +	if (((void *)hint != NULL) && +	    (hint < mmap_min_addr)) +		return PAGE_ALIGN(mmap_min_addr); +	return hint; +} + +static inline int mlock_future_check(struct mm_struct *mm, +				     unsigned long flags, +				     unsigned long len) +{ +	unsigned long locked, lock_limit; + +	/*  mlock MCL_FUTURE? */ +	if (flags & VM_LOCKED) { +		locked = len >> PAGE_SHIFT; +		locked += mm->locked_vm; +		lock_limit = rlimit(RLIMIT_MEMLOCK); +		lock_limit >>= PAGE_SHIFT; +		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) +			return -EAGAIN; +	} +	return 0; +} + +/*   * The caller must hold down_write(¤t->mm->mmap_sem).   */  unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,  			unsigned long len, unsigned long prot, -			unsigned long flags, unsigned long pgoff) +			unsigned long flags, unsigned long pgoff, +			unsigned long *populate)  {  	struct mm_struct * mm = current->mm; -	struct inode *inode; -	unsigned int vm_flags; -	int error; -	unsigned long reqprot = prot; +	vm_flags_t vm_flags; + +	*populate = 0;  	/*  	 * Does the application expect PROT_READ to imply PROT_EXEC? @@ -1020,20 +1282,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,  		if (!can_do_mlock())  			return -EPERM; -	/* mlock MCL_FUTURE? */ -	if (vm_flags & VM_LOCKED) { -		unsigned long locked, lock_limit; -		locked = len >> PAGE_SHIFT; -		locked += mm->locked_vm; -		lock_limit = rlimit(RLIMIT_MEMLOCK); -		lock_limit >>= PAGE_SHIFT; -		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) -			return -EAGAIN; -	} - -	inode = file ? file->f_path.dentry->d_inode : NULL; +	if (mlock_future_check(mm, vm_flags, len)) +		return -EAGAIN;  	if (file) { +		struct inode *inode = file_inode(file); +  		switch (flags & MAP_TYPE) {  		case MAP_SHARED:  			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) @@ -1049,7 +1303,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,  			/*  			 * Make sure there are no mandatory locks on the file.  			 */ -			if (locks_verify_locked(inode)) +			if (locks_verify_locked(file))  				return -EAGAIN;  			vm_flags |= VM_SHARED | VM_MAYSHARE; @@ -1066,8 +1320,10 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,  				vm_flags &= ~VM_MAYEXEC;  			} -			if (!file->f_op || !file->f_op->mmap) +			if (!file->f_op->mmap)  				return -ENODEV; +			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) +				return -EINVAL;  			break;  		default: @@ -1076,6 +1332,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,  	} else {  		switch (flags & MAP_TYPE) {  		case MAP_SHARED: +			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) +				return -EINVAL;  			/*  			 * Ignore pgoff.  			 
*/ @@ -1093,13 +1351,27 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,  		}  	} -	error = security_file_mmap(file, reqprot, prot, flags, addr, 0); -	if (error) -		return error; +	/* +	 * Set 'VM_NORESERVE' if we should not account for the +	 * memory use of this mapping. +	 */ +	if (flags & MAP_NORESERVE) { +		/* We honor MAP_NORESERVE if allowed to overcommit */ +		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) +			vm_flags |= VM_NORESERVE; -	return mmap_region(file, addr, len, flags, vm_flags, pgoff); +		/* hugetlb applies strict overcommit unless MAP_NORESERVE */ +		if (file && is_file_hugepages(file)) +			vm_flags |= VM_NORESERVE; +	} + +	addr = mmap_region(file, addr, len, vm_flags, pgoff); +	if (!IS_ERR_VALUE(addr) && +	    ((vm_flags & VM_LOCKED) || +	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) +		*populate = len; +	return addr;  } -EXPORT_SYMBOL(do_mmap_pgoff);  SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,  		unsigned long, prot, unsigned long, flags, @@ -1110,32 +1382,41 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,  	if (!(flags & MAP_ANONYMOUS)) {  		audit_mmap_fd(fd, flags); -		if (unlikely(flags & MAP_HUGETLB)) -			return -EINVAL;  		file = fget(fd);  		if (!file)  			goto out; +		if (is_file_hugepages(file)) +			len = ALIGN(len, huge_page_size(hstate_file(file))); +		retval = -EINVAL; +		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) +			goto out_fput;  	} else if (flags & MAP_HUGETLB) {  		struct user_struct *user = NULL; +		struct hstate *hs; + +		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); +		if (!hs) +			return -EINVAL; + +		len = ALIGN(len, huge_page_size(hs));  		/*  		 * VM_NORESERVE is used because the reservations will be  		 * taken when vm_ops->mmap() is called  		 * A dummy user value is used because we are not locking  		 * memory so no accounting is necessary  		 */ -		len = ALIGN(len, huge_page_size(&default_hstate)); -		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, -						&user, HUGETLB_ANONHUGE_INODE); +		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, +				VM_NORESERVE, +				&user, HUGETLB_ANONHUGE_INODE, +				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);  		if (IS_ERR(file))  			return PTR_ERR(file);  	}  	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); -	down_write(¤t->mm->mmap_sem); -	retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); -	up_write(¤t->mm->mmap_sem); - +	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput:  	if (file)  		fput(file);  out: @@ -1174,7 +1455,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)   */  int vma_wants_writenotify(struct vm_area_struct *vma)  { -	unsigned int vm_flags = vma->vm_flags; +	vm_flags_t vm_flags = vma->vm_flags;  	/* If it was private or non-writable, the write bit is already clear */  	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) @@ -1190,7 +1471,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)  		return 0;  	/* Specialty mapping? */ -	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) +	if (vm_flags & VM_PFNMAP)  		return 0;  	/* Can the mapping track the dirty pages? */ @@ -1202,7 +1483,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)   * We account for memory if it's a private writeable mapping,   * not hugepages and VM_NORESERVE wasn't set.   
*/ -static inline int accountable_mapping(struct file *file, unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)  {  	/*  	 * hugetlb has its own accounting separate from the core VM @@ -1215,51 +1496,46 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)  }  unsigned long mmap_region(struct file *file, unsigned long addr, -			  unsigned long len, unsigned long flags, -			  unsigned int vm_flags, unsigned long pgoff) +		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)  {  	struct mm_struct *mm = current->mm;  	struct vm_area_struct *vma, *prev; -	int correct_wcount = 0;  	int error;  	struct rb_node **rb_link, *rb_parent;  	unsigned long charged = 0; -	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL; + +	/* Check against address space limit. */ +	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { +		unsigned long nr_pages; + +		/* +		 * MAP_FIXED may remove pages of mappings that intersects with +		 * requested mapping. Account for the pages it would unmap. +		 */ +		if (!(vm_flags & MAP_FIXED)) +			return -ENOMEM; + +		nr_pages = count_vma_pages_range(mm, addr, addr + len); + +		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) +			return -ENOMEM; +	}  	/* Clear old maps */  	error = -ENOMEM;  munmap_back: -	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); -	if (vma && vma->vm_start < addr + len) { +	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {  		if (do_munmap(mm, addr, len))  			return -ENOMEM;  		goto munmap_back;  	} -	/* Check against address space limit. */ -	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) -		return -ENOMEM; - -	/* -	 * Set 'VM_NORESERVE' if we should not account for the -	 * memory use of this mapping. -	 */ -	if ((flags & MAP_NORESERVE)) { -		/* We honor MAP_NORESERVE if allowed to overcommit */ -		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) -			vm_flags |= VM_NORESERVE; - -		/* hugetlb applies strict overcommit unless MAP_NORESERVE */ -		if (file && is_file_hugepages(file)) -			vm_flags |= VM_NORESERVE; -	} -  	/*  	 * Private writable mapping: check memory availability  	 */  	if (accountable_mapping(file, vm_flags)) {  		charged = len >> PAGE_SHIFT; -		if (security_vm_enough_memory(charged)) +		if (security_vm_enough_memory_mm(mm, charged))  			return -ENOMEM;  		vm_flags |= VM_ACCOUNT;  	} @@ -1291,30 +1567,26 @@ munmap_back:  	INIT_LIST_HEAD(&vma->anon_vma_chain);  	if (file) { -		error = -EINVAL; -		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) -			goto free_vma;  		if (vm_flags & VM_DENYWRITE) {  			error = deny_write_access(file);  			if (error)  				goto free_vma; -			correct_wcount = 1;  		} -		vma->vm_file = file; -		get_file(file); +		vma->vm_file = get_file(file);  		error = file->f_op->mmap(file, vma);  		if (error)  			goto unmap_and_free_vma; -		if (vm_flags & VM_EXECUTABLE) -			added_exe_file_vma(mm);  		/* Can addr have changed??  		 *  		 * Answer: Yes, several device drivers can do it in their  		 *         f_op->mmap method. 
-DaveM +		 * Bug: If addr is changed, prev, rb_link, rb_parent should +		 *      be updated for vma_link()  		 */ +		WARN_ON_ONCE(addr != vma->vm_start); +  		addr = vma->vm_start; -		pgoff = vma->vm_pgoff;  		vm_flags = vma->vm_flags;  	} else if (vm_flags & VM_SHARED) {  		error = shmem_zero_setup(vma); @@ -1338,26 +1610,39 @@ munmap_back:  	}  	vma_link(mm, vma, prev, rb_link, rb_parent); -	file = vma->vm_file; -  	/* Once vma denies write, undo our temporary denial count */ -	if (correct_wcount) -		atomic_inc(&inode->i_writecount); +	if (vm_flags & VM_DENYWRITE) +		allow_write_access(file); +	file = vma->vm_file;  out:  	perf_event_mmap(vma); -	mm->total_vm += len >> PAGE_SHIFT;  	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);  	if (vm_flags & VM_LOCKED) { -		if (!mlock_vma_pages_range(vma, addr, addr + len)) +		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || +					vma == get_gate_vma(current->mm)))  			mm->locked_vm += (len >> PAGE_SHIFT); -	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) -		make_pages_present(addr, addr + len); +		else +			vma->vm_flags &= ~VM_LOCKED; +	} + +	if (file) +		uprobe_mmap(vma); + +	/* +	 * New (or expanded) vma always get soft dirty status. +	 * Otherwise user-space soft-dirty page tracker won't +	 * be able to distinguish situation when vma area unmapped, +	 * then new mapped in-place (which must be aimed as +	 * a completely new data area). +	 */ +	vma->vm_flags |= VM_SOFTDIRTY; +  	return addr;  unmap_and_free_vma: -	if (correct_wcount) -		atomic_inc(&inode->i_writecount); +	if (vm_flags & VM_DENYWRITE) +		allow_write_access(file);  	vma->vm_file = NULL;  	fput(file); @@ -1372,6 +1657,206 @@ unacct_error:  	return error;  } +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ +	/* +	 * We implement the search by looking for an rbtree node that +	 * immediately follows a suitable gap. That is, +	 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; +	 * - gap_end   = vma->vm_start        >= info->low_limit  + length; +	 * - gap_end - gap_start >= length +	 */ + +	struct mm_struct *mm = current->mm; +	struct vm_area_struct *vma; +	unsigned long length, low_limit, high_limit, gap_start, gap_end; + +	/* Adjust search length to account for worst case alignment overhead */ +	length = info->length + info->align_mask; +	if (length < info->length) +		return -ENOMEM; + +	/* Adjust search limits by the desired length */ +	if (info->high_limit < length) +		return -ENOMEM; +	high_limit = info->high_limit - length; + +	if (info->low_limit > high_limit) +		return -ENOMEM; +	low_limit = info->low_limit + length; + +	/* Check if rbtree root looks promising */ +	if (RB_EMPTY_ROOT(&mm->mm_rb)) +		goto check_highest; +	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); +	if (vma->rb_subtree_gap < length) +		goto check_highest; + +	while (true) { +		/* Visit left subtree if it looks promising */ +		gap_end = vma->vm_start; +		if (gap_end >= low_limit && vma->vm_rb.rb_left) { +			struct vm_area_struct *left = +				rb_entry(vma->vm_rb.rb_left, +					 struct vm_area_struct, vm_rb); +			if (left->rb_subtree_gap >= length) { +				vma = left; +				continue; +			} +		} + +		gap_start = vma->vm_prev ? 
vma->vm_prev->vm_end : 0; +check_current: +		/* Check if current node has a suitable gap */ +		if (gap_start > high_limit) +			return -ENOMEM; +		if (gap_end >= low_limit && gap_end - gap_start >= length) +			goto found; + +		/* Visit right subtree if it looks promising */ +		if (vma->vm_rb.rb_right) { +			struct vm_area_struct *right = +				rb_entry(vma->vm_rb.rb_right, +					 struct vm_area_struct, vm_rb); +			if (right->rb_subtree_gap >= length) { +				vma = right; +				continue; +			} +		} + +		/* Go back up the rbtree to find next candidate node */ +		while (true) { +			struct rb_node *prev = &vma->vm_rb; +			if (!rb_parent(prev)) +				goto check_highest; +			vma = rb_entry(rb_parent(prev), +				       struct vm_area_struct, vm_rb); +			if (prev == vma->vm_rb.rb_left) { +				gap_start = vma->vm_prev->vm_end; +				gap_end = vma->vm_start; +				goto check_current; +			} +		} +	} + +check_highest: +	/* Check highest gap, which does not precede any rbtree node */ +	gap_start = mm->highest_vm_end; +	gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */ +	if (gap_start > high_limit) +		return -ENOMEM; + +found: +	/* We found a suitable gap. Clip it with the original low_limit. */ +	if (gap_start < info->low_limit) +		gap_start = info->low_limit; + +	/* Adjust gap address to the desired alignment */ +	gap_start += (info->align_offset - gap_start) & info->align_mask; + +	VM_BUG_ON(gap_start + info->length > info->high_limit); +	VM_BUG_ON(gap_start + info->length > gap_end); +	return gap_start; +} + +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ +	struct mm_struct *mm = current->mm; +	struct vm_area_struct *vma; +	unsigned long length, low_limit, high_limit, gap_start, gap_end; + +	/* Adjust search length to account for worst case alignment overhead */ +	length = info->length + info->align_mask; +	if (length < info->length) +		return -ENOMEM; + +	/* +	 * Adjust search limits by the desired length. +	 * See implementation comment at top of unmapped_area(). +	 */ +	gap_end = info->high_limit; +	if (gap_end < length) +		return -ENOMEM; +	high_limit = gap_end - length; + +	if (info->low_limit > high_limit) +		return -ENOMEM; +	low_limit = info->low_limit + length; + +	/* Check highest gap, which does not precede any rbtree node */ +	gap_start = mm->highest_vm_end; +	if (gap_start <= high_limit) +		goto found_highest; + +	/* Check if rbtree root looks promising */ +	if (RB_EMPTY_ROOT(&mm->mm_rb)) +		return -ENOMEM; +	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); +	if (vma->rb_subtree_gap < length) +		return -ENOMEM; + +	while (true) { +		/* Visit right subtree if it looks promising */ +		gap_start = vma->vm_prev ? 
vma->vm_prev->vm_end : 0; +		if (gap_start <= high_limit && vma->vm_rb.rb_right) { +			struct vm_area_struct *right = +				rb_entry(vma->vm_rb.rb_right, +					 struct vm_area_struct, vm_rb); +			if (right->rb_subtree_gap >= length) { +				vma = right; +				continue; +			} +		} + +check_current: +		/* Check if current node has a suitable gap */ +		gap_end = vma->vm_start; +		if (gap_end < low_limit) +			return -ENOMEM; +		if (gap_start <= high_limit && gap_end - gap_start >= length) +			goto found; + +		/* Visit left subtree if it looks promising */ +		if (vma->vm_rb.rb_left) { +			struct vm_area_struct *left = +				rb_entry(vma->vm_rb.rb_left, +					 struct vm_area_struct, vm_rb); +			if (left->rb_subtree_gap >= length) { +				vma = left; +				continue; +			} +		} + +		/* Go back up the rbtree to find next candidate node */ +		while (true) { +			struct rb_node *prev = &vma->vm_rb; +			if (!rb_parent(prev)) +				return -ENOMEM; +			vma = rb_entry(rb_parent(prev), +				       struct vm_area_struct, vm_rb); +			if (prev == vma->vm_rb.rb_right) { +				gap_start = vma->vm_prev ? +					vma->vm_prev->vm_end : 0; +				goto check_current; +			} +		} +	} + +found: +	/* We found a suitable gap. Clip it with the original high_limit. */ +	if (gap_end > info->high_limit) +		gap_end = info->high_limit; + +found_highest: +	/* Compute highest gap address at the desired alignment */ +	gap_end -= info->length; +	gap_end -= (gap_end - info->align_offset) & info->align_mask; + +	VM_BUG_ON(gap_end < info->low_limit); +	VM_BUG_ON(gap_end < gap_start); +	return gap_end; +} +  /* Get an address range which is currently unmapped.   * For shmat() with addr=0.   * @@ -1390,9 +1875,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,  {  	struct mm_struct *mm = current->mm;  	struct vm_area_struct *vma; -	unsigned long start_addr; +	struct vm_unmapped_area_info info; -	if (len > TASK_SIZE) +	if (len > TASK_SIZE - mmap_min_addr)  		return -ENOMEM;  	if (flags & MAP_FIXED) @@ -1401,58 +1886,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,  	if (addr) {  		addr = PAGE_ALIGN(addr);  		vma = find_vma(mm, addr); -		if (TASK_SIZE - len >= addr && +		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&  		    (!vma || addr + len <= vma->vm_start))  			return addr;  	} -	if (len > mm->cached_hole_size) { -	        start_addr = addr = mm->free_area_cache; -	} else { -	        start_addr = addr = TASK_UNMAPPED_BASE; -	        mm->cached_hole_size = 0; -	} -full_search: -	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { -		/* At this point:  (!vma || addr < vma->vm_end). */ -		if (TASK_SIZE - len < addr) { -			/* -			 * Start a new search - just in case we missed -			 * some holes. -			 */ -			if (start_addr != TASK_UNMAPPED_BASE) { -				addr = TASK_UNMAPPED_BASE; -			        start_addr = addr; -				mm->cached_hole_size = 0; -				goto full_search; -			} -			return -ENOMEM; -		} -		if (!vma || addr + len <= vma->vm_start) { -			/* -			 * Remember the place where we stopped the search: -			 */ -			mm->free_area_cache = addr + len; -			return addr; -		} -		if (addr + mm->cached_hole_size < vma->vm_start) -		        mm->cached_hole_size = vma->vm_start - addr; -		addr = vma->vm_end; -	} +	info.flags = 0; +	info.length = len; +	info.low_limit = mm->mmap_base; +	info.high_limit = TASK_SIZE; +	info.align_mask = 0; +	return vm_unmapped_area(&info);  }  #endif	 -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -	/* -	 * Is this a new hole at the lowest possible address? 
-	 */ -	if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { -		mm->free_area_cache = addr; -		mm->cached_hole_size = ~0UL; -	} -} -  /*   * This mmap-allocator allocates new areas top-down from below the   * stack's low limit (the base): @@ -1466,9 +1913,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  	struct vm_area_struct *vma;  	struct mm_struct *mm = current->mm;  	unsigned long addr = addr0; +	struct vm_unmapped_area_info info;  	/* requested length too big for entire address space */ -	if (len > TASK_SIZE) +	if (len > TASK_SIZE - mmap_min_addr)  		return -ENOMEM;  	if (flags & MAP_FIXED) @@ -1478,85 +1926,36 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  	if (addr) {  		addr = PAGE_ALIGN(addr);  		vma = find_vma(mm, addr); -		if (TASK_SIZE - len >= addr && +		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&  				(!vma || addr + len <= vma->vm_start))  			return addr;  	} -	/* check if free_area_cache is useful for us */ -	if (len <= mm->cached_hole_size) { - 	        mm->cached_hole_size = 0; - 		mm->free_area_cache = mm->mmap_base; - 	} - -	/* either no address requested or can't fit in requested address hole */ -	addr = mm->free_area_cache; - -	/* make sure it can fit in the remaining address space */ -	if (addr > len) { -		vma = find_vma(mm, addr-len); -		if (!vma || addr <= vma->vm_start) -			/* remember the address as a hint for next time */ -			return (mm->free_area_cache = addr-len); -	} - -	if (mm->mmap_base < len) -		goto bottomup; - -	addr = mm->mmap_base-len; +	info.flags = VM_UNMAPPED_AREA_TOPDOWN; +	info.length = len; +	info.low_limit = max(PAGE_SIZE, mmap_min_addr); +	info.high_limit = mm->mmap_base; +	info.align_mask = 0; +	addr = vm_unmapped_area(&info); -	do { -		/* -		 * Lookup failure means no vma is above this address, -		 * else if new region fits below vma->vm_start, -		 * return with success: -		 */ -		vma = find_vma(mm, addr); -		if (!vma || addr+len <= vma->vm_start) -			/* remember the address as a hint for next time */ -			return (mm->free_area_cache = addr); - - 		/* remember the largest hole we saw so far */ - 		if (addr + mm->cached_hole_size < vma->vm_start) - 		        mm->cached_hole_size = vma->vm_start - addr; - -		/* try just below the current vma->vm_start */ -		addr = vma->vm_start-len; -	} while (len < vma->vm_start); - -bottomup:  	/*  	 * A failed mmap() very likely causes application failure,  	 * so fall back to the bottom-up function here. This scenario  	 * can happen with large stack limits and large mmap()  	 * allocations.  	 */ -	mm->cached_hole_size = ~0UL; -  	mm->free_area_cache = TASK_UNMAPPED_BASE; -	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -	/* -	 * Restore the topdown base: -	 */ -	mm->free_area_cache = mm->mmap_base; -	mm->cached_hole_size = ~0UL; +	if (addr & ~PAGE_MASK) { +		VM_BUG_ON(addr != -ENOMEM); +		info.flags = 0; +		info.low_limit = TASK_UNMAPPED_BASE; +		info.high_limit = TASK_SIZE; +		addr = vm_unmapped_area(&info); +	}  	return addr;  }  #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ -	/* -	 * Is this a new hole at the highest possible address? 
-	 */ -	if (addr > mm->free_area_cache) -		mm->free_area_cache = addr; - -	/* dont allow allocations above current base */ -	if (mm->free_area_cache > mm->mmap_base) -		mm->free_area_cache = mm->mmap_base; -} -  unsigned long  get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,  		unsigned long pgoff, unsigned long flags) @@ -1573,7 +1972,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,  		return -ENOMEM;  	get_area = current->mm->get_unmapped_area; -	if (file && file->f_op && file->f_op->get_unmapped_area) +	if (file && file->f_op->get_unmapped_area)  		get_area = file->f_op->get_unmapped_area;  	addr = get_area(file, addr, len, pgoff, flags);  	if (IS_ERR_VALUE(addr)) @@ -1584,7 +1983,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,  	if (addr & ~PAGE_MASK)  		return -EINVAL; -	return arch_rebalance_pgtables(addr, len); +	addr = arch_rebalance_pgtables(addr, len); +	error = security_mmap_addr(addr); +	return error ? error : addr;  }  EXPORT_SYMBOL(get_unmapped_area); @@ -1592,74 +1993,59 @@ EXPORT_SYMBOL(get_unmapped_area);  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)  { -	struct vm_area_struct *vma = NULL; +	struct rb_node *rb_node; +	struct vm_area_struct *vma; -	if (mm) { -		/* Check the cache first. */ -		/* (Cache hit rate is typically around 35%.) */ -		vma = mm->mmap_cache; -		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { -			struct rb_node * rb_node; - -			rb_node = mm->mm_rb.rb_node; -			vma = NULL; - -			while (rb_node) { -				struct vm_area_struct * vma_tmp; - -				vma_tmp = rb_entry(rb_node, -						struct vm_area_struct, vm_rb); - -				if (vma_tmp->vm_end > addr) { -					vma = vma_tmp; -					if (vma_tmp->vm_start <= addr) -						break; -					rb_node = rb_node->rb_left; -				} else -					rb_node = rb_node->rb_right; -			} -			if (vma) -				mm->mmap_cache = vma; -		} +	/* Check the cache first. */ +	vma = vmacache_find(mm, addr); +	if (likely(vma)) +		return vma; + +	rb_node = mm->mm_rb.rb_node; +	vma = NULL; + +	while (rb_node) { +		struct vm_area_struct *tmp; + +		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + +		if (tmp->vm_end > addr) { +			vma = tmp; +			if (tmp->vm_start <= addr) +				break; +			rb_node = rb_node->rb_left; +		} else +			rb_node = rb_node->rb_right;  	} + +	if (vma) +		vmacache_update(addr, vma);  	return vma;  }  EXPORT_SYMBOL(find_vma); -/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +/* + * Same as find_vma, but also return a pointer to the previous VMA in *pprev. + */  struct vm_area_struct *  find_vma_prev(struct mm_struct *mm, unsigned long addr,  			struct vm_area_struct **pprev)  { -	struct vm_area_struct *vma = NULL, *prev = NULL; -	struct rb_node *rb_node; -	if (!mm) -		goto out; - -	/* Guard against addr being lower than the first VMA */ -	vma = mm->mmap; - -	/* Go through the RB tree quickly. 
*/ -	rb_node = mm->mm_rb.rb_node; - -	while (rb_node) { -		struct vm_area_struct *vma_tmp; -		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); +	struct vm_area_struct *vma; -		if (addr < vma_tmp->vm_end) { -			rb_node = rb_node->rb_left; -		} else { -			prev = vma_tmp; -			if (!prev->vm_next || (addr < prev->vm_next->vm_end)) -				break; +	vma = find_vma(mm, addr); +	if (vma) { +		*pprev = vma->vm_prev; +	} else { +		struct rb_node *rb_node = mm->mm_rb.rb_node; +		*pprev = NULL; +		while (rb_node) { +			*pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);  			rb_node = rb_node->rb_right;  		}  	} - -out: -	*pprev = prev; -	return prev ? prev->vm_next : vma; +	return vma;  }  /* @@ -1706,7 +2092,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns  		return -ENOMEM;  	/* Ok, everything looks good - let it rip */ -	mm->total_vm += grow;  	if (vma->vm_flags & VM_LOCKED)  		mm->locked_vm += grow;  	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); @@ -1754,13 +2139,38 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)  		size = address - vma->vm_start;  		grow = (address - vma->vm_end) >> PAGE_SHIFT; -		error = acct_stack_growth(vma, size, grow); -		if (!error) { -			vma->vm_end = address; -			perf_event_mmap(vma); +		error = -ENOMEM; +		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { +			error = acct_stack_growth(vma, size, grow); +			if (!error) { +				/* +				 * vma_gap_update() doesn't support concurrent +				 * updates, but we only hold a shared mmap_sem +				 * lock here, so we need to protect against +				 * concurrent vma expansions. +				 * vma_lock_anon_vma() doesn't help here, as +				 * we don't guarantee that all growable vmas +				 * in a mm share the same root anon vma. +				 * So, we reuse mm->page_table_lock to guard +				 * against concurrent vma expansions. +				 */ +				spin_lock(&vma->vm_mm->page_table_lock); +				anon_vma_interval_tree_pre_update_vma(vma); +				vma->vm_end = address; +				anon_vma_interval_tree_post_update_vma(vma); +				if (vma->vm_next) +					vma_gap_update(vma->vm_next); +				else +					vma->vm_mm->highest_vm_end = address; +				spin_unlock(&vma->vm_mm->page_table_lock); + +				perf_event_mmap(vma); +			}  		}  	}  	vma_unlock_anon_vma(vma); +	khugepaged_enter_vma_merge(vma); +	validate_mm(vma->vm_mm);  	return error;  }  #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1768,7 +2178,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)  /*   * vma is the first one with address < vma->vm_start.  Have to extend vma.   
*/ -static int expand_downwards(struct vm_area_struct *vma, +int expand_downwards(struct vm_area_struct *vma,  				   unsigned long address)  {  	int error; @@ -1781,7 +2191,7 @@ static int expand_downwards(struct vm_area_struct *vma,  		return -ENOMEM;  	address &= PAGE_MASK; -	error = security_file_mmap(NULL, 0, 0, 0, address, 1); +	error = security_mmap_addr(address);  	if (error)  		return error; @@ -1800,25 +2210,61 @@ static int expand_downwards(struct vm_area_struct *vma,  		size = vma->vm_end - address;  		grow = (vma->vm_start - address) >> PAGE_SHIFT; -		error = acct_stack_growth(vma, size, grow); -		if (!error) { -			vma->vm_start = address; -			vma->vm_pgoff -= grow; -			perf_event_mmap(vma); +		error = -ENOMEM; +		if (grow <= vma->vm_pgoff) { +			error = acct_stack_growth(vma, size, grow); +			if (!error) { +				/* +				 * vma_gap_update() doesn't support concurrent +				 * updates, but we only hold a shared mmap_sem +				 * lock here, so we need to protect against +				 * concurrent vma expansions. +				 * vma_lock_anon_vma() doesn't help here, as +				 * we don't guarantee that all growable vmas +				 * in a mm share the same root anon vma. +				 * So, we reuse mm->page_table_lock to guard +				 * against concurrent vma expansions. +				 */ +				spin_lock(&vma->vm_mm->page_table_lock); +				anon_vma_interval_tree_pre_update_vma(vma); +				vma->vm_start = address; +				vma->vm_pgoff -= grow; +				anon_vma_interval_tree_post_update_vma(vma); +				vma_gap_update(vma); +				spin_unlock(&vma->vm_mm->page_table_lock); + +				perf_event_mmap(vma); +			}  		}  	}  	vma_unlock_anon_vma(vma); +	khugepaged_enter_vma_merge(vma); +	validate_mm(vma->vm_mm);  	return error;  } -int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) -{ -	return expand_downwards(vma, address); -} - +/* + * Note how expand_stack() refuses to expand the stack all the way to + * abut the next virtual mapping, *unless* that mapping itself is also + * a stack mapping. We want to leave room for a guard page, after all + * (the guard page itself is not added here, that is done by the + * actual page faulting logic) + * + * This matches the behavior of the guard page logic (see mm/memory.c: + * check_stack_guard_page()), which only allows the guard page to be + * removed under these circumstances. 
+ */  #ifdef CONFIG_STACK_GROWSUP  int expand_stack(struct vm_area_struct *vma, unsigned long address)  { +	struct vm_area_struct *next; + +	address &= PAGE_MASK; +	next = vma->vm_next; +	if (next && next->vm_start == address + PAGE_SIZE) { +		if (!(next->vm_flags & VM_GROWSUP)) +			return -ENOMEM; +	}  	return expand_upwards(vma, address);  } @@ -1833,14 +2279,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)  		return vma;  	if (!prev || expand_stack(prev, addr))  		return NULL; -	if (prev->vm_flags & VM_LOCKED) { -		mlock_vma_pages_range(prev, addr, prev->vm_end); -	} +	if (prev->vm_flags & VM_LOCKED) +		__mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);  	return prev;  }  #else  int expand_stack(struct vm_area_struct *vma, unsigned long address)  { +	struct vm_area_struct *prev; + +	address &= PAGE_MASK; +	prev = vma->vm_prev; +	if (prev && prev->vm_end == address) { +		if (!(prev->vm_flags & VM_GROWSDOWN)) +			return -ENOMEM; +	}  	return expand_downwards(vma, address);  } @@ -1861,9 +2314,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)  	start = vma->vm_start;  	if (expand_stack(vma, addr))  		return NULL; -	if (vma->vm_flags & VM_LOCKED) { -		mlock_vma_pages_range(vma, addr, start); -	} +	if (vma->vm_flags & VM_LOCKED) +		__mlock_vma_pages_range(vma, addr, start, NULL);  	return vma;  }  #endif @@ -1876,15 +2328,19 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)   */  static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)  { +	unsigned long nr_accounted = 0; +  	/* Update high watermark before we lower total_vm */  	update_hiwater_vm(mm);  	do {  		long nrpages = vma_pages(vma); -		mm->total_vm -= nrpages; +		if (vma->vm_flags & VM_ACCOUNT) +			nr_accounted += nrpages;  		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);  		vma = remove_vma(vma);  	} while (vma); +	vm_unacct_memory(nr_accounted);  	validate_mm(mm);  } @@ -1898,17 +2354,15 @@ static void unmap_region(struct mm_struct *mm,  		unsigned long start, unsigned long end)  {  	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; -	struct mmu_gather *tlb; -	unsigned long nr_accounted = 0; +	struct mmu_gather tlb;  	lru_add_drain(); -	tlb = tlb_gather_mmu(mm, 0); +	tlb_gather_mmu(&tlb, mm, start, end);  	update_hiwater_rss(mm); -	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); -	vm_unacct_memory(nr_accounted); -	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, -				 next? next->vm_start: 0); -	tlb_finish_mmu(tlb, start, end); +	unmap_vmas(&tlb, vma, start, end); +	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, +				 next ? next->vm_start : USER_PGTABLES_CEILING); +	tlb_finish_mmu(&tlb, start, end);  }  /* @@ -1921,26 +2375,25 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,  {  	struct vm_area_struct **insertion_point;  	struct vm_area_struct *tail_vma = NULL; -	unsigned long addr;  	insertion_point = (prev ? &prev->vm_next : &mm->mmap);  	vma->vm_prev = NULL;  	do { -		rb_erase(&vma->vm_rb, &mm->mm_rb); +		vma_rb_erase(vma, &mm->mm_rb);  		mm->map_count--;  		tail_vma = vma;  		vma = vma->vm_next;  	} while (vma && vma->vm_start < end);  	*insertion_point = vma; -	if (vma) +	if (vma) {  		vma->vm_prev = prev; +		vma_gap_update(vma); +	} else +		mm->highest_vm_end = prev ? prev->vm_end : 0;  	tail_vma->vm_next = NULL; -	if (mm->unmap_area == arch_unmap_area) -		addr = prev ? prev->vm_end : mm->mmap_base; -	else -		addr = vma ?  
vma->vm_start : mm->mmap_base; -	mm->unmap_area(mm, addr); -	mm->mmap_cache = NULL;		/* Kill the cache. */ + +	/* Kill the cache */ +	vmacache_invalidate(mm);  }  /* @@ -1950,7 +2403,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,  static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,  	      unsigned long addr, int new_below)  { -	struct mempolicy *pol;  	struct vm_area_struct *new;  	int err = -ENOMEM; @@ -1974,21 +2426,15 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,  		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);  	} -	pol = mpol_dup(vma_policy(vma)); -	if (IS_ERR(pol)) { -		err = PTR_ERR(pol); +	err = vma_dup_policy(vma, new); +	if (err)  		goto out_free_vma; -	} -	vma_set_policy(new, pol);  	if (anon_vma_clone(new, vma))  		goto out_free_mpol; -	if (new->vm_file) { +	if (new->vm_file)  		get_file(new->vm_file); -		if (vma->vm_flags & VM_EXECUTABLE) -			added_exe_file_vma(mm); -	}  	if (new->vm_ops && new->vm_ops->open)  		new->vm_ops->open(new); @@ -2006,14 +2452,11 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,  	/* Clean everything up if vma_adjust failed. */  	if (new->vm_ops && new->vm_ops->close)  		new->vm_ops->close(new); -	if (new->vm_file) { -		if (vma->vm_flags & VM_EXECUTABLE) -			removed_exe_file_vma(mm); +	if (new->vm_file)  		fput(new->vm_file); -	}  	unlink_anon_vmas(new);   out_free_mpol: -	mpol_put(pol); +	mpol_put(vma_policy(new));   out_free_vma:  	kmem_cache_free(vm_area_cachep, new);   out_err: @@ -2050,9 +2493,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)  		return -EINVAL;  	/* Find the first overlapping VMA */ -	vma = find_vma_prev(mm, start, &prev); +	vma = find_vma(mm, start);  	if (!vma)  		return 0; +	prev = vma->vm_prev;  	/* we have  start < vma->vm_end  */  	/* if it doesn't overlap, we have nothing.. */ @@ -2119,20 +2563,23 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)  	return 0;  } -EXPORT_SYMBOL(do_munmap); - -SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +int vm_munmap(unsigned long start, size_t len)  {  	int ret;  	struct mm_struct *mm = current->mm; -	profile_munmap(addr); -  	down_write(&mm->mmap_sem); -	ret = do_munmap(mm, addr, len); +	ret = do_munmap(mm, start, len);  	up_write(&mm->mmap_sem);  	return ret;  } +EXPORT_SYMBOL(vm_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ +	profile_munmap(addr); +	return vm_munmap(addr, len); +}  static inline void verify_mm_writelocked(struct mm_struct *mm)  { @@ -2149,7 +2596,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)   *  anonymous maps.  eventually we may be able to do some   *  brk-specific accounting here.   */ -unsigned long do_brk(unsigned long addr, unsigned long len) +static unsigned long do_brk(unsigned long addr, unsigned long len)  {  	struct mm_struct * mm = current->mm;  	struct vm_area_struct * vma, * prev; @@ -2162,28 +2609,15 @@ unsigned long do_brk(unsigned long addr, unsigned long len)  	if (!len)  		return addr; -	error = security_file_mmap(NULL, 0, 0, 0, addr, 1); -	if (error) -		return error; -  	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;  	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);  	if (error & ~PAGE_MASK)  		return error; -	/* -	 * mlock MCL_FUTURE? 
-	 */ -	if (mm->def_flags & VM_LOCKED) { -		unsigned long locked, lock_limit; -		locked = len >> PAGE_SHIFT; -		locked += mm->locked_vm; -		lock_limit = rlimit(RLIMIT_MEMLOCK); -		lock_limit >>= PAGE_SHIFT; -		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) -			return -EAGAIN; -	} +	error = mlock_future_check(mm, mm->def_flags, len); +	if (error) +		return error;  	/*  	 * mm->mmap_sem is required to protect against another thread @@ -2195,8 +2629,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)  	 * Clear old maps.  this also does some error checking for us  	 */   munmap_back: -	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); -	if (vma && vma->vm_start < addr + len) { +	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {  		if (do_munmap(mm, addr, len))  			return -ENOMEM;  		goto munmap_back; @@ -2209,7 +2642,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)  	if (mm->map_count > sysctl_max_map_count)  		return -ENOMEM; -	if (security_vm_enough_memory(len >> PAGE_SHIFT)) +	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))  		return -ENOMEM;  	/* Can we just expand an old private anonymous mapping? */ @@ -2238,22 +2671,34 @@ unsigned long do_brk(unsigned long addr, unsigned long len)  out:  	perf_event_mmap(vma);  	mm->total_vm += len >> PAGE_SHIFT; -	if (flags & VM_LOCKED) { -		if (!mlock_vma_pages_range(vma, addr, addr + len)) -			mm->locked_vm += (len >> PAGE_SHIFT); -	} +	if (flags & VM_LOCKED) +		mm->locked_vm += (len >> PAGE_SHIFT); +	vma->vm_flags |= VM_SOFTDIRTY;  	return addr;  } -EXPORT_SYMBOL(do_brk); +unsigned long vm_brk(unsigned long addr, unsigned long len) +{ +	struct mm_struct *mm = current->mm; +	unsigned long ret; +	bool populate; + +	down_write(&mm->mmap_sem); +	ret = do_brk(addr, len); +	populate = ((mm->def_flags & VM_LOCKED) != 0); +	up_write(&mm->mmap_sem); +	if (populate) +		mm_populate(addr, len); +	return ret; +} +EXPORT_SYMBOL(vm_brk);  /* Release all mmaps. */  void exit_mmap(struct mm_struct *mm)  { -	struct mmu_gather *tlb; +	struct mmu_gather tlb;  	struct vm_area_struct *vma;  	unsigned long nr_accounted = 0; -	unsigned long end;  	/* mm's last user has gone, and its about to be pulled down */  	mmu_notifier_release(mm); @@ -2275,33 +2720,37 @@ void exit_mmap(struct mm_struct *mm)  	lru_add_drain();  	flush_cache_mm(mm); -	tlb = tlb_gather_mmu(mm, 1); +	tlb_gather_mmu(&tlb, mm, 0, -1);  	/* update_hiwater_rss(mm) here? but nobody should be looking */  	/* Use -1 here to ensure all VMAs in the mm are unmapped */ -	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); -	vm_unacct_memory(nr_accounted); +	unmap_vmas(&tlb, vma, 0, -1); -	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); -	tlb_finish_mmu(tlb, 0, end); +	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); +	tlb_finish_mmu(&tlb, 0, -1);  	/*  	 * Walk the list again, actually closing and freeing it,  	 * with preemption enabled, without holding any MM locks.  	 */ -	while (vma) +	while (vma) { +		if (vma->vm_flags & VM_ACCOUNT) +			nr_accounted += vma_pages(vma);  		vma = remove_vma(vma); +	} +	vm_unacct_memory(nr_accounted); -	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); +	WARN_ON(atomic_long_read(&mm->nr_ptes) > +			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);  }  /* Insert vm structure into process list sorted by address   * and into the inode's i_mmap tree.  If vm_file is non-NULL - * then i_mmap_lock is taken here. + * then i_mmap_mutex is taken here.   
*/ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)  { -	struct vm_area_struct * __vma, * prev; -	struct rb_node ** rb_link, * rb_parent; +	struct vm_area_struct *prev; +	struct rb_node **rb_link, *rb_parent;  	/*  	 * The vm_pgoff of a purely anonymous vma should be irrelevant @@ -2319,12 +2768,13 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)  		BUG_ON(vma->anon_vma);  		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;  	} -	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); -	if (__vma && __vma->vm_start < vma->vm_end) +	if (find_vma_links(mm, vma->vm_start, vma->vm_end, +			   &prev, &rb_link, &rb_parent))  		return -ENOMEM;  	if ((vma->vm_flags & VM_ACCOUNT) &&  	     security_vm_enough_memory_mm(mm, vma_pages(vma)))  		return -ENOMEM; +  	vma_link(mm, vma, prev, rb_link, rb_parent);  	return 0;  } @@ -2334,60 +2784,75 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)   * prior to moving page table entries, to effect an mremap move.   */  struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, -	unsigned long addr, unsigned long len, pgoff_t pgoff) +	unsigned long addr, unsigned long len, pgoff_t pgoff, +	bool *need_rmap_locks)  {  	struct vm_area_struct *vma = *vmap;  	unsigned long vma_start = vma->vm_start;  	struct mm_struct *mm = vma->vm_mm;  	struct vm_area_struct *new_vma, *prev;  	struct rb_node **rb_link, *rb_parent; -	struct mempolicy *pol; +	bool faulted_in_anon_vma = true;  	/*  	 * If anonymous vma has not yet been faulted, update new pgoff  	 * to match new location, to increase its chance of merging.  	 */ -	if (!vma->vm_file && !vma->anon_vma) +	if (unlikely(!vma->vm_file && !vma->anon_vma)) {  		pgoff = addr >> PAGE_SHIFT; +		faulted_in_anon_vma = false; +	} -	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); +	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) +		return NULL;	/* should never get here */  	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,  			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));  	if (new_vma) {  		/*  		 * Source vma may have been merged into new_vma  		 */ -		if (vma_start >= new_vma->vm_start && -		    vma_start < new_vma->vm_end) -			*vmap = new_vma; +		if (unlikely(vma_start >= new_vma->vm_start && +			     vma_start < new_vma->vm_end)) { +			/* +			 * The only way we can get a vma_merge with +			 * self during an mremap is if the vma hasn't +			 * been faulted in yet and we were allowed to +			 * reset the dst vma->vm_pgoff to the +			 * destination address of the mremap to allow +			 * the merge to happen. mremap must change the +			 * vm_pgoff linearity between src and dst vmas +			 * (in turn preventing a vma_merge) to be +			 * safe. It is only safe to keep the vm_pgoff +			 * linear if there are no pages mapped yet. 
+			 */ +			VM_BUG_ON(faulted_in_anon_vma); +			*vmap = vma = new_vma; +		} +		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);  	} else {  		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);  		if (new_vma) {  			*new_vma = *vma; -			pol = mpol_dup(vma_policy(vma)); -			if (IS_ERR(pol)) +			new_vma->vm_start = addr; +			new_vma->vm_end = addr + len; +			new_vma->vm_pgoff = pgoff; +			if (vma_dup_policy(vma, new_vma))  				goto out_free_vma;  			INIT_LIST_HEAD(&new_vma->anon_vma_chain);  			if (anon_vma_clone(new_vma, vma))  				goto out_free_mempol; -			vma_set_policy(new_vma, pol); -			new_vma->vm_start = addr; -			new_vma->vm_end = addr + len; -			new_vma->vm_pgoff = pgoff; -			if (new_vma->vm_file) { +			if (new_vma->vm_file)  				get_file(new_vma->vm_file); -				if (vma->vm_flags & VM_EXECUTABLE) -					added_exe_file_vma(mm); -			}  			if (new_vma->vm_ops && new_vma->vm_ops->open)  				new_vma->vm_ops->open(new_vma);  			vma_link(mm, new_vma, prev, rb_link, rb_parent); +			*need_rmap_locks = false;  		}  	}  	return new_vma;   out_free_mempol: -	mpol_put(pol); +	mpol_put(vma_policy(new_vma));   out_free_vma:  	kmem_cache_free(vm_area_cachep, new_vma);  	return NULL; @@ -2409,6 +2874,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)  	return 1;  } +static int special_mapping_fault(struct vm_area_struct *vma, +				 struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ +	return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { +	.close = special_mapping_close, +	.fault = special_mapping_fault, +	.name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { +	.close = special_mapping_close, +	.fault = special_mapping_fault, +};  static int special_mapping_fault(struct vm_area_struct *vma,  				struct vm_fault *vmf) @@ -2424,7 +2914,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,  	 */  	pgoff = vmf->pgoff - vma->vm_pgoff; -	for (pages = vma->vm_private_data; pgoff && *pages; ++pages) +	if (vma->vm_ops == &legacy_special_mapping_vmops) +		pages = vma->vm_private_data; +	else +		pages = ((struct vm_special_mapping *)vma->vm_private_data)-> +			pages; + +	for (; pgoff && *pages; ++pages)  		pgoff--;  	if (*pages) { @@ -2437,81 +2933,95 @@ static int special_mapping_fault(struct vm_area_struct *vma,  	return VM_FAULT_SIGBUS;  } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { -	.close = special_mapping_close, -	.fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. 
- */ -int install_special_mapping(struct mm_struct *mm, -			    unsigned long addr, unsigned long len, -			    unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( +	struct mm_struct *mm, +	unsigned long addr, unsigned long len, +	unsigned long vm_flags, const struct vm_operations_struct *ops, +	void *priv)  { +	int ret;  	struct vm_area_struct *vma;  	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);  	if (unlikely(vma == NULL)) -		return -ENOMEM; +		return ERR_PTR(-ENOMEM);  	INIT_LIST_HEAD(&vma->anon_vma_chain);  	vma->vm_mm = mm;  	vma->vm_start = addr;  	vma->vm_end = addr + len; -	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; +	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;  	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); -	vma->vm_ops = &special_mapping_vmops; -	vma->vm_private_data = pages; +	vma->vm_ops = ops; +	vma->vm_private_data = priv; -	if (unlikely(insert_vm_struct(mm, vma))) { -		kmem_cache_free(vm_area_cachep, vma); -		return -ENOMEM; -	} +	ret = insert_vm_struct(mm, vma); +	if (ret) +		goto out;  	mm->total_vm += len >> PAGE_SHIFT;  	perf_event_mmap(vma); -	return 0; +	return vma; + +out: +	kmem_cache_free(vm_area_cachep, vma); +	return ERR_PTR(ret); +} + +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( +	struct mm_struct *mm, +	unsigned long addr, unsigned long len, +	unsigned long vm_flags, const struct vm_special_mapping *spec) +{ +	return __install_special_mapping(mm, addr, len, vm_flags, +					 &special_mapping_vmops, (void *)spec); +} + +int install_special_mapping(struct mm_struct *mm, +			    unsigned long addr, unsigned long len, +			    unsigned long vm_flags, struct page **pages) +{ +	struct vm_area_struct *vma = __install_special_mapping( +		mm, addr, len, vm_flags, &legacy_special_mapping_vmops, +		(void *)pages); + +	return PTR_ERR_OR_ZERO(vma);  }  static DEFINE_MUTEX(mm_all_locks_mutex);  static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)  { -	if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { +	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {  		/*  		 * The LSB of head.next can't change from under us  		 * because we hold the mm_all_locks_mutex.  		 */ -		spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); +		down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);  		/*  		 * We can safely modify head.next after taking the -		 * anon_vma->root->lock. If some other vma in this mm shares +		 * anon_vma->root->rwsem. If some other vma in this mm shares  		 * the same anon_vma we won't take it again.  		 *  		 * No need of atomic instructions here, head.next  		 * can't change from under us thanks to the -		 * anon_vma->root->lock. +		 * anon_vma->root->rwsem.  		 
*/  		if (__test_and_set_bit(0, (unsigned long *) -				       &anon_vma->root->head.next)) +				       &anon_vma->root->rb_root.rb_node))  			BUG();  	}  } @@ -2530,7 +3040,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)  		 */  		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))  			BUG(); -		spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); +		mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);  	}  } @@ -2552,12 +3062,12 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)   * A single task can't take more than one mm_take_all_locks() in a row   * or it would deadlock.   * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in   * mapping->flags avoid to take the same lock twice, if more than one   * vma in this mm is backed by the same anon_vma or address_space.   *   * We can take all the locks in random order because the VM code - * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never   * takes more than one of them in a row. Secondly we're protected   * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.   * @@ -2570,7 +3080,6 @@ int mm_take_all_locks(struct mm_struct *mm)  {  	struct vm_area_struct *vma;  	struct anon_vma_chain *avc; -	int ret = -EINTR;  	BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2591,34 +3100,32 @@ int mm_take_all_locks(struct mm_struct *mm)  				vm_lock_anon_vma(mm, avc->anon_vma);  	} -	ret = 0; +	return 0;  out_unlock: -	if (ret) -		mm_drop_all_locks(mm); - -	return ret; +	mm_drop_all_locks(mm); +	return -EINTR;  }  static void vm_unlock_anon_vma(struct anon_vma *anon_vma)  { -	if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { +	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {  		/*  		 * The LSB of head.next can't change to 0 from under  		 * us because we hold the mm_all_locks_mutex.  		 *  		 * We must however clear the bitflag before unlocking -		 * the vma so the users using the anon_vma->head will +		 * the vma so the users using the anon_vma->rb_root will  		 * never see our bitflag.  		 *  		 * No need of atomic instructions here, head.next  		 * can't change from under us until we release the -		 * anon_vma->root->lock. +		 * anon_vma->root->rwsem.  		 */  		if (!__test_and_clear_bit(0, (unsigned long *) -					  &anon_vma->root->head.next)) +					  &anon_vma->root->rb_root.rb_node))  			BUG(); -		anon_vma_unlock(anon_vma); +		anon_vma_unlock_write(anon_vma);  	}  } @@ -2629,7 +3136,7 @@ static void vm_unlock_mapping(struct address_space *mapping)  		 * AS_MM_ALL_LOCKS can't change to 0 from under us  		 * because we hold the mm_all_locks_mutex.  		 */ -		spin_unlock(&mapping->i_mmap_lock); +		mutex_unlock(&mapping->i_mmap_mutex);  		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,  					&mapping->flags))  			BUG(); @@ -2669,3 +3176,115 @@ void __init mmap_init(void)  	ret = percpu_counter_init(&vm_committed_as, 0);  	VM_BUG_ON(ret);  } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int init_user_reserve(void) +{ +	unsigned long free_kbytes; + +	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + +	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); +	return 0; +} +subsys_initcall(init_user_reserve); + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int init_admin_reserve(void) +{ +	unsigned long free_kbytes; + +	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + +	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); +	return 0; +} +subsys_initcall(init_admin_reserve); + +/* + * Reinititalise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, +			     unsigned long action, void *data) +{ +	unsigned long tmp, free_kbytes; + +	switch (action) { +	case MEM_ONLINE: +		/* Default max is 128MB. Leave alone if modified by operator. */ +		tmp = sysctl_user_reserve_kbytes; +		if (0 < tmp && tmp < (1UL << 17)) +			init_user_reserve(); + +		/* Default max is 8MB.  Leave alone if modified by operator. */ +		tmp = sysctl_admin_reserve_kbytes; +		if (0 < tmp && tmp < (1UL << 13)) +			init_admin_reserve(); + +		break; +	case MEM_OFFLINE: +		free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + +		if (sysctl_user_reserve_kbytes > free_kbytes) { +			init_user_reserve(); +			pr_info("vm.user_reserve_kbytes reset to %lu\n", +				sysctl_user_reserve_kbytes); +		} + +		if (sysctl_admin_reserve_kbytes > free_kbytes) { +			init_admin_reserve(); +			pr_info("vm.admin_reserve_kbytes reset to %lu\n", +				sysctl_admin_reserve_kbytes); +		} +		break; +	default: +		break; +	} +	return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { +	.notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ +	if (register_hotmemory_notifier(&reserve_mem_nb)) +		pr_err("Failed registering memory add/remove notifier for admin reserve\n"); + +	return 0; +} +subsys_initcall(init_reserve_notifier);  | 
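
For reference, the default reserve sizing chosen by init_user_reserve() and init_admin_reserve() above, 3% of free memory (free_kbytes / 32) capped at 128MB for vm.user_reserve_kbytes and 8MB for vm.admin_reserve_kbytes, can be reproduced from userspace. The sketch below is not part of the patch: it stands in MemFree from /proc/meminfo for global_page_state(NR_FREE_PAGES), so its output may differ slightly from what the kernel computes at boot.

/*
 * Userspace sketch: compute the default vm.user_reserve_kbytes and
 * vm.admin_reserve_kbytes the same way init_user_reserve() and
 * init_admin_reserve() do, using MemFree as an approximation of
 * NR_FREE_PAGES.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[256];
	unsigned long free_kbytes = 0;
	unsigned long user_reserve, admin_reserve;

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "MemFree: %lu kB", &free_kbytes) == 1)
			break;
	}
	fclose(f);

	user_reserve = free_kbytes / 32;	/* 3% of free memory */
	if (user_reserve > (1UL << 17))		/* 128MB cap */
		user_reserve = 1UL << 17;

	admin_reserve = free_kbytes / 32;
	if (admin_reserve > (1UL << 13))	/* 8MB cap */
		admin_reserve = 1UL << 13;

	printf("default vm.user_reserve_kbytes:  %lu\n", user_reserve);
	printf("default vm.admin_reserve_kbytes: %lu\n", admin_reserve);
	return 0;
}

These are only the boot-time defaults: once the operator writes /proc/sys/vm/user_reserve_kbytes or /proc/sys/vm/admin_reserve_kbytes, reserve_mem_notifier() respects the new value on memory hotplug, re-running the init functions only when the current value is still below the default cap (MEM_ONLINE) or larger than the remaining free memory (MEM_OFFLINE).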
