diff options
Diffstat (limited to 'mm/mmap.c')
| -rw-r--r-- | mm/mmap.c | 207 | 
1 files changed, 187 insertions, 20 deletions
| diff --git a/mm/mmap.c b/mm/mmap.c index 0db0de1c2fb..f681e1842fa 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,6 +6,7 @@   * Address space accounting code	<alan@lxorguk.ukuu.org.uk>   */ +#include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/backing-dev.h>  #include <linux/mm.h> @@ -33,6 +34,8 @@  #include <linux/uprobes.h>  #include <linux/rbtree_augmented.h>  #include <linux/sched/sysctl.h> +#include <linux/notifier.h> +#include <linux/memory.h>  #include <asm/uaccess.h>  #include <asm/cacheflush.h> @@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot);  int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */  int sysctl_overcommit_ratio __read_mostly = 50;	/* default is 50% */  int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */  /*   * Make sure vm_committed_as in one cacheline and not cacheline shared with   * other variables. It can be updated by several CPUs frequently. 
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-	unsigned long free, allowed;
+	unsigned long free, allowed, reserve;
 
 	vm_acct_memory(pages);
 
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 			free -= totalreserve_pages;
 
 		/*
-		 * Leave the last 3% for root
+		 * Reserve some for root
 		 */
 		if (!cap_sys_admin)
-			free -= free / 32;
+			free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
 
 		if (free > pages)
 			return 0;
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 	allowed = (totalram_pages - hugetlb_total_pages())
 	       	* sysctl_overcommit_ratio / 100;
 	/*
-	 * Leave the last 3% for root
+	 * Reserve some for root
 	 */
 	if (!cap_sys_admin)
-		allowed -= allowed / 32;
+		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
 	allowed += total_swap_pages;
 
-	/* Don't let a single process grow too big:
-	   leave 3% of the size of this process for other processes */
-	if (mm)
-		allowed -= mm->total_vm / 32;
+	/*
+	 * Don't let a single process grow so big a user can't recover
+	 */
+	if (mm) {
+		reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+		allowed -= min(mm->total_vm / 32, reserve);
+	}
 
 	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+		unsigned long addr, unsigned long end)
+{
+	unsigned long nr_pages = 0;
+	struct vm_area_struct *vma;
+
+	/* Find first overlapping mapping */
+	vma = find_vma_intersection(mm, addr, end);
+	if (!vma)
+		return 0;
+
+	nr_pages = (min(end, vma->vm_end) -
+		max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+	/* Iterate over the rest of the overlaps */
+	for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+		unsigned long overlap_len;
+
+		if (vma->vm_start > end)
+			break;
+
+		overlap_len = min(end, vma->vm_end) - vma->vm_start;
+		nr_pages += overlap_len >> PAGE_SHIFT;
+	}
+
+	return nr_pages;
+}
+
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct rb_node **rb_link, struct rb_node *rb_parent)
 {
@@ -829,7 +865,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
 		mm->map_count--;
-		mpol_put(vma_policy(next));
+		vma_set_policy(vma, vma_policy(next));
 		kmem_cache_free(vm_area_cachep, next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
@@ -1327,15 +1363,24 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 		file = fget(fd);
 		if (!file)
 			goto out;
+		if (is_file_hugepages(file))
+			len = ALIGN(len, huge_page_size(hstate_file(file)));
 	} else if (flags & MAP_HUGETLB) {
 		struct user_struct *user = NULL;
+		struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
+						   SHM_HUGE_MASK);
+
+		if (!hs)
+			return -EINVAL;
+
+		len = ALIGN(len, huge_page_size(hs));
 		/*
 		 * VM_NORESERVE is used because the reservations will be
 		 * taken when vm_ops->mmap() is called
 		 * A dummy user value is used because we are not locking
 		 * memory so no accounting is necessary
 		 */
-		file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
 				VM_NORESERVE,
 				&user, HUGETLB_ANONHUGE_INODE,
 				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1435,6 +1480,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long charged = 0;
 	struct inode *inode =  file ? file_inode(file) : NULL;
 
+	/* Check against address space limit. */
+	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+		unsigned long nr_pages;
+
+		/*
+		 * MAP_FIXED may remove pages of mappings that intersect with
+		 * requested mapping. Account for the pages it would unmap.
+		 */ +		if (!(vm_flags & MAP_FIXED)) +			return -ENOMEM; + +		nr_pages = count_vma_pages_range(mm, addr, addr + len); + +		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) +			return -ENOMEM; +	} +  	/* Clear old maps */  	error = -ENOMEM;  munmap_back: @@ -1444,10 +1506,6 @@ munmap_back:  		goto munmap_back;  	} -	/* Check against address space limit. */ -	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) -		return -ENOMEM; -  	/*  	 * Private writable mapping: check memory availability  	 */ @@ -1935,9 +1993,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)  {  	struct vm_area_struct *vma = NULL; -	if (WARN_ON_ONCE(!mm))		/* Remove this in linux-3.6 */ -		return NULL; -  	/* Check the cache first. */  	/* (Cache hit rate is typically around 35%.) */  	vma = ACCESS_ONCE(mm->mmap_cache); @@ -2305,7 +2360,7 @@ static void unmap_region(struct mm_struct *mm,  	update_hiwater_rss(mm);  	unmap_vmas(&tlb, vma, start, end);  	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, -				 next ? next->vm_start : 0); +				 next ? next->vm_start : USER_PGTABLES_CEILING);  	tlb_finish_mmu(&tlb, start, end);  } @@ -2685,7 +2740,7 @@ void exit_mmap(struct mm_struct *mm)  	/* Use -1 here to ensure all VMAs in the mm are unmapped */  	unmap_vmas(&tlb, vma, 0, -1); -	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); +	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);  	tlb_finish_mmu(&tlb, 0, -1);  	/* @@ -3097,3 +3152,115 @@ void __init mmap_init(void)  	ret = percpu_counter_init(&vm_committed_as, 0);  	VM_BUG_ON(ret);  } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int init_user_reserve(void) +{ +	unsigned long free_kbytes; + +	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + +	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); +	return 0; +} +module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int init_admin_reserve(void) +{ +	unsigned long free_kbytes; + +	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + +	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); +	return 0; +} +module_init(init_admin_reserve) + +/* + * Reinititalise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, +			     unsigned long action, void *data) +{ +	unsigned long tmp, free_kbytes; + +	switch (action) { +	case MEM_ONLINE: +		/* Default max is 128MB. Leave alone if modified by operator. 
*/ +		tmp = sysctl_user_reserve_kbytes; +		if (0 < tmp && tmp < (1UL << 17)) +			init_user_reserve(); + +		/* Default max is 8MB.  Leave alone if modified by operator. */ +		tmp = sysctl_admin_reserve_kbytes; +		if (0 < tmp && tmp < (1UL << 13)) +			init_admin_reserve(); + +		break; +	case MEM_OFFLINE: +		free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + +		if (sysctl_user_reserve_kbytes > free_kbytes) { +			init_user_reserve(); +			pr_info("vm.user_reserve_kbytes reset to %lu\n", +				sysctl_user_reserve_kbytes); +		} + +		if (sysctl_admin_reserve_kbytes > free_kbytes) { +			init_admin_reserve(); +			pr_info("vm.admin_reserve_kbytes reset to %lu\n", +				sysctl_admin_reserve_kbytes); +		} +		break; +	default: +		break; +	} +	return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { +	.notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ +	if (register_hotmemory_notifier(&reserve_mem_nb)) +		printk("Failed registering memory add/remove notifier for admin reserve"); + +	return 0; +} +module_init(init_reserve_notifier) | 
