Diffstat (limited to 'mm/mmap.c')
-rw-r--r--	mm/mmap.c	688
1 file changed, 462 insertions(+), 226 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 35730ee9d51..129b847d30c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,9 +6,13 @@
  * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
@@ -32,6 +36,10 @@
 #include <linux/khugepaged.h>
 #include <linux/uprobes.h>
 #include <linux/rbtree_augmented.h>
+#include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <linux/printk.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -82,7 +90,10 @@ EXPORT_SYMBOL(vm_get_page_prot);
 
 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio __read_mostly = 50;	/* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
 
 /*
  * Make sure vm_committed_as in one cacheline and not cacheline shared with
  * other variables. It can be updated by several CPUs frequently.
@@ -121,7 +132,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-	unsigned long free, allowed;
+	unsigned long free, allowed, reserve;
 
 	vm_acct_memory(pages);
 
@@ -143,7 +154,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		 */
 		free -= global_page_state(NR_SHMEM);
 
-		free += nr_swap_pages;
+		free += get_nr_swap_pages();
 
 		/*
 		 * Any slabs which are created with the
@@ -162,10 +173,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 			free -= totalreserve_pages;
 
 		/*
-		 * Leave the last 3% for root
+		 * Reserve some for root
 		 */
 		if (!cap_sys_admin)
-			free -= free / 32;
+			free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
 
 		if (free > pages)
 			return 0;
@@ -173,19 +184,20 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = (totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
-	 * Leave the last 3% for root
+	 * Reserve some for root
 	 */
 	if (!cap_sys_admin)
-		allowed -= allowed / 32;
-	allowed += total_swap_pages;
+		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
 
-	/* Don't let a single process grow too big:
-	   leave 3% of the size of this process for other processes */
-	if (mm)
-		allowed -= mm->total_vm / 32;
+	/*
+	 * Don't let a single process grow so big a user can't recover
+	 */
+	if (mm) {
+		reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+		allowed -= min(mm->total_vm / 32, reserve);
+	}
 
 	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 		return 0;
@@ -202,7 +214,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
+		atomic_inc(&file_inode(file)->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable--;
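
For reference, the sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10) expression above converts a kilobyte count into a page count, since one page is 2^(PAGE_SHIFT - 10) kilobytes. A minimal userspace sketch of the same arithmetic (the variable names here are illustrative, not from the patch):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page_kb = sysconf(_SC_PAGESIZE) / 1024;	/* e.g. 4 on x86 */
	unsigned long admin_reserve_kb = 1UL << 13;	/* the 8MB default */

	printf("admin reserve: %lu kB = %lu pages\n",
	       admin_reserve_kb, admin_reserve_kb / page_kb);
	return 0;
}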
@@ -255,6 +267,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	unsigned long newbrk, oldbrk;
 	struct mm_struct *mm = current->mm;
 	unsigned long min_brk;
+	bool populate;
 
 	down_write(&mm->mmap_sem);
 
@@ -304,8 +317,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	/* Ok, looks good - let it rip. */
 	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
 		goto out;
+
 set_brk:
 	mm->brk = brk;
+	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(oldbrk, newbrk - oldbrk);
+	return brk;
+
 out:
 	retval = mm->brk;
 	up_write(&mm->mmap_sem);
@@ -344,20 +364,20 @@ static int browse_rb(struct rb_root *root)
 		struct vm_area_struct *vma;
 		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
 		if (vma->vm_start < prev) {
-			printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+			pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
 			bug = 1;
 		}
 		if (vma->vm_start < pend) {
-			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
+			pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
 			bug = 1;
 		}
 		if (vma->vm_start > vma->vm_end) {
-			printk("vm_end %lx < vm_start %lx\n",
+			pr_info("vm_end %lx < vm_start %lx\n",
 				vma->vm_end, vma->vm_start);
 			bug = 1;
 		}
 		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
-			printk("free gap %lx, correct %lx\n",
+			pr_info("free gap %lx, correct %lx\n",
 			       vma->rb_subtree_gap,
 			       vma_compute_subtree_gap(vma));
 			bug = 1;
@@ -371,7 +391,7 @@ static int browse_rb(struct rb_root *root)
 	for (nd = pn; nd; nd = rb_prev(nd))
 		j++;
 	if (i != j) {
-		printk("backwards %d, forwards %d\n", j, i);
+		pr_info("backwards %d, forwards %d\n", j, i);
 		bug = 1;
 	}
 	return bug ? -1 : i;
@@ -389,7 +409,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
 	}
 }
 
-void validate_mm(struct mm_struct *mm)
+static void validate_mm(struct mm_struct *mm)
 {
 	int bug = 0;
 	int i = 0;
@@ -406,17 +426,17 @@ void validate_mm(struct mm_struct *mm)
 		i++;
 	}
 	if (i != mm->map_count) {
-		printk("map_count %d vm_next %d\n", mm->map_count, i);
+		pr_info("map_count %d vm_next %d\n", mm->map_count, i);
 		bug = 1;
 	}
 	if (highest_address != mm->highest_vm_end) {
-		printk("mm->highest_vm_end %lx, found %lx\n",
+		pr_info("mm->highest_vm_end %lx, found %lx\n",
 			mm->highest_vm_end, highest_address);
 		bug = 1;
 	}
 	i = browse_rb(&mm->mm_rb);
 	if (i != mm->map_count) {
-		printk("map_count %d rb %d\n", mm->map_count, i);
+		pr_info("map_count %d rb %d\n", mm->map_count, i);
 		bug = 1;
 	}
 	BUG_ON(bug);
@@ -534,6 +554,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+		unsigned long addr, unsigned long end)
+{
+	unsigned long nr_pages = 0;
+	struct vm_area_struct *vma;
+
+	/* Find the first overlapping mapping */
+	vma = find_vma_intersection(mm, addr, end);
+	if (!vma)
+		return 0;
+
+	nr_pages = (min(end, vma->vm_end) -
+		max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+	/* Iterate over the rest of the overlaps */
+	for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+		unsigned long overlap_len;
+
+		if (vma->vm_start > end)
+			break;
+
+		overlap_len = min(end, vma->vm_end) - vma->vm_start;
+		nr_pages += overlap_len >> PAGE_SHIFT;
+	}
+
+	return nr_pages;
+}
+
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct rb_node **rb_link, struct rb_node *rb_parent)
 {
@@ -567,7 +615,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
 		struct address_space *mapping = file->f_mapping;
 
 		if (vma->vm_flags & VM_DENYWRITE)
-			atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
+			atomic_dec(&file_inode(file)->i_writecount);
 		if (vma->vm_flags & VM_SHARED)
 			mapping->i_mmap_writable++;
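
The brk path above now defers population until after mmap_sem is dropped: when mm->def_flags carries VM_LOCKED (e.g. after mlockall(MCL_FUTURE)), the newly extended heap is faulted in via mm_populate(). A rough userspace way to observe the effect (a sketch, not a test from the patch):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* After MCL_FUTURE, future mappings -- including brk growth --
	 * are mlocked, so the kernel pre-faults the new heap pages. */
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
		perror("mlockall");	/* needs CAP_IPC_LOCK or a high RLIMIT_MEMLOCK */
		return 1;
	}
	void *old = sbrk(0);
	sbrk(1 << 20);			/* grow the heap by 1MB */
	printf("heap grew at %p; VmLck in /proc/self/status should rise\n", old);
	return 0;
}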
@@ -595,11 +643,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct address_space *mapping = NULL;
 
-	if (vma->vm_file)
+	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
-
-	if (mapping)
 		mutex_lock(&mapping->i_mmap_mutex);
+	}
 
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	__vma_link_file(vma);
@@ -637,8 +684,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	prev->vm_next = next = vma->vm_next;
 	if (next)
 		next->vm_prev = prev;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
 }
 
 /*
@@ -800,7 +848,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		anon_vma_interval_tree_post_update_vma(vma);
 		if (adjust_next)
 			anon_vma_interval_tree_post_update_vma(next);
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 	}
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
@@ -850,7 +898,15 @@ again:			remove_next = 1 + (end > next->vm_end);
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
-	if (vma->vm_flags ^ vm_flags)
+	/*
+	 * VM_SOFTDIRTY should not prevent VMA merging if the flags match
+	 * in everything but the dirty bit -- the caller should mark the
+	 * merged VMA as dirty. If the dirty bit were included in the
+	 * comparison, we would increase pressure on the memory system,
+	 * forcing the kernel to generate new VMAs where old ones could
+	 * have been extended instead.
+	 */
+	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
 		return 0;
 	if (vma->vm_file != file)
 		return 0;
@@ -910,7 +966,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 	if (is_mergeable_vma(vma, file, vm_flags) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
-		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+		vm_pglen = vma_pages(vma);
 		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
 			return 1;
 	}
@@ -1039,7 +1095,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
 	return a->vm_end == b->vm_start &&
 		mpol_equal(vma_policy(a), vma_policy(b)) &&
 		a->vm_file == b->vm_file &&
-		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
 		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
 }
 
@@ -1147,18 +1203,38 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 	return hint;
 }
 
+static inline int mlock_future_check(struct mm_struct *mm,
+				     unsigned long flags,
+				     unsigned long len)
+{
+	unsigned long locked, lock_limit;
+
+	/*  mlock MCL_FUTURE? */
+	if (flags & VM_LOCKED) {
+		locked = len >> PAGE_SHIFT;
+		locked += mm->locked_vm;
+		lock_limit = rlimit(RLIMIT_MEMLOCK);
+		lock_limit >>= PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+	return 0;
+}
+
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
-			unsigned long flags, unsigned long pgoff)
+			unsigned long flags, unsigned long pgoff,
+			unsigned long *populate)
 {
 	struct mm_struct * mm = current->mm;
-	struct inode *inode;
 	vm_flags_t vm_flags;
 
+	*populate = 0;
+
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
 	 *
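
mlock_future_check() centralizes the RLIMIT_MEMLOCK test that was previously open-coded in both do_mmap_pgoff() and do_brk(). The equivalent arithmetic viewed from userspace (a hedged sketch; getrlimit() reports bytes where the kernel works in pages, and the CAP_IPC_LOCK escape hatch is omitted):

#include <stdio.h>
#include <sys/resource.h>

/* Would locking 'len' more bytes exceed RLIMIT_MEMLOCK?  'already_locked'
 * corresponds to mm->locked_vm (VmLck in /proc/self/status). */
static int would_exceed_memlock(size_t already_locked, size_t len)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
		return 1;
	if (rl.rlim_cur == RLIM_INFINITY)
		return 0;
	return already_locked + len > rl.rlim_cur;
}

int main(void)
{
	printf("locking 1MB from zero %s the limit\n",
	       would_exceed_memlock(0, 1 << 20) ? "exceeds" : "fits within");
	return 0;
}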
@@ -1206,20 +1282,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		if (!can_do_mlock())
 			return -EPERM;
 
-	/* mlock MCL_FUTURE? */
-	if (vm_flags & VM_LOCKED) {
-		unsigned long locked, lock_limit;
-		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
-		lock_limit = rlimit(RLIMIT_MEMLOCK);
-		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
-	}
-
-	inode = file ? file->f_path.dentry->d_inode : NULL;
+	if (mlock_future_check(mm, vm_flags, len))
+		return -EAGAIN;
 
 	if (file) {
+		struct inode *inode = file_inode(file);
+
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -1235,7 +1303,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			/*
 			 * Make sure there are no mandatory locks on the file.
 			 */
-			if (locks_verify_locked(inode))
+			if (locks_verify_locked(file))
 				return -EAGAIN;
 
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
@@ -1252,8 +1320,10 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 				vm_flags &= ~VM_MAYEXEC;
 			}
 
-			if (!file->f_op || !file->f_op->mmap)
+			if (!file->f_op->mmap)
 				return -ENODEV;
+			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+				return -EINVAL;
 			break;
 
 		default:
@@ -1262,6 +1332,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	} else {
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
+			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+				return -EINVAL;
 			/*
 			 * Ignore pgoff.
 			 */
@@ -1279,7 +1351,26 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		}
 	}
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+	/*
+	 * Set 'VM_NORESERVE' if we should not account for the
+	 * memory use of this mapping.
+	 */
+	if (flags & MAP_NORESERVE) {
+		/* We honor MAP_NORESERVE if allowed to overcommit */
+		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			vm_flags |= VM_NORESERVE;
+
+		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
+		if (file && is_file_hugepages(file))
+			vm_flags |= VM_NORESERVE;
+	}
+
+	addr = mmap_region(file, addr, len, vm_flags, pgoff);
+	if (!IS_ERR_VALUE(addr) &&
+	    ((vm_flags & VM_LOCKED) ||
+	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+		*populate = len;
+	return addr;
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1291,20 +1382,30 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	if (!(flags & MAP_ANONYMOUS)) {
 		audit_mmap_fd(fd, flags);
-		if (unlikely(flags & MAP_HUGETLB))
-			return -EINVAL;
 		file = fget(fd);
 		if (!file)
 			goto out;
+		if (is_file_hugepages(file))
+			len = ALIGN(len, huge_page_size(hstate_file(file)));
+		retval = -EINVAL;
+		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+			goto out_fput;
 	} else if (flags & MAP_HUGETLB) {
 		struct user_struct *user = NULL;
+		struct hstate *hs;
+
+		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+		if (!hs)
+			return -EINVAL;
+
+		len = ALIGN(len, huge_page_size(hs));
 		/*
 		 * VM_NORESERVE is used because the reservations will be
 		 * taken when vm_ops->mmap() is called
 		 * A dummy user value is used because we are not locking
 		 * memory so no accounting is necessary
 		 */
-		file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
 				VM_NORESERVE,
 				&user, HUGETLB_ANONHUGE_INODE,
 				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1315,6 +1416,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
 	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
 	if (file)
 		fput(file);
 out:
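
do_mmap_pgoff() no longer populates the mapping itself; it reports the length back through *populate so the caller can fault pages in after mmap_sem is released. From userspace the trigger is simply MAP_POPULATE (or a locked mapping); a minimal sketch:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* MAP_POPULATE asks the kernel to pre-fault all pages, which now
	 * happens via mm_populate() after mmap_sem has been dropped. */
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("mapped %zu pre-faulted bytes at %p\n", len, p);
	munmap(p, len);
	return 0;
}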
@@ -1394,16 +1496,30 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long flags,
-		vm_flags_t vm_flags, unsigned long pgoff)
+		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
-	int correct_wcount = 0;
 	int error;
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
-	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
+
+	/* Check against address space limit. */
+	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+		unsigned long nr_pages;
+
+		/*
+		 * MAP_FIXED may remove pages of mappings that intersect
+		 * with the requested mapping. Account for the pages it
+		 * would unmap.
+		 */
+		if (!(vm_flags & MAP_FIXED))
+			return -ENOMEM;
+
+		nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+			return -ENOMEM;
+	}
 
 	/* Clear old maps */
 	error = -ENOMEM;
@@ -1414,24 +1530,6 @@ munmap_back:
 		goto munmap_back;
 	}
 
-	/* Check against address space limit. */
-	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
-		return -ENOMEM;
-
-	/*
-	 * Set 'VM_NORESERVE' if we should not account for the
-	 * memory use of this mapping.
-	 */
-	if ((flags & MAP_NORESERVE)) {
-		/* We honor MAP_NORESERVE if allowed to overcommit */
-		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-			vm_flags |= VM_NORESERVE;
-
-		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
-		if (file && is_file_hugepages(file))
-			vm_flags |= VM_NORESERVE;
-	}
-
 	/*
 	 * Private writable mapping: check memory availability
 	 */
@@ -1468,16 +1566,11 @@ munmap_back:
 	vma->vm_pgoff = pgoff;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
-	error = -EINVAL;	/* when rejecting VM_GROWSDOWN|VM_GROWSUP */
-
 	if (file) {
-		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
-			goto free_vma;
 		if (vm_flags & VM_DENYWRITE) {
 			error = deny_write_access(file);
 			if (error)
 				goto free_vma;
-			correct_wcount = 1;
 		}
 		vma->vm_file = get_file(file);
 		error = file->f_op->mmap(file, vma);
@@ -1494,11 +1587,8 @@ munmap_back:
 		WARN_ON_ONCE(addr != vma->vm_start);
 
 		addr = vma->vm_start;
-		pgoff = vma->vm_pgoff;
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
-		if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
-			goto free_vma;
 		error = shmem_zero_setup(vma);
 		if (error)
 			goto free_vma;
@@ -1520,29 +1610,39 @@ munmap_back:
 	}
 
 	vma_link(mm, vma, prev, rb_link, rb_parent);
-	file = vma->vm_file;
 
 	/* Once vma denies write, undo our temporary denial count */
-	if (correct_wcount)
-		atomic_inc(&inode->i_writecount);
+	if (vm_flags & VM_DENYWRITE)
+		allow_write_access(file);
+	file = vma->vm_file;
 out:
 	perf_event_mmap(vma);
 
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
-		if (!mlock_vma_pages_range(vma, addr, addr + len))
+		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
-	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
-		make_pages_present(addr, addr + len);
+		else
+			vma->vm_flags &= ~VM_LOCKED;
+	}
 
 	if (file)
 		uprobe_mmap(vma);
 
+	/*
+	 * A new (or expanded) vma always gets soft-dirty status.
+	 * Otherwise the user-space soft-dirty page tracker could not
+	 * distinguish a vma that was unmapped and then mapped again
+	 * in place (which must be treated as a completely new data
+	 * area).
+	 */
+	vma->vm_flags |= VM_SOFTDIRTY;
+
	return addr;
 
 unmap_and_free_vma:
-	if (correct_wcount)
-		atomic_inc(&inode->i_writecount);
+	if (vm_flags & VM_DENYWRITE)
+		allow_write_access(file);
 	vma->vm_file = NULL;
 	fput(file);
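
VM_SOFTDIRTY feeds the soft-dirty tracking interface: clearing the bits through /proc/pid/clear_refs and reading them back from /proc/pid/pagemap (bit 55 of each entry). A rough userspace sketch of one tracking round, assuming a kernel with CONFIG_MEM_SOFT_DIRTY and 4KB pages:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	static char buf[4096] __attribute__((aligned(4096)));
	uint64_t entry = 0;
	int fd;

	/* Writing "4" to clear_refs clears the soft-dirty bits ... */
	fd = open("/proc/self/clear_refs", O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "4", 1);
	close(fd);

	buf[0] = 1;	/* dirty one page */

	/* ... and pagemap bit 55 reports which pages were written since. */
	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;
	pread(fd, &entry, sizeof(entry),
	      ((uintptr_t)buf / 4096) * sizeof(entry));
	close(fd);

	printf("page is %ssoft-dirty\n", (entry >> 55) & 1 ? "" : "not ");
	return 0;
}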
@@ -1777,7 +1877,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -1786,29 +1886,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
 
 	info.flags = 0;
 	info.length = len;
-	info.low_limit = TASK_UNMAPPED_BASE;
+	info.low_limit = mm->mmap_base;
 	info.high_limit = TASK_SIZE;
 	info.align_mask = 0;
 	return vm_unmapped_area(&info);
 }
 #endif
 
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
-	/*
-	 * Is this a new hole at the lowest possible address?
-	 */
-	if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
-		mm->free_area_cache = addr;
-}
-
 /*
  * This mmap-allocator allocates new areas top-down from below the
  * stack's low limit (the base):
@@ -1825,7 +1916,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_unmapped_area_info info;
 
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -1835,14 +1926,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
-	info.low_limit = PAGE_SIZE;
+	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 	info.high_limit = mm->mmap_base;
 	info.align_mask = 0;
 	addr = vm_unmapped_area(&info);
@@ -1865,19 +1956,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 }
 #endif
 
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
-	/*
-	 * Is this a new hole at the highest possible address?
-	 */
-	if (addr > mm->free_area_cache)
-		mm->free_area_cache = addr;
-
-	/* dont allow allocations above current base */
-	if (mm->free_area_cache > mm->mmap_base)
-		mm->free_area_cache = mm->mmap_base;
-}
-
 unsigned long
 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
@@ -1894,7 +1972,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		return -ENOMEM;
 
 	get_area = current->mm->get_unmapped_area;
-	if (file && file->f_op && file->f_op->get_unmapped_area)
+	if (file && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
 	if (IS_ERR_VALUE(addr))
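
Both allocators now clamp their search window by mmap_min_addr, the floor below which user mappings are refused (mainly to frustrate NULL-dereference exploits). The effective value is visible from userspace; a trivial check (sketch):

#include <stdio.h>

int main(void)
{
	unsigned long min_addr = 0;
	FILE *f = fopen("/proc/sys/vm/mmap_min_addr", "r");

	/* Mappings below this address are rejected for unprivileged tasks. */
	if (f && fscanf(f, "%lu", &min_addr) == 1)
		printf("mmap_min_addr = %lu\n", min_addr);
	if (f)
		fclose(f);
	return 0;
}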
@@ -1915,37 +1993,33 @@ EXPORT_SYMBOL(get_unmapped_area);
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
-	struct vm_area_struct *vma = NULL;
-
-	if (WARN_ON_ONCE(!mm))		/* Remove this in linux-3.6 */
-		return NULL;
+	struct rb_node *rb_node;
+	struct vm_area_struct *vma;
 
 	/* Check the cache first. */
-	/* (Cache hit rate is typically around 35%.) */
-	vma = mm->mmap_cache;
-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-		struct rb_node *rb_node;
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
+		return vma;
 
-		rb_node = mm->mm_rb.rb_node;
-		vma = NULL;
+	rb_node = mm->mm_rb.rb_node;
+	vma = NULL;
 
-		while (rb_node) {
-			struct vm_area_struct *vma_tmp;
-
-			vma_tmp = rb_entry(rb_node,
-					   struct vm_area_struct, vm_rb);
-
-			if (vma_tmp->vm_end > addr) {
-				vma = vma_tmp;
-				if (vma_tmp->vm_start <= addr)
-					break;
-				rb_node = rb_node->rb_left;
-			} else
-				rb_node = rb_node->rb_right;
-		}
-		if (vma)
-			mm->mmap_cache = vma;
+	while (rb_node) {
+		struct vm_area_struct *tmp;
+
+		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+		if (tmp->vm_end > addr) {
+			vma = tmp;
+			if (tmp->vm_start <= addr)
+				break;
+			rb_node = rb_node->rb_left;
+		} else
+			rb_node = rb_node->rb_right;
 	}
+
+	if (vma)
+		vmacache_update(addr, vma);
+
 	return vma;
 }
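
The rewritten find_vma() keeps the same search invariant: walk the tree remembering the lowest-ending VMA whose vm_end exceeds addr, and stop early if that VMA also starts at or below addr. A stripped-down illustration of the same descent over a plain binary tree (hypothetical types, not kernel code):

#include <stdio.h>
#include <stddef.h>

struct node {
	unsigned long start, end;	/* [start, end), like a VMA */
	struct node *left, *right;	/* ordered by address range */
};

/* Find the first interval with end > addr, exactly as find_vma() does:
 * descend left while the current node might still be the answer,
 * right when it ends at or before addr. */
static struct node *first_ending_after(struct node *n, unsigned long addr)
{
	struct node *best = NULL;

	while (n) {
		if (n->end > addr) {
			best = n;
			if (n->start <= addr)
				break;		/* addr inside this interval */
			n = n->left;
		} else
			n = n->right;
	}
	return best;
}

int main(void)
{
	struct node b = { 0x3000, 0x4000, NULL, NULL };
	struct node a = { 0x1000, 0x2000, NULL, &b };
	struct node *hit = first_ending_after(&a, 0x2800);

	if (hit)
		printf("found [%lx, %lx)\n", hit->start, hit->end);
	return 0;
}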
@@ -2169,9 +2243,28 @@ int expand_downwards(struct vm_area_struct *vma,
 	return error;
 }
 
+/*
+ * Note how expand_stack() refuses to expand the stack all the way to
+ * abut the next virtual mapping, *unless* that mapping itself is also
+ * a stack mapping. We want to leave room for a guard page, after all
+ * (the guard page itself is not added here, that is done by the
+ * actual page faulting logic)
+ *
+ * This matches the behavior of the guard page logic (see mm/memory.c:
+ * check_stack_guard_page()), which only allows the guard page to be
+ * removed under these circumstances.
+ */
 #ifdef CONFIG_STACK_GROWSUP
 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 {
+	struct vm_area_struct *next;
+
+	address &= PAGE_MASK;
+	next = vma->vm_next;
+	if (next && next->vm_start == address + PAGE_SIZE) {
+		if (!(next->vm_flags & VM_GROWSUP))
+			return -ENOMEM;
+	}
 	return expand_upwards(vma, address);
 }
 
@@ -2186,14 +2279,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 		return vma;
 	if (!prev || expand_stack(prev, addr))
 		return NULL;
-	if (prev->vm_flags & VM_LOCKED) {
-		mlock_vma_pages_range(prev, addr, prev->vm_end);
-	}
+	if (prev->vm_flags & VM_LOCKED)
+		__mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
 	return prev;
 }
 #else
 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 {
+	struct vm_area_struct *prev;
+
+	address &= PAGE_MASK;
+	prev = vma->vm_prev;
+	if (prev && prev->vm_end == address) {
+		if (!(prev->vm_flags & VM_GROWSDOWN))
+			return -ENOMEM;
+	}
 	return expand_downwards(vma, address);
 }
 
@@ -2214,9 +2314,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 	start = vma->vm_start;
 	if (expand_stack(vma, addr))
 		return NULL;
-	if (vma->vm_flags & VM_LOCKED) {
-		mlock_vma_pages_range(vma, addr, start);
-	}
+	if (vma->vm_flags & VM_LOCKED)
+		__mlock_vma_pages_range(vma, addr, start, NULL);
 	return vma;
 }
 #endif
@@ -2258,11 +2357,11 @@ static void unmap_region(struct mm_struct *mm,
 	struct mmu_gather tlb;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, 0);
+	tlb_gather_mmu(&tlb, mm, start, end);
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end);
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-				 next ? next->vm_start : 0);
+				 next ? next->vm_start : USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, start, end);
 }
 
@@ -2276,7 +2375,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct vm_area_struct **insertion_point;
 	struct vm_area_struct *tail_vma = NULL;
-	unsigned long addr;
 
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	vma->vm_prev = NULL;
@@ -2293,12 +2391,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		mm->highest_vm_end = prev ? prev->vm_end : 0;
 	tail_vma->vm_next = NULL;
-	if (mm->unmap_area == arch_unmap_area)
-		addr = prev ? prev->vm_end : mm->mmap_base;
-	else
-		addr = vma ? vma->vm_start : mm->mmap_base;
-	mm->unmap_area(mm, addr);
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
 }
 
 /*
@@ -2308,7 +2403,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	      unsigned long addr, int new_below)
 {
-	struct mempolicy *pol;
 	struct vm_area_struct *new;
 	int err = -ENOMEM;
 
@@ -2332,12 +2426,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 	}
 
-	pol = mpol_dup(vma_policy(vma));
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
+	err = vma_dup_policy(vma, new);
+	if (err)
 		goto out_free_vma;
-	}
-	vma_set_policy(new, pol);
 
 	if (anon_vma_clone(new, vma))
 		goto out_free_mpol;
@@ -2365,7 +2456,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 		fput(new->vm_file);
 	unlink_anon_vmas(new);
 out_free_mpol:
-	mpol_put(pol);
+	mpol_put(vma_policy(new));
 out_free_vma:
 	kmem_cache_free(vm_area_cachep, new);
 out_err:
 	return err;
@@ -2524,18 +2615,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	if (error & ~PAGE_MASK)
 		return error;
 
-	/*
-	 * mlock MCL_FUTURE?
-	 */
-	if (mm->def_flags & VM_LOCKED) {
-		unsigned long locked, lock_limit;
-		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
-		lock_limit = rlimit(RLIMIT_MEMLOCK);
-		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
-	}
+	error = mlock_future_check(mm, mm->def_flags, len);
+	if (error)
+		return error;
 
 	/*
 	 * mm->mmap_sem is required to protect against another thread
@@ -2589,10 +2671,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
-	if (flags & VM_LOCKED) {
-		if (!mlock_vma_pages_range(vma, addr, addr + len))
-			mm->locked_vm += (len >> PAGE_SHIFT);
-	}
+	if (flags & VM_LOCKED)
+		mm->locked_vm += (len >> PAGE_SHIFT);
+	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
 }
 
@@ -2600,10 +2681,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long ret;
+	bool populate;
 
 	down_write(&mm->mmap_sem);
 	ret = do_brk(addr, len);
+	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(addr, len);
 	return ret;
 }
 EXPORT_SYMBOL(vm_brk);
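
__split_vma() is what runs when an operation covers only part of a VMA; the mpol_dup()/vma_set_policy() pair above collapses into vma_dup_policy(). From userspace, an easy way to watch a split happen is to mprotect() the middle of a mapping and count VMAs before and after (a sketch):

#include <stdio.h>
#include <sys/mman.h>

/* Count lines in /proc/self/maps: each line is one VMA. */
static int count_vmas(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	int c, n = 0;

	if (!f)
		return -1;
	while ((c = fgetc(f)) != EOF)
		if (c == '\n')
			n++;
	fclose(f);
	return n;
}

int main(void)
{
	long pg = 4096;
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("VMAs before: %d\n", count_vmas());
	mprotect(p + pg, pg, PROT_READ);	/* middle page: two __split_vma() calls */
	printf("VMAs after:  %d\n", count_vmas());
	return 0;
}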
@@ -2635,12 +2720,12 @@ void exit_mmap(struct mm_struct *mm)
 
 	lru_add_drain();
 	flush_cache_mm(mm);
-	tlb_gather_mmu(&tlb, mm, 1);
+	tlb_gather_mmu(&tlb, mm, 0, -1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
-	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
 	/*
@@ -2654,7 +2739,8 @@ void exit_mmap(struct mm_struct *mm)
 	}
 	vm_unacct_memory(nr_accounted);
 
-	WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+	WARN_ON(atomic_long_read(&mm->nr_ptes) >
+			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
@@ -2706,7 +2792,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma, *prev;
 	struct rb_node **rb_link, *rb_parent;
-	struct mempolicy *pol;
 	bool faulted_in_anon_vma = true;
 
 	/*
@@ -2751,10 +2836,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma->vm_start = addr;
 		new_vma->vm_end = addr + len;
 		new_vma->vm_pgoff = pgoff;
-		pol = mpol_dup(vma_policy(vma));
-		if (IS_ERR(pol))
+		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
-		vma_set_policy(new_vma, pol);
 		INIT_LIST_HEAD(&new_vma->anon_vma_chain);
 		if (anon_vma_clone(new_vma, vma))
 			goto out_free_mempol;
@@ -2769,7 +2852,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return new_vma;
 
 out_free_mempol:
-	mpol_put(pol);
+	mpol_put(vma_policy(new_vma));
 out_free_vma:
 	kmem_cache_free(vm_area_cachep, new_vma);
 	return NULL;
@@ -2791,6 +2874,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
 	return 1;
 }
 
+static int special_mapping_fault(struct vm_area_struct *vma,
+				 struct vm_fault *vmf);
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const char *special_mapping_name(struct vm_area_struct *vma)
+{
+	return ((struct vm_special_mapping *)vma->vm_private_data)->name;
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+	.close = special_mapping_close,
+	.fault = special_mapping_fault,
+	.name = special_mapping_name,
+};
+
+static const struct vm_operations_struct legacy_special_mapping_vmops = {
+	.close = special_mapping_close,
+	.fault = special_mapping_fault,
+};
+
 static int special_mapping_fault(struct vm_area_struct *vma,
 				 struct vm_fault *vmf)
@@ -2806,7 +2914,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
 	 */
 	pgoff = vmf->pgoff - vma->vm_pgoff;
 
-	for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+	if (vma->vm_ops == &legacy_special_mapping_vmops)
+		pages = vma->vm_private_data;
+	else
+		pages = ((struct vm_special_mapping *)vma->vm_private_data)->
+			pages;
+
+	for (; pgoff && *pages; ++pages)
 		pgoff--;
 
 	if (*pages) {
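
The .name hook added here is what lets a special mapping show up with a label in /proc/pid/maps, the way the vDSO does on many architectures. A quick userspace look at such entries (a sketch; which bracketed names appear depends on the architecture and kernel):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	/* Special mappings appear with bracketed names, e.g. [vdso]. */
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "[vdso]") || strstr(line, "[vvar]"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}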
@@ -2819,48 +2933,29 @@ static int special_mapping_fault(struct vm_area_struct *vma,
 	return VM_FAULT_SIGBUS;
 }
 
-/*
- * Having a close hook prevents vma merging regardless of flags.
- */
-static void special_mapping_close(struct vm_area_struct *vma)
-{
-}
-
-static const struct vm_operations_struct special_mapping_vmops = {
-	.close = special_mapping_close,
-	.fault = special_mapping_fault,
-};
-
-/*
- * Called with mm->mmap_sem held for writing.
- * Insert a new vma covering the given region, with the given flags.
- * Its pages are supplied by the given array of struct page *.
- * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
- * The region past the last page supplied will always produce SIGBUS.
- * The array pointer and the pages it points to are assumed to stay alive
- * for as long as this mapping might exist.
- */
-int install_special_mapping(struct mm_struct *mm,
-			    unsigned long addr, unsigned long len,
-			    unsigned long vm_flags, struct page **pages)
+static struct vm_area_struct *__install_special_mapping(
+	struct mm_struct *mm,
+	unsigned long addr, unsigned long len,
+	unsigned long vm_flags, const struct vm_operations_struct *ops,
+	void *priv)
 {
 	int ret;
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (unlikely(vma == NULL))
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
-	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
-	vma->vm_ops = &special_mapping_vmops;
-	vma->vm_private_data = pages;
+	vma->vm_ops = ops;
+	vma->vm_private_data = priv;
 
 	ret = insert_vm_struct(mm, vma);
 	if (ret)
@@ -2870,11 +2965,40 @@ int install_special_mapping(struct mm_struct *mm,
 
 	perf_event_mmap(vma);
 
-	return 0;
+	return vma;
 
 out:
 	kmem_cache_free(vm_area_cachep, vma);
-	return ret;
+	return ERR_PTR(ret);
+}
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+struct vm_area_struct *_install_special_mapping(
+	struct mm_struct *mm,
+	unsigned long addr, unsigned long len,
+	unsigned long vm_flags, const struct vm_special_mapping *spec)
+{
+	return __install_special_mapping(mm, addr, len, vm_flags,
+				&special_mapping_vmops, (void *)spec);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+			    unsigned long addr, unsigned long len,
+			    unsigned long vm_flags, struct page **pages)
+{
+	struct vm_area_struct *vma = __install_special_mapping(
+		mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
+		(void *)pages);
+
+	return PTR_ERR_OR_ZERO(vma);
 }
 
 static DEFINE_MUTEX(mm_all_locks_mutex);
@@ -2943,7 +3067,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
 * takes more than one of them in a row. Secondly we're protected
 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
 *
@@ -3001,7 +3125,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 		if (!__test_and_clear_bit(0, (unsigned long *)
 					  &anon_vma->root->rb_root.rb_node))
 			BUG();
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 	}
 }
 
@@ -3052,3 +3176,115 @@ void __init mmap_init(void)
 	ret = percpu_counter_init(&vm_committed_as, 0);
 	VM_BUG_ON(ret);
 }
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+	unsigned long free_kbytes;
+
+	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+	return 0;
+}
+subsys_initcall(init_user_reserve);
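
To make the default above concrete: free_kbytes / 32 is the "3%" figure (1/32 is roughly 3.1%), capped at 1UL << 17 kB = 128MB. A toy calculation with a made-up amount of free memory (not from the patch):

#include <stdio.h>

int main(void)
{
	/* 1/32 of free memory (~3%), capped at 128MB -- mirrors
	 * init_user_reserve() with an illustrative free_kbytes. */
	unsigned long free_kbytes = 8UL << 20;	/* pretend 8GB free */
	unsigned long reserve = free_kbytes / 32;

	if (reserve > (1UL << 17))
		reserve = 1UL << 17;
	printf("user reserve default: %lu kB\n", reserve);	/* 131072 */
	return 0;
}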
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+	unsigned long free_kbytes;
+
+	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+	return 0;
+}
+subsys_initcall(init_admin_reserve);
+
+/*
+ * Reinitialise the user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+			     unsigned long action, void *data)
+{
+	unsigned long tmp, free_kbytes;
+
+	switch (action) {
+	case MEM_ONLINE:
+		/* Default max is 128MB. Leave alone if modified by operator. */
+		tmp = sysctl_user_reserve_kbytes;
+		if (0 < tmp && tmp < (1UL << 17))
+			init_user_reserve();
+
+		/* Default max is 8MB. Leave alone if modified by operator. */
+		tmp = sysctl_admin_reserve_kbytes;
+		if (0 < tmp && tmp < (1UL << 13))
+			init_admin_reserve();
+
+		break;
+	case MEM_OFFLINE:
+		free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+		if (sysctl_user_reserve_kbytes > free_kbytes) {
+			init_user_reserve();
+			pr_info("vm.user_reserve_kbytes reset to %lu\n",
+				sysctl_user_reserve_kbytes);
+		}
+
+		if (sysctl_admin_reserve_kbytes > free_kbytes) {
+			init_admin_reserve();
+			pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+				sysctl_admin_reserve_kbytes);
+		}
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+	.notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+	if (register_hotmemory_notifier(&reserve_mem_nb))
+		pr_err("Failed registering memory add/remove notifier for admin reserve\n");
+
+	return 0;
+}
+subsys_initcall(init_reserve_notifier);
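
Both knobs are exported as sysctls, so an operator can inspect or override them at runtime through /proc/sys/vm/user_reserve_kbytes and /proc/sys/vm/admin_reserve_kbytes. A small reader (sketch):

#include <stdio.h>

static void show(const char *path)
{
	unsigned long kb;
	FILE *f = fopen(path, "r");

	if (f && fscanf(f, "%lu", &kb) == 1)
		printf("%s = %lu kB\n", path, kb);
	if (f)
		fclose(f);
}

int main(void)
{
	/* Writable as root, e.g.: sysctl -w vm.user_reserve_kbytes=262144 */
	show("/proc/sys/vm/user_reserve_kbytes");
	show("/proc/sys/vm/admin_reserve_kbytes");
	return 0;
}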
