aboutsummaryrefslogtreecommitdiff
path: root/mm/mmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--mm/mmap.c688
1 files changed, 462 insertions, 226 deletions
diff --git a/mm/mmap.c b/mm/mmap.c
index 35730ee9d51..129b847d30c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,9 +6,13 @@
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -32,6 +36,10 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
+#include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -82,7 +90,10 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
@@ -121,7 +132,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
@@ -143,7 +154,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
free -= global_page_state(NR_SHMEM);
- free += nr_swap_pages;
+ free += get_nr_swap_pages();
/*
* Any slabs which are created with the
@@ -162,10 +173,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- free -= free / 32;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
@@ -173,19 +184,20 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = (totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
- allowed += total_swap_pages;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- if (mm)
- allowed -= mm->total_vm / 32;
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
@@ -202,7 +214,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
+ atomic_inc(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable--;
@@ -255,6 +267,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
+ bool populate;
down_write(&mm->mmap_sem);
@@ -304,8 +317,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/* Ok, looks good - let it rip. */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
+
set_brk:
mm->brk = brk;
+ populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(oldbrk, newbrk - oldbrk);
+ return brk;
+
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
@@ -344,20 +364,20 @@ static int browse_rb(struct rb_root *root)
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- printk("vm_end %lx < vm_start %lx\n",
+ pr_info("vm_end %lx < vm_start %lx\n",
vma->vm_end, vma->vm_start);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- printk("free gap %lx, correct %lx\n",
+ pr_info("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
@@ -371,7 +391,7 @@ static int browse_rb(struct rb_root *root)
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
- printk("backwards %d, forwards %d\n", j, i);
+ pr_info("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
@@ -389,7 +409,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
}
}
-void validate_mm(struct mm_struct *mm)
+static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
@@ -406,17 +426,17 @@ void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- printk("map_count %d vm_next %d\n", mm->map_count, i);
+ pr_info("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
- printk("mm->highest_vm_end %lx, found %lx\n",
+ pr_info("mm->highest_vm_end %lx, found %lx\n",
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- printk("map_count %d rb %d\n", mm->map_count, i);
+ pr_info("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
BUG_ON(bug);
@@ -534,6 +554,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long nr_pages = 0;
+ struct vm_area_struct *vma;
+
+ /* Find first overlaping mapping */
+ vma = find_vma_intersection(mm, addr, end);
+ if (!vma)
+ return 0;
+
+ nr_pages = (min(end, vma->vm_end) -
+ max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+ /* Iterate over the rest of the overlaps */
+ for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+ unsigned long overlap_len;
+
+ if (vma->vm_start > end)
+ break;
+
+ overlap_len = min(end, vma->vm_end) - vma->vm_start;
+ nr_pages += overlap_len >> PAGE_SHIFT;
+ }
+
+ return nr_pages;
+}
+
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
@@ -567,7 +615,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
+ atomic_dec(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
@@ -595,11 +643,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct address_space *mapping = NULL;
- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
-
- if (mapping)
mutex_lock(&mapping->i_mmap_mutex);
+ }
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
@@ -637,8 +684,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -800,7 +848,7 @@ again: remove_next = 1 + (end > next->vm_end);
anon_vma_interval_tree_post_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_post_update_vma(next);
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_write(anon_vma);
}
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
@@ -850,7 +898,15 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags ^ vm_flags)
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressue on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
return 0;
if (vma->vm_file != file)
return 0;
@@ -910,7 +966,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
if (is_mergeable_vma(vma, file, vm_flags) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
- vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
return 1;
}
@@ -1039,7 +1095,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -1147,18 +1203,38 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags,
+ unsigned long len)
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff)
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate)
{
struct mm_struct * mm = current->mm;
- struct inode *inode;
vm_flags_t vm_flags;
+ *populate = 0;
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
@@ -1206,20 +1282,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
-
- inode = file ? file->f_path.dentry->d_inode : NULL;
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
if (file) {
+ struct inode *inode = file_inode(file);
+
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -1235,7 +1303,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
/*
* Make sure there are no mandatory locks on the file.
*/
- if (locks_verify_locked(inode))
+ if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
@@ -1252,8 +1320,10 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
return -ENODEV;
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
break;
default:
@@ -1262,6 +1332,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
/*
* Ignore pgoff.
*/
@@ -1279,7 +1351,26 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
}
}
- return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if (flags & MAP_NORESERVE) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
+ }
+
+ addr = mmap_region(file, addr, len, vm_flags, pgoff);
+ if (!IS_ERR_VALUE(addr) &&
+ ((vm_flags & VM_LOCKED) ||
+ (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+ *populate = len;
+ return addr;
}
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1291,20 +1382,30 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
file = fget(fd);
if (!file)
goto out;
+ if (is_file_hugepages(file))
+ len = ALIGN(len, huge_page_size(hstate_file(file)));
+ retval = -EINVAL;
+ if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ goto out_fput;
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
+ struct hstate *hs;
+
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+ if (!hs)
+ return -EINVAL;
+
+ len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1315,6 +1416,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
if (file)
fput(file);
out:
@@ -1394,16 +1496,30 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
}
unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, unsigned long flags,
- vm_flags_t vm_flags, unsigned long pgoff)
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- int correct_wcount = 0;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
- struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ if (!(vm_flags & MAP_FIXED))
+ return -ENOMEM;
+
+ nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+ if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
/* Clear old maps */
error = -ENOMEM;
@@ -1414,24 +1530,6 @@ munmap_back:
goto munmap_back;
}
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- /*
- * Set 'VM_NORESERVE' if we should not account for the
- * memory use of this mapping.
- */
- if ((flags & MAP_NORESERVE)) {
- /* We honor MAP_NORESERVE if allowed to overcommit */
- if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
- vm_flags |= VM_NORESERVE;
-
- /* hugetlb applies strict overcommit unless MAP_NORESERVE */
- if (file && is_file_hugepages(file))
- vm_flags |= VM_NORESERVE;
- }
-
/*
* Private writable mapping: check memory availability
*/
@@ -1468,16 +1566,11 @@ munmap_back:
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
- error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
-
if (file) {
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
- correct_wcount = 1;
}
vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
@@ -1494,11 +1587,8 @@ munmap_back:
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
- if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
- goto free_vma;
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
@@ -1520,29 +1610,39 @@ munmap_back:
}
vma_link(mm, vma, prev, rb_link, rb_parent);
- file = vma->vm_file;
-
/* Once vma denies write, undo our temporary denial count */
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ file = vma->vm_file;
out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- if (!mlock_vma_pages_range(vma, addr, addr + len))
+ if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
- } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
- make_pages_present(addr, addr + len);
+ else
+ vma->vm_flags &= ~VM_LOCKED;
+ }
if (file)
uprobe_mmap(vma);
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
return addr;
unmap_and_free_vma:
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
vma->vm_file = NULL;
fput(file);
@@ -1777,7 +1877,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1786,29 +1886,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
+ info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
#endif
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the lowest possible address?
- */
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
- mm->free_area_cache = addr;
-}
-
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
@@ -1825,7 +1916,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1835,14 +1926,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
@@ -1865,19 +1956,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
#endif
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the highest possible address?
- */
- if (addr > mm->free_area_cache)
- mm->free_area_cache = addr;
-
- /* dont allow allocations above current base */
- if (mm->free_area_cache > mm->mmap_base)
- mm->free_area_cache = mm->mmap_base;
-}
-
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
@@ -1894,7 +1972,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
+ if (file && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
@@ -1915,37 +1993,33 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
-
- if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
- return NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
@@ -2169,9 +2243,28 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
}
+/*
+ * Note how expand_stack() refuses to expand the stack all the way to
+ * abut the next virtual mapping, *unless* that mapping itself is also
+ * a stack mapping. We want to leave room for a guard page, after all
+ * (the guard page itself is not added here, that is done by the
+ * actual page faulting logic)
+ *
+ * This matches the behavior of the guard page logic (see mm/memory.c:
+ * check_stack_guard_page()), which only allows the guard page to be
+ * removed under these circumstances.
+ */
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
+ struct vm_area_struct *next;
+
+ address &= PAGE_MASK;
+ next = vma->vm_next;
+ if (next && next->vm_start == address + PAGE_SIZE) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ }
return expand_upwards(vma, address);
}
@@ -2186,14 +2279,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!prev || expand_stack(prev, addr))
return NULL;
- if (prev->vm_flags & VM_LOCKED) {
- mlock_vma_pages_range(prev, addr, prev->vm_end);
- }
+ if (prev->vm_flags & VM_LOCKED)
+ __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
+ struct vm_area_struct *prev;
+
+ address &= PAGE_MASK;
+ prev = vma->vm_prev;
+ if (prev && prev->vm_end == address) {
+ if (!(prev->vm_flags & VM_GROWSDOWN))
+ return -ENOMEM;
+ }
return expand_downwards(vma, address);
}
@@ -2214,9 +2314,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
- if (vma->vm_flags & VM_LOCKED) {
- mlock_vma_pages_range(vma, addr, start);
- }
+ if (vma->vm_flags & VM_LOCKED)
+ __mlock_vma_pages_range(vma, addr, start, NULL);
return vma;
}
#endif
@@ -2258,11 +2357,11 @@ static void unmap_region(struct mm_struct *mm,
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : 0);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2276,7 +2375,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
- unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
@@ -2293,12 +2391,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- if (mm->unmap_area == arch_unmap_area)
- addr = prev ? prev->vm_end : mm->mmap_base;
- else
- addr = vma ? vma->vm_start : mm->mmap_base;
- mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -2308,7 +2403,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
{
- struct mempolicy *pol;
struct vm_area_struct *new;
int err = -ENOMEM;
@@ -2332,12 +2426,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- pol = mpol_dup(vma_policy(vma));
- if (IS_ERR(pol)) {
- err = PTR_ERR(pol);
+ err = vma_dup_policy(vma, new);
+ if (err)
goto out_free_vma;
- }
- vma_set_policy(new, pol);
if (anon_vma_clone(new, vma))
goto out_free_mpol;
@@ -2365,7 +2456,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
fput(new->vm_file);
unlink_anon_vmas(new);
out_free_mpol:
- mpol_put(pol);
+ mpol_put(vma_policy(new));
out_free_vma:
kmem_cache_free(vm_area_cachep, new);
out_err:
@@ -2524,18 +2615,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (error & ~PAGE_MASK)
return error;
- /*
- * mlock MCL_FUTURE?
- */
- if (mm->def_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
/*
* mm->mmap_sem is required to protect against another thread
@@ -2589,10 +2671,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED) {
- if (!mlock_vma_pages_range(vma, addr, addr + len))
- mm->locked_vm += (len >> PAGE_SHIFT);
- }
+ if (flags & VM_LOCKED)
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ vma->vm_flags |= VM_SOFTDIRTY;
return addr;
}
@@ -2600,10 +2681,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
unsigned long ret;
+ bool populate;
down_write(&mm->mmap_sem);
ret = do_brk(addr, len);
+ populate = ((mm->def_flags & VM_LOCKED) != 0);
up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(addr, len);
return ret;
}
EXPORT_SYMBOL(vm_brk);
@@ -2635,12 +2720,12 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 1);
+ tlb_gather_mmu(&tlb, mm, 0, -1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
/*
@@ -2654,7 +2739,8 @@ void exit_mmap(struct mm_struct *mm)
}
vm_unacct_memory(nr_accounted);
- WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ WARN_ON(atomic_long_read(&mm->nr_ptes) >
+ (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
@@ -2706,7 +2792,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
- struct mempolicy *pol;
bool faulted_in_anon_vma = true;
/*
@@ -2751,10 +2836,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- pol = mpol_dup(vma_policy(vma));
- if (IS_ERR(pol))
+ if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
- vma_set_policy(new_vma, pol);
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
@@ -2769,7 +2852,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_free_mempol:
- mpol_put(pol);
+ mpol_put(vma_policy(new_vma));
out_free_vma:
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
@@ -2791,6 +2874,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
return 1;
}
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf);
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const char *special_mapping_name(struct vm_area_struct *vma)
+{
+ return ((struct vm_special_mapping *)vma->vm_private_data)->name;
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+ .name = special_mapping_name,
+};
+
+static const struct vm_operations_struct legacy_special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
static int special_mapping_fault(struct vm_area_struct *vma,
struct vm_fault *vmf)
@@ -2806,7 +2914,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
*/
pgoff = vmf->pgoff - vma->vm_pgoff;
- for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+ if (vma->vm_ops == &legacy_special_mapping_vmops)
+ pages = vma->vm_private_data;
+ else
+ pages = ((struct vm_special_mapping *)vma->vm_private_data)->
+ pages;
+
+ for (; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
@@ -2819,48 +2933,29 @@ static int special_mapping_fault(struct vm_area_struct *vma,
return VM_FAULT_SIGBUS;
}
-/*
- * Having a close hook prevents vma merging regardless of flags.
- */
-static void special_mapping_close(struct vm_area_struct *vma)
-{
-}
-
-static const struct vm_operations_struct special_mapping_vmops = {
- .close = special_mapping_close,
- .fault = special_mapping_fault,
-};
-
-/*
- * Called with mm->mmap_sem held for writing.
- * Insert a new vma covering the given region, with the given flags.
- * Its pages are supplied by the given array of struct page *.
- * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
- * The region past the last page supplied will always produce SIGBUS.
- * The array pointer and the pages it points to are assumed to stay alive
- * for as long as this mapping might exist.
- */
-int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, struct page **pages)
+static struct vm_area_struct *__install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_operations_struct *ops,
+ void *priv)
{
int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (unlikely(vma == NULL))
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- vma->vm_ops = &special_mapping_vmops;
- vma->vm_private_data = pages;
+ vma->vm_ops = ops;
+ vma->vm_private_data = priv;
ret = insert_vm_struct(mm, vma);
if (ret)
@@ -2870,11 +2965,40 @@ int install_special_mapping(struct mm_struct *mm,
perf_event_mmap(vma);
- return 0;
+ return vma;
out:
kmem_cache_free(vm_area_cachep, vma);
- return ret;
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+struct vm_area_struct *_install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_special_mapping *spec)
+{
+ return __install_special_mapping(mm, addr, len, vm_flags,
+ &special_mapping_vmops, (void *)spec);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
+{
+ struct vm_area_struct *vma = __install_special_mapping(
+ mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
+ (void *)pages);
+
+ return PTR_ERR_OR_ZERO(vma);
}
static DEFINE_MUTEX(mm_all_locks_mutex);
@@ -2943,7 +3067,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
@@ -3001,7 +3125,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
BUG();
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_write(anon_vma);
}
}
@@ -3052,3 +3176,115 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+subsys_initcall(init_user_reserve);
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+subsys_initcall(init_admin_reserve);
+
+/*
+ * Reinititalise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long tmp, free_kbytes;
+
+ switch (action) {
+ case MEM_ONLINE:
+ /* Default max is 128MB. Leave alone if modified by operator. */
+ tmp = sysctl_user_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 17))
+ init_user_reserve();
+
+ /* Default max is 8MB. Leave alone if modified by operator. */
+ tmp = sysctl_admin_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 13))
+ init_admin_reserve();
+
+ break;
+ case MEM_OFFLINE:
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ if (sysctl_user_reserve_kbytes > free_kbytes) {
+ init_user_reserve();
+ pr_info("vm.user_reserve_kbytes reset to %lu\n",
+ sysctl_user_reserve_kbytes);
+ }
+
+ if (sysctl_admin_reserve_kbytes > free_kbytes) {
+ init_admin_reserve();
+ pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+ sysctl_admin_reserve_kbytes);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+ .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+ if (register_hotmemory_notifier(&reserve_mem_nb))
+ pr_err("Failed registering memory add/remove notifier for admin reserve\n");
+
+ return 0;
+}
+subsys_initcall(init_reserve_notifier);