From 3c77f845722158206a7209c45ccddc264d19319c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 30 Nov 2010 20:55:34 +0100
Subject: exec: make argv/envp memory visible to oom-killer

Brad Spengler published a local memory-allocation DoS that evades the
OOM-killer (though not the virtual memory RLIMIT):

	http://www.grsecurity.net/~spender/64bit_dos.c

execve()->copy_strings() can allocate a lot of memory, but this is not
visible to the oom-killer: nobody can see the nascent bprm->mm and take
it into account.

With this patch, get_arg_page() increments current's MM_ANONPAGES counter
every time we allocate a new page for argv/envp.  When do_execve()
succeeds or fails, we change this counter back.

Technically this is not 100% correct: we can't know if the new page is
swapped out and turn MM_ANONPAGES into MM_SWAPENTS, but I don't think
this really matters, and everything becomes correct once exec changes
->mm or fails.

Reported-by: Brad Spengler
Reviewed-and-discussed-by: KOSAKI Motohiro
Signed-off-by: Oleg Nesterov
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds
---
 fs/exec.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e..4303b9035fe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,6 +164,25 @@ out:
 
 #ifdef CONFIG_MMU
 
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+	struct mm_struct *mm = current->mm;
+	long diff = (long)(pages - bprm->vma_pages);
+
+	if (!mm || !diff)
+		return;
+
+	bprm->vma_pages = pages;
+
+#ifdef SPLIT_RSS_COUNTING
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+#else
+	spin_lock(&mm->page_table_lock);
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+	spin_unlock(&mm->page_table_lock);
+#endif
+}
+
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 		struct rlimit *rlim;
 
+		acct_arg_size(bprm, size / PAGE_SIZE);
+
 		/*
 		 * We've historically supported up to 32 pages (ARG_MAX)
 		 * of argument strings even with small stacks
@@ -276,6 +297,10 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 
 #else
 
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
@@ -1003,6 +1028,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	/*
 	 * Release all of the old mmap stuff
 	 */
+	acct_arg_size(bprm, 0);
 	retval = exec_mmap(bprm->mm);
 	if (retval)
 		goto out;
@@ -1426,8 +1452,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
-		mmput (bprm->mm);
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
+		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {
--
cgit v1.2.3-70-g09d2
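For context, a minimal userspace sketch of the allocation pattern the commit describes. This is not Brad Spengler's published 64bit_dos.c; the NARGS/ARGLEN sizes and the /bin/true target are illustrative assumptions. It simply hands a large argv to execve(): before the patch the pages that copy_strings() pins for these strings live only in the nascent bprm->mm and are invisible to the oom-killer, after it they are charged to the caller's MM_ANONPAGES counter.

/*
 * Illustrative only: force copy_strings() to allocate many argv pages.
 * With default limits this execv() eventually fails with E2BIG; the
 * point is the memory the kernel pins for the strings in the meantime.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	enum { NARGS = 1024, ARGLEN = 64 * 1024 };	/* assumed sizes */
	static char *args[NARGS + 2];
	char *big = malloc(ARGLEN);
	int i;

	if (!big)
		return 1;
	memset(big, 'A', ARGLEN - 1);
	big[ARGLEN - 1] = '\0';

	args[0] = "/bin/true";			/* assumed target binary */
	for (i = 1; i <= NARGS; i++)
		args[i] = big;			/* ~64MB of argument strings */
	args[NARGS + 1] = NULL;

	execv(args[0], args);
	perror("execv");			/* expect E2BIG */
	return 1;
}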
From 114279be2120a916e8a04feeb2ac976a10016f2f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 30 Nov 2010 20:56:02 +0100
Subject: exec: copy-and-paste the fixes into compat_do_execve() paths

Note: this patch targets 2.6.37 and tries to be as simple as possible.
That is why it adds more copy-and-paste horror into fs/compat.c and
uglifies fs/exec.c; this will be cleaned up later.

compat_copy_strings() plays with bprm->vma/mm directly and thus has two
problems: it lacks the RLIMIT_STACK check, and argv/envp memory is not
visible to the oom killer.

Export acct_arg_size() and get_arg_page(), change compat_copy_strings()
to use get_arg_page(), and change compat_do_execve() to do
acct_arg_size(0) as do_execve() does.

Add the fatal_signal_pending/cond_resched checks into compat_count() and
compat_copy_strings(); this matches the code in fs/exec.c and certainly
makes sense.

Signed-off-by: Oleg Nesterov
Cc: KOSAKI Motohiro
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds
---
 fs/compat.c             | 28 +++++++++++++++-------------
 fs/exec.c               |  8 ++++----
 include/linux/binfmts.h |  4 ++++
 3 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6..eb1740ac8c0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
 			argv++;
 			if (i++ >= max)
 				return -E2BIG;
+
+			if (fatal_signal_pending(current))
+				return -ERESTARTNOHAND;
+			cond_resched();
 		}
 	}
 	return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
 		while (len > 0) {
 			int offset, bytes_to_copy;
 
+			if (fatal_signal_pending(current)) {
+				ret = -ERESTARTNOHAND;
+				goto out;
+			}
+			cond_resched();
+
 			offset = pos % PAGE_SIZE;
 			if (offset == 0)
 				offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
 			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 				struct page *page;
 
-#ifdef CONFIG_STACK_GROWSUP
-				ret = expand_stack_downwards(bprm->vma, pos);
-				if (ret < 0) {
-					/* We've exceed the stack rlimit. */
-					ret = -E2BIG;
-					goto out;
-				}
-#endif
-				ret = get_user_pages(current, bprm->mm, pos,
-							1, 1, 1, &page, NULL);
-				if (ret <= 0) {
-					/* We've exceed the stack rlimit. */
+				page = get_arg_page(bprm, pos, 1);
+				if (!page) {
 					ret = -E2BIG;
 					goto out;
 				}
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
 		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {
diff --git a/fs/exec.c b/fs/exec.c
index 4303b9035fe..d68c378a313 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -183,7 +183,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 #endif
 }
 
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -297,11 +297,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 
 #else
 
-static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7c87796d20d..64a7114a939 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -60,6 +60,10 @@ struct linux_binprm{
 	unsigned long loader, exec;
 };
 
+extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
+extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+				 int write);
+
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
--
cgit v1.2.3-70-g09d2
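To illustrate the RLIMIT_STACK side of the fix, a hedged userspace sketch (the 256k rlimit, 60k strings and /bin/true target are assumptions). As the quoted get_arg_page() code shows, the argv+envp strings may use at most the larger of ARG_MAX (32 pages) and a quarter of RLIMIT_STACK, so shrinking the stack rlimit makes an otherwise ordinary argument list fail with E2BIG; running this against a 32-bit target binary would exercise the compat_do_execve()/compat_copy_strings() path this patch converts to get_arg_page().

/*
 * Illustrative only: shrink RLIMIT_STACK, then exec with ~240k of argv
 * data.  240k exceeds both ARG_MAX (128k) and 256k/4, so the execv()
 * is expected to fail with E2BIG; with the default 8MB rlimit it would
 * succeed.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	struct rlimit rl = { 256 * 1024, 256 * 1024 };	/* assumed 256k stack rlimit */
	static char big[60 * 1024];			/* one 60k argument string */
	char *argv[] = { "/bin/true", big, big, big, big, NULL };

	memset(big, 'A', sizeof(big) - 1);		/* ~240k of argv data in total */

	if (setrlimit(RLIMIT_STACK, &rl))
		perror("setrlimit");

	execv(argv[0], argv);
	printf("execv: %s\n", strerror(errno));		/* expect E2BIG */
	return 1;
}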
From 462e635e5b73ba9a4c03913b77138cd57ce4b050 Mon Sep 17 00:00:00 2001
From: Tavis Ormandy
Date: Thu, 9 Dec 2010 15:29:42 +0100
Subject: install_special_mapping skips security_file_mmap check.

The install_special_mapping routine (used, for example, to setup the
vdso) skips the security check before insert_vm_struct, allowing a local
attacker to bypass the mmap_min_addr security restriction by limiting
the available pages for special mappings.

bprm_mm_init() also skips the check, and although I don't think this can
be used to bypass any restrictions, I don't see any reason not to have
the security check.

  $ uname -m
  x86_64
  $ cat /proc/sys/vm/mmap_min_addr
  65536
  $ cat install_special_mapping.s
  section .bss
          resb BSS_SIZE
  section .text
          global _start
          _start:
                  mov eax, __NR_pause
                  int 0x80
  $ nasm -D__NR_pause=29 -DBSS_SIZE=0xfffed000 -f elf -o install_special_mapping.o install_special_mapping.s
  $ ld -m elf_i386 -Ttext=0x10000 -Tbss=0x11000 -o install_special_mapping install_special_mapping.o
  $ ./install_special_mapping &
  [1] 14303
  $ cat /proc/14303/maps
  0000f000-00010000 r-xp 00000000 00:00 0          [vdso]
  00010000-00011000 r-xp 00001000 00:19 2453665    /home/taviso/install_special_mapping
  00011000-ffffe000 rwxp 00000000 00:00 0          [stack]

It's worth noting that Red Hat are shipping with mmap_min_addr set to
4096.

Signed-off-by: Tavis Ormandy
Acked-by: Kees Cook
Acked-by: Robert Swiecki
[ Changed to not drop the error code - akpm ]
Reviewed-by: James Morris
Signed-off-by: Linus Torvalds
---
 fs/exec.c |  5 +++++
 mm/mmap.c | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index d68c378a313..c62efcb959c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -275,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+	if (err)
+		goto err;
+
 	err = insert_vm_struct(mm, vma);
 	if (err)
 		goto err;
diff --git a/mm/mmap.c b/mm/mmap.c
index b179abb1474..50a4aa0255a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2462,6 +2462,7 @@ int install_special_mapping(struct mm_struct *mm,
 			    unsigned long addr, unsigned long len,
 			    unsigned long vm_flags, struct page **pages)
 {
+	int ret;
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2479,16 +2480,23 @@ int install_special_mapping(struct mm_struct *mm,
 	vma->vm_ops = &special_mapping_vmops;
 	vma->vm_private_data = pages;
 
-	if (unlikely(insert_vm_struct(mm, vma))) {
-		kmem_cache_free(vm_area_cachep, vma);
-		return -ENOMEM;
-	}
+	ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+	if (ret)
+		goto out;
+
+	ret = insert_vm_struct(mm, vma);
+	if (ret)
+		goto out;
 
 	mm->total_vm += len >> PAGE_SHIFT;
 
 	perf_event_mmap(vma);
 
 	return 0;
+
+out:
+	kmem_cache_free(vm_area_cachep, vma);
+	return ret;
 }
 
 static DEFINE_MUTEX(mm_all_locks_mutex);
--
cgit v1.2.3-70-g09d2
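For reference, a small userspace sketch of the restriction being bypassed (the 4096 address is an illustrative choice below the usual 64k default): an ordinary unprivileged mmap() below vm.mmap_min_addr is refused by the security_file_mmap() check that install_special_mapping() and __bprm_mm_init() were skipping before this patch.

/*
 * Illustrative only: an unprivileged MAP_FIXED mapping below
 * vm.mmap_min_addr is rejected on the regular mmap() path.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap((void *)4096, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	if (p == MAP_FAILED)
		printf("mmap at 4096: %s\n", strerror(errno));	/* expect EPERM */
	else
		printf("mapped low page at %p\n", p);
	return 0;
}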