From c7e49c1488ab20342eaaf38f1ca35a207f4c051d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Aug 2010 18:03:07 -0700 Subject: ptrace: optimize exit_ptrace() for the likely case exit_ptrace() takes tasklist_lock unconditionally. We need this lock to avoid the race with ptrace_traceme(), it acts as a barrier. Change its caller, forget_original_parent(), to call exit_ptrace() under tasklist_lock. Change exit_ptrace() to drop and reacquire this lock if needed. This allows us to add the fastpath list_empty(ptraced) check. In the likely no-tracees case exit_ptrace() just returns and we avoid the lock() + unlock() sequence. "Zhang, Yanmin" suggested to add this check, and he reports that this change adds about 11% improvement in some tests. Suggested-and-tested-by: "Zhang, Yanmin" Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index ceffc67b564..671ed56e0a4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(dead_children); - exit_ptrace(father); - write_lock_irq(&tasklist_lock); + /* + * Note that exit_ptrace() and find_new_reaper() might + * drop tasklist_lock and reacquire it. + */ + exit_ptrace(father); reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { -- cgit v1.2.3-70-g09d2 From f362b73244fb16ea4ae127ced1467dd8adaa7733 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 17 Aug 2010 23:56:55 +0100 Subject: Fix unprotected access to task credentials in waitid() Using a program like the following: #include #include #include #include int main() { id_t id; siginfo_t infop; pid_t res; id = fork(); if (id == 0) { sleep(1); exit(0); } kill(id, SIGSTOP); alarm(1); waitid(P_PID, id, &infop, WCONTINUED); return 0; } to call waitid() on a stopped process results in access to the child task's credentials without the RCU read lock being held - which may be replaced in the meantime - eliciting the following warning: =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/exit.c:1460 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 2 locks held by waitid02/22252: #0: (tasklist_lock){.?.?..}, at: [] do_wait+0xc5/0x310 #1: (&(&sighand->siglock)->rlock){-.-...}, at: [] wait_consider_task+0x19a/0xbe0 stack backtrace: Pid: 22252, comm: waitid02 Not tainted 2.6.35-323cd+ #3 Call Trace: [] lockdep_rcu_dereference+0xa4/0xc0 [] wait_consider_task+0xaf1/0xbe0 [] do_wait+0xf5/0x310 [] sys_waitid+0x86/0x1f0 [] ? child_wait_callback+0x0/0x70 [] system_call_fastpath+0x16/0x1b This is fixed by holding the RCU read lock in wait_task_continued() to ensure that the task's current credentials aren't destroyed between us reading the cred pointer and us reading the UID from those credentials. Furthermore, protect wait_task_stopped() in the same way. We don't need to keep holding the RCU read lock once we've read the UID from the credentials as holding the RCU read lock doesn't stop the target task from changing its creds under us - so the credentials may be outdated immediately after we've read the pointer, lock or no lock. Signed-off-by: Daniel J Blueman Signed-off-by: David Howells Acked-by: Paul E. McKenney Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 671ed56e0a4..03120229db2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1386,8 +1386,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - /* don't need the RCU readlock here as we're holding a spinlock */ - uid = __task_cred(p)->uid; + uid = task_uid(p); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1460,7 +1459,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = __task_cred(p)->uid; + uid = task_uid(p); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); -- cgit v1.2.3-70-g09d2 From 4e231c7962ce711c7d8c2a4dc23ecd1e8fc28363 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2010 21:01:59 +0200 Subject: perf: Fix up delayed_put_task_struct() I missed a perf_event_ctxp user when converting it to an array. Pull this last user into perf_event.c as well and fix it up. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/exit.c | 4 +--- kernel/perf_event.c | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c1173520f14..93bf53aa50e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -889,6 +889,7 @@ extern void perf_event_task_sched_out(struct task_struct *task, struct task_stru extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); +extern void perf_event_delayed_put(struct task_struct *task); extern void set_perf_event_pending(void); extern void perf_event_do_pending(void); extern void perf_event_print_debug(void); @@ -1067,6 +1068,7 @@ perf_event_task_sched_out(struct task_struct *task, static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } +static inline void perf_event_delayed_put(struct task_struct *task) { } static inline void perf_event_do_pending(void) { } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db2..e2bdf37f9fd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_EVENTS - WARN_ON_ONCE(tsk->perf_event_ctxp); -#endif + perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9819a69a61a..eaf1c5de6dc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5893,6 +5893,14 @@ again: } } +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + /* * inherit a event from parent task to child task: */ -- cgit v1.2.3-70-g09d2 From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. [rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++++ fs/proc/base.c | 30 ++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/exit.c | 3 +++ kernel/fork.c | 15 ++++++++++++++- 5 files changed, 54 insertions(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f93685..3aa75b8888a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -759,6 +760,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f51f3f..6e50c8e6551 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); + } + /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } task->signal->oom_score_adj = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is @@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); return count; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cb57d657ce4..bb7288a782f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,8 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif + /* How many tasks sharing this mm are OOM_DISABLE */ + atomic_t oom_disable_count; }; /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fd..894179a32ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408..e87aaaaf513 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } -- cgit v1.2.3-70-g09d2 From d16e15f5b029fc7d03540ba0e5fb23b0abb0ebe0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:10 -0700 Subject: exit: add lock context annotation on find_new_reaper() find_new_reaper() releases and regrabs tasklist_lock but was missing proper annotations. Add it. This remove following sparse warning: warning: context imbalance in 'find_new_reaper' - unexpected unlock Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 894179a32ec..b194febf579 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -703,6 +703,8 @@ static void exit_mm(struct task_struct * tsk) * space. */ static struct task_struct *find_new_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; -- cgit v1.2.3-70-g09d2 From e0a70217107e6f9844628120412cb27bb4cea194 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 5 Nov 2010 16:53:42 +0100 Subject: posix-cpu-timers: workaround to suppress the problems with mt exec posix-cpu-timers.c correctly assumes that the dying process does posix_cpu_timers_exit_group() and removes all !CPUCLOCK_PERTHREAD timers from signal->cpu_timers list. But, it also assumes that timer->it.cpu.task is always the group leader, and thus the dead ->task means the dead thread group. This is obviously not true after de_thread() changes the leader. After that almost every posix_cpu_timer_ method has problems. It is not simple to fix this bug correctly. First of all, I think that timer->it.cpu should use struct pid instead of task_struct. Also, the locking should be reworked completely. In particular, tasklist_lock should not be used at all. This all needs a lot of nontrivial and hard-to-test changes. Change __exit_signal() to do posix_cpu_timers_exit_group() when the old leader dies during exec. This is not the fix, just the temporary hack to hide the problem for 2.6.37 and stable. IOW, this is obviously wrong but this is what we currently have anyway: cpu timers do not work after mt exec. In theory this change adds another race. The exiting leader can detach the timers which were attached to the new leader. However, the window between de_thread() and release_task() is small, we can pretend that sys_timer_create() was called before de_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index b194febf579..21aa7b3001f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk) tty = sig->tty; sig->tty = NULL; } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + /* * If there is any task waiting for the group exit * then notify it: -- cgit v1.2.3-70-g09d2 From 33dd94ae1ccbfb7bf0fb6c692bc3d1c4269e6177 Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Thu, 2 Dec 2010 14:31:21 -0800 Subject: do_exit(): make sure that we run with get_fs() == USER_DS If a user manages to trigger an oops with fs set to KERNEL_DS, fs is not otherwise reset before do_exit(). do_exit may later (via mm_release in fork.c) do a put_user to a user-controlled address, potentially allowing a user to leverage an oops into a controlled write into kernel memory. This is only triggerable in the presence of another bug, but this potentially turns a lot of DoS bugs into privilege escalations, so it's worth fixing. I have proof-of-concept code which uses this bug along with CVE-2010-3849 to write a zero to an arbitrary kernel address, so I've tested that this is not theoretical. A more logical place to put this fix might be when we know an oops has occurred, before we call do_exit(), but that would involve changing every architecture, in multiple places. Let's just stick it in do_exit instead. [akpm@linux-foundation.org: update code comment] Signed-off-by: Nelson Elhage Cc: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 21aa7b3001f..676149a4ac5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* + * If do_exit is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. + */ + set_fs(USER_DS); + tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); -- cgit v1.2.3-70-g09d2