aboutsummaryrefslogtreecommitdiff
path: root/kernel/exit.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/exit.c')
-rw-r--r--kernel/exit.c713
1 files changed, 278 insertions, 435 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 21aa7b3001f..e5c4668f179 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -51,6 +51,8 @@
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
+#include <linux/writeback.h>
+#include <linux/shm.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -69,9 +71,10 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
- __get_cpu_var(process_counts)--;
+ __this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
+ list_del_rcu(&p->thread_node);
}
/*
@@ -83,9 +86,9 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
- rcu_read_lock_held() ||
lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
@@ -122,9 +125,10 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, tsk->utime);
- sig->stime = cputime_add(sig->stime, tsk->stime);
- sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -169,7 +173,6 @@ void release_task(struct task_struct * p)
struct task_struct *leader;
int zap_leader;
repeat:
- tracehook_prepare_release_task(p);
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
@@ -179,7 +182,7 @@ repeat:
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- tracehook_finish_release_task(p);
+ ptrace_release_task(p);
__exit_signal(p);
/*
@@ -190,22 +193,12 @@ repeat:
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
- BUG_ON(task_detached(leader));
- do_notify_parent(leader, leader->exit_signal);
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
- *
- * do_notify_parent() will have marked it self-reaping in
- * that case.
- */
- zap_leader = task_detached(leader);
-
- /*
- * This maintains the invariant that release_task()
- * only runs on a task in EXIT_DEAD, just for sanity.
*/
+ zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
@@ -277,18 +270,16 @@ int is_current_pgrp_orphaned(void)
return retval;
}
-static int has_stopped_jobs(struct pid *pgrp)
+static bool has_stopped_jobs(struct pid *pgrp)
{
- int retval = 0;
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (!task_is_stopped(p))
- continue;
- retval = 1;
- break;
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return retval;
+
+ return false;
}
/*
@@ -322,268 +313,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
- write_lock_irq(&tasklist_lock);
-
- ptrace_unlink(current);
- /* Reparent to init */
- current->real_parent = current->parent = kthreadd_task;
- list_move_tail(&current->sibling, &current->real_parent->children);
-
- /* Set the exit signal to SIGCHLD so we signal init on exit */
- current->exit_signal = SIGCHLD;
-
- if (task_nice(current) < 0)
- set_user_nice(current, 0);
- /* cpus_allowed? */
- /* rt_priority? */
- /* signals? */
- memcpy(current->signal->rlim, init_task.signal->rlim,
- sizeof(current->signal->rlim));
-
- atomic_inc(&init_cred.usage);
- commit_creds(&init_cred);
- write_unlock_irq(&tasklist_lock);
-}
-
-void __set_special_pids(struct pid *pid)
-{
- struct task_struct *curr = current->group_leader;
-
- if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
-
- if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
-}
-
-static void set_special_pids(struct pid *pid)
-{
- write_lock_irq(&tasklist_lock);
- __set_special_pids(pid);
- write_unlock_irq(&tasklist_lock);
-}
-
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- /* This is only needed for daemonize()'ed kthreads */
- sigdelset(&current->blocked, sig);
- /*
- * Kernel threads handle their own signals. Let the signal code
- * know it'll be handled, so that they don't get converted to
- * SIGKILL or just silently dropped.
- */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
+#ifdef CONFIG_MEMCG
/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
+ * A task is exiting. If it owned this mm, find a new owner for the mm.
*/
-
-void daemonize(const char *name, ...)
-{
- va_list args;
- sigset_t blocked;
-
- va_start(args, name);
- vsnprintf(current->comm, sizeof(current->comm), name, args);
- va_end(args);
-
- /*
- * If we were started as result of loading a module, close all of the
- * user space pages. We don't need them, and if we didn't close them
- * they would be locked into memory.
- */
- exit_mm(current);
- /*
- * We don't want to have TIF_FREEZE set if the system-wide hibernation
- * or suspend transition begins right now.
- */
- current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
- if (current->nsproxy != &init_nsproxy) {
- get_nsproxy(&init_nsproxy);
- switch_task_namespaces(current, &init_nsproxy);
- }
- set_special_pids(&init_struct_pid);
- proc_clear_tty(current);
-
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* Become as one with the init task */
-
- daemonize_fs_struct();
- exit_files(current);
- current->files = init_task.files;
- atomic_inc(&current->files->count);
-
- reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
-static void close_files(struct files_struct * files)
+void mm_update_next_owner(struct mm_struct *mm)
{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
+ struct task_struct *c, *g, *p = current;
+retry:
/*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock because this is the last reference to the
- * files structure. But use RCU to shut RCU-lockdep up.
+ * If the exiting or execing task is not the owner, it's
+ * someone else's problem.
*/
- rcu_read_lock();
- fdt = files_fdtable(files);
- rcu_read_unlock();
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file) {
- filp_close(file, files);
- cond_resched();
- }
- }
- i++;
- set >>= 1;
- }
- }
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
- struct fdtable *fdt;
-
- if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /*
- * Free the fd and fdset arrays if we expanded them.
- * If the fdtable was embedded, pass files for freeing
- * at the end of the RCU grace period. Otherwise,
- * you can free files immediately.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- if (fdt != &files->fdtab)
- kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
- rcu_read_unlock();
- }
-}
-
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
- struct files_struct * files = tsk->files;
-
- if (files) {
- task_lock(tsk);
- tsk->files = NULL;
- task_unlock(tsk);
- put_files_struct(files);
- }
-}
-
-#ifdef CONFIG_MM_OWNER
-/*
- * Task p is exiting and it owned mm, lets find a new owner for it
- */
-static inline int
-mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
-{
+ if (mm->owner != p)
+ return;
/*
- * If there are other users of the mm and the owner (us) is exiting
- * we need to find a new owner to take on the responsibility.
+ * The current owner is exiting/execing and there are no other
+ * candidates. Do not leave the mm pointing to a possibly
+ * freed task structure.
*/
- if (atomic_read(&mm->mm_users) <= 1)
- return 0;
- if (mm->owner != p)
- return 0;
- return 1;
-}
-
-void mm_update_next_owner(struct mm_struct *mm)
-{
- struct task_struct *c, *g, *p = current;
-
-retry:
- if (!mm_need_new_owner(mm, p))
+ if (atomic_read(&mm->mm_users) <= 1) {
+ mm->owner = NULL;
return;
+ }
read_lock(&tasklist_lock);
/*
@@ -603,14 +356,18 @@ retry:
}
/*
- * Search through everything else. We should not get
- * here often
+ * Search through everything else, we should not get here often.
*/
- do_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- } while_each_thread(g, c);
-
+ for_each_process(g) {
+ if (g->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ if (c->mm)
+ break;
+ }
+ }
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
@@ -642,7 +399,7 @@ assign_new_owner:
task_unlock(c);
put_task_struct(c);
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
@@ -656,6 +413,7 @@ static void exit_mm(struct task_struct * tsk)
mm_release(tsk, mm);
if (!mm)
return;
+ sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
@@ -682,7 +440,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
@@ -694,21 +452,17 @@ static void exit_mm(struct task_struct * tsk)
tsk->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
- /* We don't want this task to be frozen prematurely */
- clear_freeze_flag(tsk);
- if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- atomic_dec(&mm->oom_disable_count);
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
}
/*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ * child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
__releases(&tasklist_lock)
@@ -728,17 +482,37 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
if (unlikely(pid_ns->child_reaper == father)) {
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns))
- panic("Attempted to kill init!");
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?:
+ father->exit_code);
+ }
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
+ } else if (father->signal->has_child_subreaper) {
+ struct task_struct *reaper;
+
/*
- * We can not clear ->child_reaper or leave it alone.
- * There may by stealth EXIT_DEAD tasks on ->children,
- * forget_original_parent() must move them somewhere.
+ * Find the first ancestor marked as child_subreaper.
+ * Note that the code below checks same_thread_group(reaper,
+ * pid_ns->child_reaper). This is what we need to DTRT in a
+ * PID namespace. However we still need the check above, see
+ * http://marc.info/?l=linux-kernel&m=131385460420380
*/
- pid_ns->child_reaper = init_pid_ns.child_reaper;
+ for (reaper = father->real_parent;
+ reaper != &init_task;
+ reaper = reaper->real_parent) {
+ if (same_thread_group(reaper, pid_ns->child_reaper))
+ break;
+ if (!reaper->signal->is_child_subreaper)
+ continue;
+ thread = reaper;
+ do {
+ if (!(thread->flags & PF_EXITING))
+ return reaper;
+ } while_each_thread(reaper, thread);
+ }
}
return pid_ns->child_reaper;
@@ -752,7 +526,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
{
list_move_tail(&p->sibling, &p->real_parent->children);
- if (task_detached(p))
+ if (p->exit_state == EXIT_DEAD)
return;
/*
* If this is a threaded reparent there is no need to
@@ -761,14 +535,13 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (same_thread_group(p->real_parent, father))
return;
- /* We don't want people slaying init. */
+ /* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
- if (!task_ptrace(p) &&
+ if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
- do_notify_parent(p, p->exit_signal);
- if (task_detached(p)) {
+ if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
@@ -795,7 +568,7 @@ static void forget_original_parent(struct task_struct *father)
do {
t->real_parent = reaper;
if (t->parent == father) {
- BUG_ON(task_ptrace(t));
+ BUG_ON(t->ptrace);
t->parent = t->real_parent;
}
if (t->pdeath_signal)
@@ -820,8 +593,7 @@ static void forget_original_parent(struct task_struct *father)
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
- int signal;
- void *cookie;
+ bool autoreap;
/*
* This does two things:
@@ -832,46 +604,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
forget_original_parent(tsk);
- exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
- /* Let father know we died
- *
- * Thread signals are configurable, but you aren't going to use
- * that to send signals to arbitary processes.
- * That stops right now.
- *
- * If the parent exec id doesn't match the exec id we saved
- * when we started then we know the parent has changed security
- * domain.
- *
- * If our self_exec id doesn't match our parent_exec_id then
- * we have changed execution domain as these two values started
- * the same after a fork.
- */
- if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
- (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id))
- tsk->exit_signal = SIGCHLD;
-
- signal = tracehook_notify_death(tsk, &cookie, group_dead);
- if (signal >= 0)
- signal = do_notify_parent(tsk, signal);
+ if (unlikely(tsk->ptrace)) {
+ int sig = thread_group_leader(tsk) &&
+ thread_group_empty(tsk) &&
+ !ptrace_reparented(tsk) ?
+ tsk->exit_signal : SIGCHLD;
+ autoreap = do_notify_parent(tsk, sig);
+ } else if (thread_group_leader(tsk)) {
+ autoreap = thread_group_empty(tsk) &&
+ do_notify_parent(tsk, tsk->exit_signal);
+ } else {
+ autoreap = true;
+ }
- tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+ tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- tracehook_report_death(tsk, signal, cookie, group_dead);
-
/* If the process is dead, release it - nobody will wait for it */
- if (signal == DEATH_REAP)
+ if (autoreap)
release_task(tsk);
}
@@ -889,9 +648,9 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
- "left\n",
- current->comm, free);
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -900,21 +659,30 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-NORET_TYPE void do_exit(long code)
+void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
profile_task_exit(tsk);
- WARN_ON(atomic_read(&tsk->fs_excl));
+ WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
- tracehook_report_exit(&code);
+ /*
+ * If do_exit is called because this processes oopsed, it's possible
+ * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+ * continuing. Amongst other possible reasons, this is to prevent
+ * mm_release()->clear_child_tid() from writing to a user-controlled
+ * kernel address.
+ */
+ set_fs(USER_DS);
+
+ ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -939,8 +707,6 @@ NORET_TYPE void do_exit(long code)
schedule();
}
- exit_irq_thread();
-
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
@@ -957,7 +723,7 @@ NORET_TYPE void do_exit(long code)
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
- sync_mm_rss(tsk, tsk->mm);
+ sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
@@ -968,8 +734,7 @@ NORET_TYPE void do_exit(long code)
acct_collect(code, group_dead);
if (group_dead)
tty_audit_exit();
- if (unlikely(tsk->audit_context))
- audit_free(tsk);
+ audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
@@ -981,30 +746,34 @@ NORET_TYPE void do_exit(long code)
trace_sched_process_exit(tsk);
exit_sem(tsk);
+ exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
- check_stack_usage();
- exit_thread();
- cgroup_exit(tsk, 1);
-
if (group_dead)
disassociate_ctty(1);
+ exit_task_namespaces(tsk);
+ exit_task_work(tsk);
+ exit_thread();
- module_put(task_thread_info(tsk)->exec_domain->module);
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ *
+ * because of cgroup mode, must be called before cgroup_exit()
+ */
+ perf_event_exit_task(tsk);
- proc_exit_connector(tsk);
+ cgroup_exit(tsk);
+
+ module_put(task_thread_info(tsk)->exec_domain->module);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_event_exit_task(tsk);
exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
@@ -1018,7 +787,7 @@ NORET_TYPE void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held(tsk);
+ debug_check_no_locks_held();
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
@@ -1030,14 +799,37 @@ NORET_TYPE void do_exit(long code)
exit_io_context(tsk);
if (tsk->splice_pipe)
- __free_pipe_info(tsk->splice_pipe);
+ free_pipe_info(tsk->splice_pipe);
+
+ if (tsk->task_frag.page)
+ put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ check_stack_usage();
preempt_disable();
+ if (tsk->nr_dirtied)
+ __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
+ tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
schedule();
BUG();
/* Avoid "noreturn function does return". */
@@ -1047,7 +839,7 @@ NORET_TYPE void do_exit(long code)
EXPORT_SYMBOL_GPL(do_exit);
-NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
@@ -1066,7 +858,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-NORET_TYPE void
+void
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
@@ -1187,7 +979,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
- uid_t uid = __task_cred(p)->uid;
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
if (!likely(wo->wo_flags & WEXITED))
@@ -1209,22 +1001,18 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
+ traced = ptrace_reparented(p);
/*
- * Try to move the task's state to DEAD
- * only one thread is allowed to do this:
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = xchg(&p->exit_state, EXIT_DEAD);
- if (state != EXIT_ZOMBIE) {
- BUG_ON(state != EXIT_DEAD);
+ state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
- }
-
- traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
- * !task_detached() to filter out sub-threads.
+ * thread_group_leader() to filter out sub-threads.
*/
- if (likely(!traced) && likely(!task_detached(p))) {
+ if (likely(!traced) && thread_group_leader(p)) {
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
@@ -1245,27 +1033,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_times() to get times for the thread
+ * We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
- psig->cutime =
- cputime_add(psig->cutime,
- cputime_add(tgutime,
- sig->cutime));
- psig->cstime =
- cputime_add(psig->cstime,
- cputime_add(tgstime,
- sig->cstime));
- psig->cgtime =
- cputime_add(psig->cgtime,
- cputime_add(p->gtime,
- cputime_add(sig->gtime,
- sig->cgtime)));
+ psig->cutime += tgutime + sig->cutime;
+ psig->cstime += tgstime + sig->cstime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
@@ -1290,7 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to EXIT_DEAD.
+ * thread can reap it because we its state == DEAD/TRACE.
*/
read_unlock(&tasklist_lock);
@@ -1327,25 +1105,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (!retval)
retval = pid;
- if (traced) {
+ if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
- /*
- * If this is not a detached task, notify the parent.
- * If it's still not detached after that, don't release
- * it now.
- */
- if (!task_detached(p)) {
- do_notify_parent(p, p->exit_signal);
- if (!task_detached(p)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
- }
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
- if (p != NULL)
+ if (state == EXIT_DEAD)
release_task(p);
return retval;
@@ -1354,7 +1126,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p))
+ if (task_is_stopped_or_traced(p) &&
+ !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1363,11 +1136,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
return NULL;
}
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
@@ -1383,6 +1168,9 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
+ if (!task_stopped_code(p, ptrace))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
@@ -1397,7 +1185,7 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
@@ -1470,7 +1258,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
@@ -1506,7 +1294,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
- int ret = eligible_child(wo, p);
+ int ret;
+
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1524,33 +1317,88 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ if (unlikely(p->exit_state == EXIT_TRACE)) {
/*
- * This child is hidden by ptrace.
- * We aren't allowed to see it now, but eventually we will.
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
*/
- wo->notask_error = 0;
+ if (likely(!ptrace))
+ wo->notask_error = 0;
return 0;
}
- if (p->exit_state == EXIT_DEAD)
- return 0;
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
+ /*
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
+ */
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /* we don't reap group leaders with subthreads */
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
+
+ /*
+ * Allow access to stopped/continued state via zombie by
+ * falling through. Clearing of notask_error is complex.
+ *
+ * When !@ptrace:
+ *
+ * If WEXITED is set, notask_error should naturally be
+ * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+ * so, if there are live subthreads, there are events to
+ * wait for. If all subthreads are dead, it's still safe
+ * to clear - this function will be called again in finite
+ * amount time once all the subthreads are released and
+ * will then return without clearing.
+ *
+ * When @ptrace:
+ *
+ * Stopped state is per-task and thus can't change once the
+ * target task dies. Only continued and exited can happen.
+ * Clear notask_error if WCONTINUED | WEXITED.
+ */
+ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+ wo->notask_error = 0;
+ } else {
+ /*
+ * @p is alive and it's gonna stop, continue or exit, so
+ * there always is something to wait for.
+ */
+ wo->notask_error = 0;
+ }
/*
- * We don't reap group leaders with subthreads.
+ * Wait for stopped. Depending on @ptrace, different stopped state
+ * is used and the two don't interact with each other.
*/
- if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ ret = wait_task_stopped(wo, ptrace, p);
+ if (ret)
+ return ret;
/*
- * It's stopped or running now, so it might
- * later continue, exit, or stop again.
+ * Wait for continued. There's only one continued state and the
+ * ptracer can consume it which can confuse the real parent. Don't
+ * use WCONTINUED from ptracer. You don't need or want it.
*/
- wo->notask_error = 0;
-
- if (task_stopped_code(p, ptrace))
- return wait_task_stopped(wo, ptrace, p);
-
return wait_task_continued(wo, p);
}
@@ -1730,9 +1578,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1770,8 +1615,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}