Diffstat (limited to 'kernel/exit.c')
| -rw-r--r-- | kernel/exit.c | 424 |
1 file changed, 86 insertions, 338 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc33451..e5c4668f179 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/freezer.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
 #include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
 #include <linux/mempolicy.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
-#include <linux/freezer.h>
 #include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -72,20 +72,9 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
-		/*
-		 * If we are the last child process in a pid namespace to be
-		 * reaped, notify the reaper sleeping zap_pid_ns_processes().
-		 */
-		if (IS_ENABLED(CONFIG_PID_NS)) {
-			struct task_struct *parent = p->real_parent;
-
-			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
-			    list_empty(&parent->children) &&
-			    (parent->flags & PF_EXITING))
-				wake_up_process(parent);
-		}
 	}
 	list_del_rcu(&p->thread_group);
+	list_del_rcu(&p->thread_node);
 }
 
 /*
@@ -97,6 +86,7 @@ static void __exit_signal(struct task_struct *tsk)
 	bool group_dead = thread_group_leader(tsk);
 	struct sighand_struct *sighand;
 	struct tty_struct *uninitialized_var(tty);
+	cputime_t utime, stime;
 
 	sighand = rcu_dereference_check(tsk->sighand,
 					lockdep_tasklist_lock_is_held());
@@ -135,9 +125,10 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime += tsk->utime;
-		sig->stime += tsk->stime;
-		sig->gtime += tsk->gtime;
+		task_cputime(tsk, &utime, &stime);
+		sig->utime += utime;
+		sig->stime += stime;
+		sig->gtime += task_gtime(tsk);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
@@ -322,244 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
-	write_lock_irq(&tasklist_lock);
-
-	ptrace_unlink(current);
-	/* Reparent to init */
-	current->real_parent = current->parent = kthreadd_task;
-	list_move_tail(&current->sibling, &current->real_parent->children);
-
-	/* Set the exit signal to SIGCHLD so we signal init on exit */
-	current->exit_signal = SIGCHLD;
-
-	if (task_nice(current) < 0)
-		set_user_nice(current, 0);
-	/* cpus_allowed? */
-	/* rt_priority? */
-	/* signals? */
-	memcpy(current->signal->rlim, init_task.signal->rlim,
-	       sizeof(current->signal->rlim));
-
-	atomic_inc(&init_cred.usage);
-	commit_creds(&init_cred);
-	write_unlock_irq(&tasklist_lock);
-}
-
-void __set_special_pids(struct pid *pid)
-{
-	struct task_struct *curr = current->group_leader;
-
-	if (task_session(curr) != pid)
-		change_pid(curr, PIDTYPE_SID, pid);
-
-	if (task_pgrp(curr) != pid)
-		change_pid(curr, PIDTYPE_PGID, pid);
-}
-
-static void set_special_pids(struct pid *pid)
-{
-	write_lock_irq(&tasklist_lock);
-	__set_special_pids(pid);
-	write_unlock_irq(&tasklist_lock);
-}
-
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
-	if (!valid_signal(sig) || sig < 1)
-		return -EINVAL;
-
-	spin_lock_irq(&current->sighand->siglock);
-	/* This is only needed for daemonize()'ed kthreads */
-	sigdelset(&current->blocked, sig);
-	/*
-	 * Kernel threads handle their own signals. Let the signal code
-	 * know it'll be handled, so that they don't get converted to
-	 * SIGKILL or just silently dropped.
-	 */
-	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
-	if (!valid_signal(sig) || sig < 1)
-		return -EINVAL;
-
-	spin_lock_irq(&current->sighand->siglock);
-	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
-/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
- */
-
-void daemonize(const char *name, ...)
-{
-	va_list args;
-	sigset_t blocked;
-
-	va_start(args, name);
-	vsnprintf(current->comm, sizeof(current->comm), name, args);
-	va_end(args);
-
-	/*
-	 * If we were started as result of loading a module, close all of the
-	 * user space pages. We don't need them, and if we didn't close them
-	 * they would be locked into memory.
-	 */
-	exit_mm(current);
-	/*
-	 * We don't want to get frozen, in case system-wide hibernation
-	 * or suspend transition begins right now.
-	 */
-	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
-	if (current->nsproxy != &init_nsproxy) {
-		get_nsproxy(&init_nsproxy);
-		switch_task_namespaces(current, &init_nsproxy);
-	}
-	set_special_pids(&init_struct_pid);
-	proc_clear_tty(current);
-
-	/* Block and flush all signals */
-	sigfillset(&blocked);
-	sigprocmask(SIG_BLOCK, &blocked, NULL);
-	flush_signals(current);
-
-	/* Become as one with the init task */
-
-	daemonize_fs_struct();
-	exit_files(current);
-	current->files = init_task.files;
-	atomic_inc(&current->files->count);
-
-	reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
-static void close_files(struct files_struct * files)
-{
-	int i, j;
-	struct fdtable *fdt;
-
-	j = 0;
-
-	/*
-	 * It is safe to dereference the fd table without RCU or
-	 * ->file_lock because this is the last reference to the
-	 * files structure. But use RCU to shut RCU-lockdep up.
-	 */
-	rcu_read_lock();
-	fdt = files_fdtable(files);
-	rcu_read_unlock();
-	for (;;) {
-		unsigned long set;
-		i = j * __NFDBITS;
-		if (i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds[j++];
-		while (set) {
-			if (set & 1) {
-				struct file * file = xchg(&fdt->fd[i], NULL);
-				if (file) {
-					filp_close(file, files);
-					cond_resched();
-				}
-			}
-			i++;
-			set >>= 1;
-		}
-	}
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
-	struct files_struct *files;
-
-	task_lock(task);
-	files = task->files;
-	if (files)
-		atomic_inc(&files->count);
-	task_unlock(task);
-
-	return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
-	struct fdtable *fdt;
-
-	if (atomic_dec_and_test(&files->count)) {
-		close_files(files);
-		/*
-		 * Free the fd and fdset arrays if we expanded them.
-		 * If the fdtable was embedded, pass files for freeing
-		 * at the end of the RCU grace period. Otherwise,
-		 * you can free files immediately.
-		 */
-		rcu_read_lock();
-		fdt = files_fdtable(files);
-		if (fdt != &files->fdtab)
-			kmem_cache_free(files_cachep, files);
-		free_fdtable(fdt);
-		rcu_read_unlock();
-	}
-}
-
-void reset_files_struct(struct files_struct *files)
-{
-	struct task_struct *tsk = current;
-	struct files_struct *old;
-
-	old = tsk->files;
-	task_lock(tsk);
-	tsk->files = files;
-	task_unlock(tsk);
-	put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
-	struct files_struct * files = tsk->files;
-
-	if (files) {
-		task_lock(tsk);
-		tsk->files = NULL;
-		task_unlock(tsk);
-		put_files_struct(files);
-	}
-}
-
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
 /*
  * A task is exiting. If it owned this mm, find a new owner for the mm.
  */
@@ -602,14 +356,18 @@ retry:
 	}
 
 	/*
-	 * Search through everything else. We should not get
-	 * here often
+	 * Search through everything else, we should not get here often.
 	 */
-	do_each_thread(g, c) {
-		if (c->mm == mm)
-			goto assign_new_owner;
-	} while_each_thread(g, c);
-
+	for_each_process(g) {
+		if (g->flags & PF_KTHREAD)
+			continue;
+		for_each_thread(g, c) {
+			if (c->mm == mm)
+				goto assign_new_owner;
+			if (c->mm)
+				break;
+		}
+	}
 	read_unlock(&tasklist_lock);
 	/*
 	 * We found no owner yet mm_users > 1: this implies that we are
@@ -641,7 +399,7 @@ assign_new_owner:
 	task_unlock(c);
 	put_task_struct(c);
 }
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
 
 /*
  * Turn us into a lazy TLB process if we
@@ -682,7 +440,7 @@ static void exit_mm(struct task_struct * tsk)
 			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 			if (!self.task) /* see coredump_finish() */
 				break;
-			schedule();
+			freezable_schedule();
 		}
 		__set_task_state(tsk, TASK_RUNNING);
 		down_read(&mm->mmap_sem);
@@ -777,7 +535,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 	if (same_thread_group(p->real_parent, father))
 		return;
 
-	/* We don't want people slaying init.  */
+	/* We don't want people slaying init. */
 	p->exit_signal = SIGCHLD;
 
 	/* If it has exited notify the new parent about this child's death. */
@@ -846,7 +604,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 *	jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 	 */
 	forget_original_parent(tsk);
-	exit_task_namespaces(tsk);
 
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
@@ -953,14 +710,11 @@ void do_exit(long code)
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
-	 * an exiting task cleaning up the robust pi futexes, and in
-	 * task_work_add() to avoid the race with exit_task_work().
+	 * an exiting task cleaning up the robust pi futexes.
 	 */
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
 
-	exit_task_work(tsk);
-
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
 				current->comm, task_pid_nr(current),
@@ -995,7 +749,10 @@ void do_exit(long code)
 	exit_shm(tsk);
 	exit_files(tsk);
 	exit_fs(tsk);
-	check_stack_usage();
+	if (group_dead)
+		disassociate_ctty(1);
+	exit_task_namespaces(tsk);
+	exit_task_work(tsk);
 	exit_thread();
 
 	/*
@@ -1006,21 +763,17 @@ void do_exit(long code)
 	 */
 	perf_event_exit_task(tsk);
 
-	cgroup_exit(tsk, 1);
-
-	if (group_dead)
-		disassociate_ctty(1);
+	cgroup_exit(tsk);
 
 	module_put(task_thread_info(tsk)->exec_domain->module);
 
-	proc_exit_connector(tsk);
-
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
-	ptrace_put_breakpoints(tsk);
+	flush_ptrace_hw_breakpoint(tsk);
 
 	exit_notify(tsk, group_dead);
+	proc_exit_connector(tsk);
 #ifdef CONFIG_NUMA
 	task_lock(tsk);
 	mpol_put(tsk->mempolicy);
@@ -1034,7 +787,7 @@ void do_exit(long code)
 	/*
 	 * Make sure we are holding no locks:
 	 */
-	debug_check_no_locks_held(tsk);
+	debug_check_no_locks_held();
 	/*
 	 * We can do this unlocked here. The futex code uses this flag
	 * just to verify whether the pi state cleanup has been done
@@ -1046,10 +799,14 @@ void do_exit(long code)
 		exit_io_context(tsk);
 
 	if (tsk->splice_pipe)
-		__free_pipe_info(tsk->splice_pipe);
+		free_pipe_info(tsk->splice_pipe);
+
+	if (tsk->task_frag.page)
+		put_page(tsk->task_frag.page);
 
 	validate_creds_for_do_exit(tsk);
 
+	check_stack_usage();
 	preempt_disable();
 	if (tsk->nr_dirtied)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
@@ -1244,17 +1001,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		return wait_noreap_copyout(wo, p, pid, uid, why, status);
 	}
 
+	traced = ptrace_reparented(p);
 	/*
-	 * Try to move the task's state to DEAD
-	 * only one thread is allowed to do this:
+	 * Move the task's state to DEAD/TRACE, only one thread can do this.
 	 */
-	state = xchg(&p->exit_state, EXIT_DEAD);
-	if (state != EXIT_ZOMBIE) {
-		BUG_ON(state != EXIT_DEAD);
+	state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
 		return 0;
-	}
-
-	traced = ptrace_reparented(p);
 	/*
 	 * It can be ptraced but not reparented, check
 	 * thread_group_leader() to filter out sub-threads.
@@ -1280,17 +1033,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
 		 *
-		 * We use thread_group_times() to get times for the thread
+		 * We use thread_group_cputime_adjusted() to get times for the thread
 		 * group, which consolidates times for all threads in the
 		 * group including the group leader.
 		 */
-		thread_group_times(p, &tgutime, &tgstime);
+		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime += tgutime + sig->cutime;
 		psig->cstime += tgstime + sig->cstime;
-		psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
 		psig->cmin_flt +=
 			p->min_flt + sig->min_flt + sig->cmin_flt;
 		psig->cmaj_flt +=
@@ -1315,7 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 
 	/*
 	 * Now we are sure this task is interesting, and no other
-	 * thread can reap it because we set its state to EXIT_DEAD.
+	 * thread can reap it because we its state == DEAD/TRACE.
 	 */
 	read_unlock(&tasklist_lock);
 
@@ -1352,22 +1105,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (!retval)
 		retval = pid;
 
-	if (traced) {
+	if (state == EXIT_TRACE) {
 		write_lock_irq(&tasklist_lock);
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
-		/*
-		 * If this is not a sub-thread, notify the parent.
-		 * If parent wants a zombie, don't release it now.
-		 */
-		if (thread_group_leader(p) &&
-		    !do_notify_parent(p, p->exit_signal)) {
-			p->exit_state = EXIT_ZOMBIE;
-			p = NULL;
-		}
+
+		/* If parent wants a zombie, don't release it now */
+		state = EXIT_ZOMBIE;
+		if (do_notify_parent(p, p->exit_signal))
+			state = EXIT_DEAD;
+		p->exit_state = state;
 		write_unlock_irq(&tasklist_lock);
 	}
-	if (p != NULL)
+	if (state == EXIT_DEAD)
 		release_task(p);
 
 	return retval;
@@ -1544,7 +1294,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 static int wait_consider_task(struct wait_opts *wo, int ptrace,
 				struct task_struct *p)
 {
-	int ret = eligible_child(wo, p);
+	int ret;
+
+	if (unlikely(p->exit_state == EXIT_DEAD))
+		return 0;
+
+	ret = eligible_child(wo, p);
 	if (!ret)
 		return ret;
 
@@ -1562,33 +1317,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	/* dead body doesn't have much to contribute */
-	if (unlikely(p->exit_state == EXIT_DEAD)) {
+	if (unlikely(p->exit_state == EXIT_TRACE)) {
 		/*
-		 * But do not ignore this task until the tracer does
-		 * wait_task_zombie()->do_notify_parent().
+		 * ptrace == 0 means we are the natural parent. In this case
+		 * we should clear notask_error, debugger will notify us.
 		 */
-		if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+		if (likely(!ptrace))
 			wo->notask_error = 0;
 		return 0;
 	}
 
-	/* slay zombie? */
-	if (p->exit_state == EXIT_ZOMBIE) {
+	if (likely(!ptrace) && unlikely(p->ptrace)) {
 		/*
-		 * A zombie ptracee is only visible to its ptracer.
-		 * Notification and reaping will be cascaded to the real
-		 * parent when the ptracer detaches.
+		 * If it is traced by its real parent's group, just pretend
+		 * the caller is ptrace_do_wait() and reap this child if it
+		 * is zombie.
+		 *
+		 * This also hides group stop state from real parent; otherwise
+		 * a single stop can be reported twice as group and ptrace stop.
+		 * If a ptracer wants to distinguish these two events for its
+		 * own children it should create a separate process which takes
+		 * the role of real parent.
		 */
-		if (likely(!ptrace) && unlikely(p->ptrace)) {
-			/* it will become visible, clear notask_error */
-			wo->notask_error = 0;
-			return 0;
-		}
+		if (!ptrace_reparented(p))
+			ptrace = 1;
+	}
 
+	/* slay zombie? */
+	if (p->exit_state == EXIT_ZOMBIE) {
 		/* we don't reap group leaders with subthreads */
-		if (!delay_group_leader(p))
-			return wait_task_zombie(wo, p);
+		if (!delay_group_leader(p)) {
+			/*
+			 * A zombie ptracee is only visible to its ptracer.
+			 * Notification and reaping will be cascaded to the
+			 * real parent when the ptracer detaches.
+			 */
+			if (unlikely(ptrace) || likely(!p->ptrace))
+				return wait_task_zombie(wo, p);
+		}
 
 		/*
 		 * Allow access to stopped/continued state via zombie by
@@ -1614,19 +1380,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		wo->notask_error = 0;
 	} else {
 		/*
-		 * If @p is ptraced by a task in its real parent's group,
-		 * hide group stop/continued state when looking at @p as
-		 * the real parent; otherwise, a single stop can be
-		 * reported twice as group and ptrace stops.
-		 *
-		 * If a ptracer wants to distinguish the two events for its
-		 * own children, it should create a separate process which
-		 * takes the role of real parent.
-		 */
-		if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
-			return 0;
-
-		/*
 		 * @p is alive and it's gonna stop, continue or exit, so
 		 * there always is something to wait for.
 		 */
@@ -1825,9 +1578,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	}
 
 	put_pid(pid);
-
-	/* avoid REGPARM breakage on x86: */
-	asmlinkage_protect(5, ret, which, upid, infop, options, ru);
 	return ret;
 }
 
@@ -1865,8 +1615,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 
 	ret = do_wait(&wo);
 	put_pid(pid);
 
-	/* avoid REGPARM breakage on x86: */
-	asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
 	return ret;
 }
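A note on the wait_task_zombie() hunk above: replacing the unconditional xchg() of ->exit_state with a cmpxchg() means a waiter only claims the task if it still observes EXIT_ZOMBIE, so exactly one thread wins the race to reap (or hand off to the tracer via EXIT_TRACE) and all others simply return. The sketch below is a userspace analogue of that claim pattern, not kernel code: it uses C11 atomics instead of the kernel's cmpxchg(), and the state values, the reaper() helper, and the thread count are all made up for illustration.

```c
/*
 * Userspace analogue of the cmpxchg-based claim in wait_task_zombie():
 * several "reaper" threads race to move a task's state from ZOMBIE to
 * DEAD, but only the one whose compare-and-exchange still sees ZOMBIE
 * wins and may perform the cleanup.
 *
 * Build: cc -pthread claim.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { EXIT_ZOMBIE = 1, EXIT_DEAD = 2 };	/* illustrative values only */

static _Atomic int exit_state = EXIT_ZOMBIE;

static void *reaper(void *arg)
{
	int expected = EXIT_ZOMBIE;

	/* Analogue of: cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) */
	if (atomic_compare_exchange_strong(&exit_state, &expected, EXIT_DEAD))
		printf("thread %ld claimed the task\n", (long)arg);
	else
		printf("thread %ld lost the race (state was %d)\n",
		       (long)arg, expected);
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (long i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, reaper, (void *)i);
	for (long i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}
```

The design point of the kernel change is the same as in this toy: with xchg() every caller blindly overwrote the state and then had to back out if it lost, whereas the compare-and-exchange makes "still a zombie" the condition for winning, so losers take no action at all.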
