diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 29 | ||||
-rw-r--r-- | kernel/exit.c | 91 | ||||
-rw-r--r-- | kernel/fork.c | 33 | ||||
-rw-r--r-- | kernel/ptrace.c | 197 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 53 | ||||
-rw-r--r-- | kernel/sched.c | 233 | ||||
-rw-r--r-- | kernel/sched_fair.c | 46 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 | ||||
-rw-r--r-- | kernel/signal.c | 444 | ||||
-rw-r--r-- | kernel/softirq.c | 12 | ||||
-rw-r--r-- | kernel/workqueue.c | 81 |
11 files changed, 847 insertions, 374 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 93950031706..52501b5d490 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -55,6 +55,9 @@ #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> +#ifdef CONFIG_SECURITY +#include <linux/security.h> +#endif #include <linux/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } +#ifdef CONFIG_SECURITY +/** + * audit_log_secctx - Converts and logs SELinux context + * @ab: audit_buffer + * @secid: security number + * + * This is a helper function that calls security_secid_to_secctx to convert + * secid to secctx and then adds the (converted) SELinux context to the audit + * log by calling audit_log_format, thus also preventing leak of internal secid + * to userspace. If secid cannot be converted audit_panic is called. + */ +void audit_log_secctx(struct audit_buffer *ab, u32 secid) +{ + u32 len; + char *secctx; + + if (security_secid_to_secctx(secid, &secctx, &len)) { + audit_panic("Cannot convert secid to context"); + } else { + audit_log_format(ab, " obj=%s", secctx); + security_release_secctx(secctx, len); + } +} +EXPORT_SYMBOL(audit_log_secctx); +#endif + EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); diff --git a/kernel/exit.c b/kernel/exit.c index f2b321bae44..73bb192a3d3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -169,7 +169,6 @@ void release_task(struct task_struct * p) struct task_struct *leader; int zap_leader; repeat: - tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); @@ -179,7 +178,7 @@ repeat: proc_flush_task(p); write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); + ptrace_release_task(p); __exit_signal(p); /* @@ -190,22 +189,12 @@ repeat: zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(task_detached(leader)); - do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. - * - * do_notify_parent() will have marked it self-reaping in - * that case. - */ - zap_leader = task_detached(leader); - - /* - * This maintains the invariant that release_task() - * only runs on a task in EXIT_DEAD, just for sanity. */ + zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } @@ -277,18 +266,16 @@ int is_current_pgrp_orphaned(void) return retval; } -static int has_stopped_jobs(struct pid *pgrp) +static bool has_stopped_jobs(struct pid *pgrp) { - int retval = 0; struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (!task_is_stopped(p)) - continue; - retval = 1; - break; + if (p->signal->flags & SIGNAL_STOP_STOPPED) + return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return retval; + + return false; } /* @@ -751,7 +738,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, { list_move_tail(&p->sibling, &p->real_parent->children); - if (task_detached(p)) + if (p->exit_state == EXIT_DEAD) return; /* * If this is a threaded reparent there is no need to @@ -764,10 +751,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ - if (!task_ptrace(p) && + if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { - do_notify_parent(p, p->exit_signal); - if (task_detached(p)) { + if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_move_tail(&p->sibling, dead); } @@ -794,7 +780,7 @@ static void forget_original_parent(struct task_struct *father) do { t->real_parent = reaper; if (t->parent == father) { - BUG_ON(task_ptrace(t)); + BUG_ON(t->ptrace); t->parent = t->real_parent; } if (t->pdeath_signal) @@ -819,8 +805,7 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int signal; - void *cookie; + bool autoreap; /* * This does two things: @@ -851,26 +836,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * we have changed execution domain as these two values started * the same after a fork. */ - if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; - signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal >= 0) - signal = do_notify_parent(tsk, signal); + if (unlikely(tsk->ptrace)) { + int sig = thread_group_leader(tsk) && + thread_group_empty(tsk) && + !ptrace_reparented(tsk) ? + tsk->exit_signal : SIGCHLD; + autoreap = do_notify_parent(tsk, sig); + } else if (thread_group_leader(tsk)) { + autoreap = thread_group_empty(tsk) && + do_notify_parent(tsk, tsk->exit_signal); + } else { + autoreap = true; + } - tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - tracehook_report_death(tsk, signal, cookie, group_dead); - /* If the process is dead, release it - nobody will wait for it */ - if (signal == DEATH_REAP) + if (autoreap) release_task(tsk); } @@ -923,7 +915,7 @@ NORET_TYPE void do_exit(long code) */ set_fs(USER_DS); - tracehook_report_exit(&code); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -1235,9 +1227,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check - * !task_detached() to filter out sub-threads. + * thread_group_leader() to filter out sub-threads. */ - if (likely(!traced) && likely(!task_detached(p))) { + if (likely(!traced) && thread_group_leader(p)) { struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; @@ -1345,16 +1337,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* - * If this is not a detached task, notify the parent. - * If it's still not detached after that, don't release - * it now. + * If this is not a sub-thread, notify the parent. + * If parent wants a zombie, don't release it now. */ - if (!task_detached(p)) { - do_notify_parent(p, p->exit_signal); - if (!task_detached(p)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + if (thread_group_leader(p) && + !do_notify_parent(p, p->exit_signal)) { + p->exit_state = EXIT_ZOMBIE; + p = NULL; } write_unlock_irq(&tasklist_lock); } @@ -1367,7 +1356,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p)) + if (task_is_stopped_or_traced(p) && + !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) @@ -1563,7 +1553,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Notification and reaping will be cascaded to the real * parent when the ptracer detaches. */ - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* it will become visible, clear notask_error */ wo->notask_error = 0; return 0; @@ -1606,8 +1596,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * own children, it should create a separate process which * takes the role of real parent. */ - if (likely(!ptrace) && task_ptrace(p) && - same_thread_group(p->parent, p->real_parent)) + if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) return 0; /* diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a..4d4117e0150 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,7 +37,6 @@ #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> -#include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> @@ -1340,7 +1339,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - tracehook_finish_clone(p, clone_flags, trace); + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) @@ -1481,10 +1480,22 @@ long do_fork(unsigned long clone_flags, } /* - * When called from kernel_thread, don't do user tracing stuff. + * Determine whether and which event to report to ptracer. When + * called from kernel_thread or CLONE_UNTRACED is explicitly + * requested, no event is reported; otherwise, report if the event + * for the type of forking is enabled. */ - if (likely(user_mode(regs))) - trace = tracehook_prepare_clone(clone_flags); + if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (clone_flags & CLONE_VFORK) + trace = PTRACE_EVENT_VFORK; + else if ((clone_flags & CSIGNAL) != SIGCHLD) + trace = PTRACE_EVENT_CLONE; + else + trace = PTRACE_EVENT_FORK; + + if (likely(!ptrace_event_enabled(current, trace))) + trace = 0; + } p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); @@ -1508,26 +1519,26 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that - * hasn't gotten to tracehook_report_clone() yet. Now we - * clear it and set the child going. + * hasn't finished SIGSTOP raising yet. Now we clear it + * and set the child going. */ p->flags &= ~PF_STARTING; wake_up_new_task(p); - tracehook_report_clone_complete(trace, regs, - clone_flags, nr, p); + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - tracehook_report_vfork_done(p, nr); + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2df115790cd..9de3ecfd20f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -23,8 +23,15 @@ #include <linux/uaccess.h> #include <linux/regset.h> #include <linux/hw_breakpoint.h> +#include <linux/cn_proc.h> +static int ptrace_trapping_sleep_fn(void *flags) +{ + schedule(); + return 0; +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child) spin_lock(&child->sighand->siglock); /* - * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * Clear all pending traps and TRAPPING. TRAPPING should be + * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. + */ + task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); + task_clear_jobctl_trapping(child); + + /* + * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and * @child isn't dead. */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || child->signal->group_stop_count)) - child->group_stop |= GROUP_STOP_PENDING; + child->jobctl |= JOBCTL_STOP_PENDING; /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick @@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child) * is in TASK_TRACED; otherwise, we might unduly disrupt * TASK_KILLABLE sleeps. */ - if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) signal_wake_up(child, task_is_traced(child)); spin_unlock(&child->sighand->siglock); } -/* - * Check that we have indeed attached to the thing.. +/** + * ptrace_check_attach - check whether ptracee is ready for ptrace operation + * @child: ptracee to check for + * @ignore_state: don't check whether @child is currently %TASK_TRACED + * + * Check whether @child is being ptraced by %current and ready for further + * ptrace operations. If @ignore_state is %false, @child also should be in + * %TASK_TRACED state and on return the child is guaranteed to be traced + * and not executing. If @ignore_state is %true, @child can be in any + * state. + * + * CONTEXT: + * Grabs and releases tasklist_lock and @child->sighand->siglock. + * + * RETURNS: + * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, int kill) +int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; @@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ spin_lock_irq(&child->sighand->siglock); WARN_ON_ONCE(task_is_stopped(child)); - if (task_is_traced(child) || kill) + if (ignore_state || (task_is_traced(child) && + !(child->jobctl & JOBCTL_LISTENING))) ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) + if (!ret && !ignore_state) ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ @@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -static int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task, long request, + unsigned long flags) { - bool wait_trap = false; + bool seize = (request == PTRACE_SEIZE); int retval; + /* + * SEIZE will enable new ptrace behaviors which will be implemented + * gradually. SEIZE_DEVEL is used to prevent applications + * expecting full SEIZE behaviors trapping on kernel commits which + * are still in the process of implementing them. + * + * Only test programs for new ptrace behaviors being implemented + * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. + * + * Once SEIZE behaviors are completely implemented, this flag and + * the following test will be removed. + */ + retval = -EIO; + if (seize && !(flags & PTRACE_SEIZE_DEVEL)) + goto out; + audit_ptrace(task); retval = -EPERM; @@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; + if (seize) + task->ptrace |= PT_SEIZED; if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); spin_lock(&task->sighand->siglock); /* - * If the task is already STOPPED, set GROUP_STOP_PENDING and + * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING * will be cleared if the child completes the transition or any * event which clears the group stop states happens. We'll wait @@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task) * The following task_is_stopped() test is safe as both transitions * in and out of STOPPED are protected by siglock. */ - if (task_is_stopped(task)) { - task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + if (task_is_stopped(task) && + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) signal_wake_up(task, 1); - wait_trap = true; - } spin_unlock(&task->sighand->siglock); @@ -257,9 +306,12 @@ unlock_tasklist: unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: - if (wait_trap) - wait_event(current->signal->wait_chldexit, - !(task->group_stop & GROUP_STOP_TRAPPING)); + if (!retval) { + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, + ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + proc_ptrace_connector(task, PTRACE_ATTACH); + } + return retval; } @@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh) */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { + bool dead; + __ptrace_unlink(p); - if (p->exit_state == EXIT_ZOMBIE) { - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { - __wake_up_parent(p, tracer); - p->exit_signal = -1; - } - } - if (task_detached(p)) { - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return true; + if (p->exit_state != EXIT_ZOMBIE) + return false; + + dead = !thread_group_leader(p); + + if (!dead && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + dead = do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); + dead = true; } } - - return false; + /* Mark it as in the process of being reaped. */ + if (dead) + p->exit_state = EXIT_DEAD; + return dead; } static int ptrace_detach(struct task_struct *child, unsigned int data) @@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) } write_unlock_irq(&tasklist_lock); + proc_ptrace_connector(child, PTRACE_DETACH); if (unlikely(dead)) release_task(child); @@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; - siginfo_t siginfo; + siginfo_t siginfo, *si; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; + unsigned long flags; switch (request) { case PTRACE_PEEKTEXT: @@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_INTERRUPT: + /* + * Stop tracee without any side-effect on signal or job + * control. At least one trap is guaranteed to happen + * after this request. If @child is already trapped, the + * current trap is not disturbed and another trap will + * happen after the current trap is ended with PTRACE_CONT. + * + * The actual trap might not be PTRACE_EVENT_STOP trap but + * the pending condition is cleared regardless. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + /* + * INTERRUPT doesn't disturb existing trap sans one + * exception. If ptracer issued LISTEN for the current + * STOP, this INTERRUPT should clear LISTEN and re-trap + * tracee into STOP. + */ + if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) + signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + + case PTRACE_LISTEN: + /* + * Listen for events. Tracee must be in STOP. It's not + * resumed per-se but is not considered to be in TRACED by + * wait(2) or ptrace(2). If an async event (e.g. group + * stop state change) happens, tracee will enter STOP trap + * again. Alternatively, ptracer can issue INTERRUPT to + * finish listening and re-trap tracee into STOP. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + si = child->last_siginfo; + if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) + break; + + child->jobctl |= JOBCTL_LISTENING; + + /* + * If NOTIFY is set, it means event happened between start + * of this trap and now. Trigger re-trap immediately. + */ + if (child->jobctl & JOBCTL_TRAP_NOTIFY) + signal_wake_up(child, true); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; @@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (ret < 0) goto out_put_task_struct; @@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (!ret) ret = compat_arch_ptrace(child, request, addr, data); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 75113cb7c4f..8aafbb80b8b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; +static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_data *rdp; struct rcu_node *rnp; - if (t->rcu_read_lock_nesting && + if (t->rcu_read_lock_nesting > 0 && (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ @@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu) rnp->gp_tasks = &t->rcu_node_entry; } raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else if (t->rcu_read_lock_nesting < 0 && + t->rcu_read_unlock_special) { + + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); } /* @@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } @@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; + /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ + if (t->rcu_boosted) { + special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 0; + } #endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; @@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ if (special & RCU_READ_UNLOCK_BOOSTED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; rt_mutex_unlock(t->rcu_boost_mutex); t->rcu_boost_mutex = NULL; } @@ -387,13 +400,22 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - --t->rcu_read_lock_nesting; - barrier(); /* decrement before load of ->rcu_read_unlock_special */ - if (t->rcu_read_lock_nesting == 0 && - unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } #endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu) rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) + if (t->rcu_read_lock_nesting > 0 && + per_cpu(rcu_preempt_data, cpu).qs_pending) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } @@ -695,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; + } if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); wake_up(&sync_rcu_preempt_exp_wq); break; } @@ -707,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1174,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ diff --git a/kernel/sched.c b/kernel/sched.c index 3dc716f6d8a..fde6ff90352 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2544,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +static void sched_ttwu_do_pending(struct task_struct *list) { struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; raw_spin_lock(&rq->lock); @@ -2563,9 +2559,45 @@ static void sched_ttwu_pending(void) raw_spin_unlock(&rq->lock); } +#ifdef CONFIG_HOTPLUG_CPU + +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + sched_ttwu_do_pending(list); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + void scheduler_ipi(void) { - sched_ttwu_pending(); + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. + */ + irq_enter(); + sched_ttwu_do_pending(list); + irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) @@ -6557,7 +6589,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->cpu_power) { + if (!group->sgp->power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6581,9 +6613,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_POWER_SCALE) { + if (group->sgp->power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + group->sgp->power); } group = group->next; @@ -6774,11 +6806,39 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. + */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); kfree(sd->groups); + } kfree(sd); } @@ -6945,6 +7005,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; }; struct s_data { @@ -6964,15 +7025,73 @@ struct sched_domain_topology_level; typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +#define SDTL_OVERLAP 0x01 + struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + int flags; struct sd_data data; }; -/* - * Assumes the sched_domain tree is fully constructed - */ +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; |