From db05021d49a994ee40a9735d9c3cb0060c9babb8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Feb 2013 21:48:09 -0500 Subject: ftrace: Update the kconfig for DYNAMIC_FTRACE The prompt to enable DYNAMIC_FTRACE (the ability to nop and enable function tracing at run time) had a confusing statement: "enable/disable ftrace tracepoints dynamically" This was written before tracepoints were added to the kernel, but now that tracepoints have been added, this is very confusing and has confused people enough to give wrong information during presentations. Not only that, I looked at the help text, and it still references that dreaded daemon that use to wake up once a second to update the nop locations and brick NICs, that hasn't been around for over five years. Time to bring the text up to the current decade. Cc: stable@vger.kernel.org Reported-by: Ezequiel Garcia Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 36567564e22..b516a8e19d5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -429,24 +429,28 @@ config PROBE_EVENTS def_bool n config DYNAMIC_FTRACE - bool "enable/disable ftrace tracepoints dynamically" + bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replace them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. - config DYNAMIC_FTRACE_WITH_REGS def_bool y depends on DYNAMIC_FTRACE -- cgit v1.2.3-70-g09d2 From d8741e2e88ac9a458765a0c7b4e6542d7c038334 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Mar 2013 10:25:16 -0500 Subject: tracing: Add help of snapshot feature when snapshot is empty When cat'ing the snapshot file, instead of showing an empty trace header like the trace file does, show how to use the snapshot feature. Also, this is a good place to show if the snapshot has been allocated or not. Users may want to "pre allocate" the snapshot to have a fast "swap" of the current buffer. Otherwise, a swap would be slow and might fail as it would need to allocate the snapshot buffer, and that might fail under tight memory constraints. Here's what it looked like before: # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | Here's what it looks like now: # tracer: nop # # # * Snapshot is freed * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') Acked-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c2e2c231037..9e3120b8a2a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2400,6 +2400,27 @@ static void test_ftrace_alive(struct seq_file *m) seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); } +#ifdef CONFIG_TRACER_MAX_TRACE +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->trace->allocated_snapshot) + seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_printf(m, "# Snapshot commands:\n"); + seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer.\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2411,7 +2432,9 @@ static int s_show(struct seq_file *m, void *v) seq_puts(m, "#\n"); test_ftrace_alive(m); } - if (iter->trace && iter->trace->print_header) + if (iter->snapshot && trace_empty(iter)) + print_snapshot_help(m, iter); + else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); -- cgit v1.2.3-70-g09d2 From c9960e48543799f168c4c9486f9790fb686ce5a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Mar 2013 10:53:02 -0500 Subject: tracing: Do not return EINVAL in snapshot when not allocated To use the tracing snapshot feature, writing a '1' into the snapshot file causes the snapshot buffer to be allocated if it has not already been allocated and dose a 'swap' with the main buffer, so that the snapshot now contains what was in the main buffer, and the main buffer now writes to what was the snapshot buffer. To free the snapshot buffer, a '0' is written into the snapshot file. To clear the snapshot buffer, any number but a '0' or '1' is written into the snapshot file. But if the file is not allocated it returns -EINVAL error code. This is rather pointless. It is better just to do nothing and return success. Acked-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e3120b8a2a..1f835a83cb2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4167,8 +4167,6 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, default: if (current_trace->allocated_snapshot) tracing_reset_online_cpus(&max_tr); - else - ret = -EINVAL; break; } -- cgit v1.2.3-70-g09d2 From a7dc19b8652c862d5b7c4d2339bd3c428bd29c4a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 7 Mar 2013 15:09:24 +0000 Subject: clockevents: Don't allow dummy broadcast timers Currently tick_check_broadcast_device doesn't reject clock_event_devices with CLOCK_EVT_FEAT_DUMMY, and may select them in preference to real hardware if they have a higher rating value. In this situation, the dummy timer is responsible for broadcasting to itself, and the core clockevents code may attempt to call non-existent callbacks for programming the dummy, eventually leading to a panic. This patch makes tick_check_broadcast_device always reject dummy timers, preventing this problem. Signed-off-by: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: Jon Medhurst (Tixy) Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8..7f32fe0e52c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if ((tick_broadcast_device.evtdev && + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || + (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; -- cgit v1.2.3-70-g09d2 From eb2834285cf172856cd12f66892fc7467935ebed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2013 15:18:28 -0800 Subject: workqueue: fix possible pool stall bug in wq_unbind_fn() Since multiple pools per cpu have been introduced, wq_unbind_fn() has a subtle bug which may theoretically stall work item processing. The problem is two-fold. * wq_unbind_fn() depends on the worker executing wq_unbind_fn() itself to start unbound chain execution, which works fine when there was only single pool. With multiple pools, only the pool which is running wq_unbind_fn() - the highpri one - is guaranteed to have such kick-off. The other pool could stall when its busy workers block. * The current code is setting WORKER_UNBIND / POOL_DISASSOCIATED of the two pools in succession without initiating work execution inbetween. Because setting the flags requires grabbing assoc_mutex which is held while new workers are created, this could lead to stalls if a pool's manager is waiting for the previous pool's work items to release memory. This is almost purely theoretical tho. Update wq_unbind_fn() such that it sets WORKER_UNBIND / POOL_DISASSOCIATED, goes over schedule() and explicitly kicks off execution for a pool and then moves on to the next one. tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811e..604801b91cb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3446,28 +3446,34 @@ static void wq_unbind_fn(struct work_struct *work) spin_unlock_irq(&pool->lock); mutex_unlock(&pool->assoc_mutex); - } - /* - * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the %WORKER_UNBOUND flag. This is necessary - * as scheduler callbacks may be invoked from other cpus. - */ - schedule(); + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the %WORKER_UNBOUND flag. + * This is necessary as scheduler callbacks may be invoked + * from other cpus. + */ + schedule(); - /* - * Sched callbacks are disabled now. Zap nr_running. After this, - * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. Pools on - * @cpu now behave as unbound (in terms of concurrency management) - * pools which are served by workers tied to the CPU. - * - * On return from this function, the current worker would trigger - * unbound chain execution of pending work items if other workers - * didn't already. - */ - for_each_std_worker_pool(pool, cpu) + /* + * Sched callbacks are disabled now. Zap nr_running. + * After this, nr_running stays zero and need_more_worker() + * and keep_working() are always true as long as the + * worklist is not empty. This pool now behaves as an + * unbound (in terms of concurrency management) pool which + * are served by workers tied to the pool. + */ atomic_set(&pool->nr_running, 0); + + /* + * With concurrency management just turned off, a busy + * worker blocking could lead to lengthy stalls. Kick off + * unbound chain execution of currently pending work items. + */ + spin_lock_irq(&pool->lock); + wake_up_worker(pool); + spin_unlock_irq(&pool->lock); + } } /* -- cgit v1.2.3-70-g09d2 From 2721e72dd10f71a3ba90f59781becf02638aa0d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 12 Mar 2013 11:32:32 -0400 Subject: tracing: Fix race in snapshot swapping Although the swap is wrapped with a spin_lock, the assignment of the temp buffer used to swap is not within that lock. It needs to be moved into that lock, otherwise two swaps happening on two different CPUs, can end up using the wrong temp buffer to assign in the swap. Luckily, all current callers of the swap function appear to have their own locks. But in case something is added that allows two different callers to call the swap, then there's a chance that this race can trigger and corrupt the buffers. New code is coming soon that will allow for this race to trigger. I've Cc'd stable, so this bug will not show up if someone backports one of the changes that can trigger this bug. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1f835a83cb2..53df2839bb9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -704,7 +704,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; if (trace_stop_count) return; @@ -719,6 +719,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); + buf = tr->buffer; tr->buffer = max_tr.buffer; max_tr.buffer = buf; -- cgit v1.2.3-70-g09d2 From 20f22ab42e9c832bde6e9a7ed04cdc73ec737e5b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Mar 2013 14:32:59 -0800 Subject: signals: fix new kernel-doc warnings Fix new kernel-doc warnings in kernel/signal.c: Warning(kernel/signal.c:2689): No description found for parameter 'uset' Warning(kernel/signal.c:2689): Excess function parameter 'set' description in 'sys_rt_sigpending' Signed-off-by: Randy Dunlap Cc: Alexander Viro Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2ec870a4c3c..d63c79e7e41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2682,7 +2682,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize) /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked - * @set: stores pending signals + * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) -- cgit v1.2.3-70-g09d2 From 6c23cbbd5056b155401b0a2b5567d530e6c750c4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Mar 2013 10:00:24 -0800 Subject: futex: fix kernel-doc notation and spello Fix kernel-doc warning in futex.c and convert 'Returns' to the new Return: kernel-doc notation format. Warning(kernel/futex.c:2286): Excess function parameter 'clockrt' description in 'futex_wait_requeue_pi' Fix one spello. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/futex.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f0090a993da..b26dcfc02c9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key) * @rw: mapping needs to be read/write (values: VERIFY_READ, * VERIFY_WRITE) * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + * * The key words are stored in *key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), @@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * be "current" except in the case of requeue pi. * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * - * Returns: - * 0 - ready to wait - * 1 - acquired the lock + * Return: + * 0 - ready to wait; + * 1 - acquired the lock; * <0 - error * * The hb->lock and futex_key refs shall be held by the caller. @@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * - * Returns: - * 0 - failed to acquire the lock atomicly - * 1 - acquired the lock + * Return: + * 0 - failed to acquire the lock atomically; + * 1 - acquired the lock; * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken; * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must * be paired with exactly one earlier call to queue_me(). * - * Returns: - * 1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + * 1 - if the futex_q was still queued (and we removed unqueued it); * 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) @@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart); * the pi_state owner as well as handle race conditions that may allow us to * acquire the lock. Must be called with the hb lock held. * - * Returns: - * 1 - success, lock taken - * 0 - success, lock not taken + * Return: + * 1 - success, lock taken; + * 0 - success, lock not taken; * <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * Return with the hb lock held and a q.key reference on success, and unlocked * with no q.key reference on failure. * - * Returns: - * 0 - uaddr contains val and hb has been locked + * Return: + * 0 - uaddr contains val and hb has been locked; * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2203,9 +2204,9 @@ pi_faulted: * the wakeup and return the appropriate error code to the caller. Must be * called with the hb lock held. * - * Returns - * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + * 0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() * 2) wakeup on uaddr2 after a requeue * 3) signal @@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * - * Returns: - * 0 - On success + * Return: + * 0 - On success; * <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, -- cgit v1.2.3-70-g09d2 From 740466bc89ad8bd5afcc8de220f715f62b21e365 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Mar 2013 11:15:19 -0400 Subject: tracing: Fix free of probe entry by calling call_rcu_sched() Because function tracing is very invasive, and can even trace calls to rcu_read_lock(), RCU access in function tracing is done with preempt_disable_notrace(). This requires a synchronize_sched() for updates and not a synchronize_rcu(). Function probes (traceon, traceoff, etc) must be freed after a synchronize_sched() after its entry has been removed from the hash. But call_rcu() is used. Fix this by using call_rcu_sched(). Also fix the usage to use hlist_del_rcu() instead of hlist_del(). Cc: stable@vger.kernel.org Cc: Paul McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ca94a4181..e6effd0c40a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3108,8 +3108,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); + hlist_del_rcu(&entry->node); + call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); } } __disable_ftrace_function_probe(); -- cgit v1.2.3-70-g09d2 From e66eded8309ebf679d3d3c1f5820d1f2ca332c71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 Mar 2013 11:51:49 -0700 Subject: userns: Don't allow CLONE_NEWUSER | CLONE_FS Don't allowing sharing the root directory with processes in a different user namespace. There doesn't seem to be any point, and to allow it would require the overhead of putting a user namespace reference in fs_struct (for permission checks) and incrementing that reference count on practically every call to fork. So just perform the inexpensive test of forbidding sharing fs_struct acrosss processes in different user namespaces. We already disallow other forms of threading when unsharing a user namespace so this should be no real burden in practice. This updates setns, clone, and unshare to disallow multiple user namespaces sharing an fs_struct. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++++- kernel/user_namespace.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c905..1766d324d5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * If unsharing a user namespace must also unshare the thread. */ if (unshare_flags & CLONE_NEWUSER) - unshare_flags |= CLONE_THREAD; + unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing a pid namespace must also unshare the thread. */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8b650837083..b14f4d34204 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct kmem_cache *user_ns_cachep __read_mostly; @@ -837,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) if (atomic_read(¤t->mm->mm_users) > 1) return -EINVAL; + if (current->fs->users != 1) + return -EINVAL; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3-70-g09d2 From 2ca39528c01a933f6689cd6505ce65bd6d68a530 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 Mar 2013 14:59:33 -0700 Subject: signal: always clear sa_restorer on execve When the new signal handlers are set up, the location of sa_restorer is not cleared, leaking a parent process's address space location to children. This allows for a potential bypass of the parent's ASLR by examining the sa_restorer value returned when calling sigaction(). Based on what should be considered "secret" about addresses, it only matters across the exec not the fork (since the VMAs haven't changed until the exec). But since exec sets SIG_DFL and keeps sa_restorer, this is where it should be fixed. Given the few uses of sa_restorer, a "set" function was not written since this would be the only use. Instead, we use __ARCH_HAS_SA_RESTORER, as already done in other places. Example of the leak before applying this patch: $ cat /proc/$$/maps ... 7fb9f3083000-7fb9f3238000 r-xp 00000000 fd:01 404469 .../libc-2.15.so ... $ ./leak ... 7f278bc74000-7f278be29000 r-xp 00000000 fd:01 404469 .../libc-2.15.so ... 1 0 (nil) 0x7fb9f30b94a0 2 4000000 (nil) 0x7f278bcaa4a0 3 4000000 (nil) 0x7f278bcaa4a0 4 0 (nil) 0x7fb9f30b94a0 ... [akpm@linux-foundation.org: use SA_RESTORER for backportability] Signed-off-by: Kees Cook Reported-by: Emese Revfy Cc: Emese Revfy Cc: PaX Team Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Julien Tinnes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d63c79e7e41..43b0d4a1b7b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; +#ifdef SA_RESTORER + ka->sa.sa_restorer = NULL; +#endif sigemptyset(&ka->sa.sa_mask); ka++; } -- cgit v1.2.3-70-g09d2 From 522cff142d7d2f9230839c9e1f21a4d8bcc22a4a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 13 Mar 2013 14:59:34 -0700 Subject: kernel/signal.c: use __ARCH_HAS_SA_RESTORER instead of SA_RESTORER __ARCH_HAS_SA_RESTORER is the preferred conditional for use in 3.9 and later kernels, per Kees. Cc: Emese Revfy Cc: Emese Revfy Cc: PaX Team Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 43b0d4a1b7b..dd72567767d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,7 +485,7 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; -#ifdef SA_RESTORER +#ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); -- cgit v1.2.3-70-g09d2 From e68035fb65dec05718d765fbea14d2e527214ff6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 14:59:38 -0700 Subject: workqueue: convert to idr_alloc() idr_get_new*() and friends are about to be deprecated. Convert to the new idr_alloc() interface. Signed-off-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811e..55fac5b991b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool) int ret; mutex_lock(&worker_pool_idr_mutex); - idr_pre_get(&worker_pool_idr, GFP_KERNEL); - ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + if (ret >= 0) + pool->id = ret; mutex_unlock(&worker_pool_idr_mutex); - return ret; + return ret < 0 ? ret : 0; } /* -- cgit v1.2.3-70-g09d2 From 69d34da2984c95b33ea21518227e1f9470f11d95 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 13:50:56 -0400 Subject: tracing: Protect tracer flags with trace_types_lock Seems that the tracer flags have never been protected from synchronous writes. Luckily, admins don't usually modify the tracing flags via two different tasks. But if scripts were to be used to modify them, then they could get corrupted. Move the trace_types_lock that protects against tracers changing to also protect the flags being set. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 53df2839bb9..00daf5f8c50 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2916,6 +2916,8 @@ static int trace_set_options(char *option) cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); @@ -2924,11 +2926,10 @@ static int trace_set_options(char *option) } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); + if (!trace_options[i]) ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - } + + mutex_unlock(&trace_types_lock); return ret; } @@ -4781,7 +4782,10 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + + mutex_lock(&trace_types_lock); set_tracer_flags(1 << index, val); + mutex_unlock(&trace_types_lock); *ppos += cnt; -- cgit v1.2.3-70-g09d2 From 80902822658aab18330569587cdb69ac1dfdcea8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 14:20:54 -0400 Subject: tracing: Keep overwrite in sync between regular and snapshot buffers Changing the overwrite mode for the ring buffer via the trace option only sets the normal buffer. But the snapshot buffer could swap with it, and then the snapshot would be in non overwrite mode and the normal buffer would be in overwrite mode, even though the option flag states otherwise. Keep the two buffers overwrite modes in sync. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00daf5f8c50..02debabe9ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2895,8 +2895,12 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_OVERWRITE) + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(global_trace.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(max_tr.buffer, enabled); +#endif + } if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); -- cgit v1.2.3-70-g09d2 From 613f04a0f51e6e68ac6fe571ab79da3c0a5eb4da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 15:03:53 -0400 Subject: tracing: Prevent buffer overwrite disabled for latency tracers The latency tracers require the buffers to be in overwrite mode, otherwise they get screwed up. Force the buffers to stay in overwrite mode when latency tracers are enabled. Added a flag_changed() method to the tracer structure to allow the tracers to see what flags are being changed, and also be able to prevent the change from happing. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/trace.h | 6 ++++++ kernel/trace/trace_irqsoff.c | 19 ++++++++++++++----- kernel/trace/trace_sched_wakeup.c | 18 +++++++++++++----- 4 files changed, 65 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02debabe9ed..4f1dade5698 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2881,11 +2881,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return -EINVAL; } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (current_trace->flag_changed) + if (current_trace->flag_changed(current_trace, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@ -2904,13 +2918,15 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); + + return 0; } static int trace_set_options(char *option) { char *cmp; int neg = 0; - int ret = 0; + int ret = -ENODEV; int i; cmp = strstrip(option); @@ -2924,7 +2940,7 @@ static int trace_set_options(char *option) for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(1 << i, !neg); break; } } @@ -2943,6 +2959,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2952,7 +2969,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - trace_set_options(buf); + ret = trace_set_options(buf); + if (ret < 0) + return ret; *ppos += cnt; @@ -3256,6 +3275,9 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); + + current_trace->enabled = false; + if (current_trace->reset) current_trace->reset(tr); @@ -3300,6 +3322,7 @@ static int tracing_set_tracer(const char *buf) } current_trace = t; + current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -4788,9 +4811,12 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; mutex_lock(&trace_types_lock); - set_tracer_flags(1 << index, val); + ret = set_tracer_flag(1 << index, val); mutex_unlock(&trace_types_lock); + if (ret < 0) + return ret; + *ppos += cnt; return cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d5..2081971367e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -283,11 +283,15 @@ struct tracer { enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct tracer *tracer, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; bool print_max; bool use_max_tr; bool allocated_snapshot; + bool enabled; }; @@ -943,6 +947,8 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac488..443b25b43b4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,7 @@ enum { static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) static void __irqsoff_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; @@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_irqsoff_tracer(tr, is_graph()); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +615,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -642,6 +649,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -677,6 +685,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a..fde652c9a51 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr); static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; #define TRACE_DISPLAY_GRAPH 1 @@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +600,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -615,6 +622,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif -- cgit v1.2.3-70-g09d2 From 778141e3cf0bf29f91cd3cb5c314ea477b9402a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2013 11:41:46 +0900 Subject: perf: Reset hwc->last_period on sw clock events When cpu/task clock events are initialized, their sampling frequencies are converted to have a fixed value. However it missed to update the hwc->last_period which was set to 1 for initial sampling frequency calibration. Because this hwc->last_period value is used as a period in perf_swevent_ hrtime(), every recorded sample will have an incorrected period of 1. $ perf record -e task-clock noploop 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.158 MB perf.data (~6919 samples) ] $ perf report -n --show-total-period --stdio # Samples: 4K of event 'task-clock' # Event count (approx.): 4000 # # Overhead Samples Period Command Shared Object Symbol # ........ ............ ............ ....... ............. .................. # 99.95% 3998 3998 noploop noploop [.] main 0.03% 1 1 noploop libc-2.15.so [.] init_cacheinfo 0.03% 1 1 noploop ld-2.15.so [.] open_verify Note that it doesn't affect the non-sampling event so that the perf stat still gets correct value with or without this patch. $ perf stat -e task-clock noploop 1 Performance counter stats for 'noploop 1': 1000.272525 task-clock # 1.000 CPUs utilized 1.000560605 seconds time elapsed Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363574507-18808-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c3..fa79c377d65 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5647,6 +5647,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } -- cgit v1.2.3-70-g09d2 From d610d98b5de6860feb21539726e9af7c9094151c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 15 Mar 2013 16:27:13 +0900 Subject: perf: Generate EXIT event only once per task context perf_event_task_event() iterates pmu list and generate events for each eligible pmu context. But if task_event has task_ctx like in EXIT it'll generate events even though the pmu doesn't have an eligible one. Fix it by moving the code to proper places. Before this patch: $ perf record -n true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.006 MB perf.data (~248 samples) ] $ perf report -D | tail Aggregated stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 cycles stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 After this patch: $ perf report -D | tail Aggregated stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 cycles stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363332433-7637-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fa79c377d65..59412d037ee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (ctx) - perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } + if (task_event->task_ctx) + perf_event_task_ctx(task_event->task_ctx, task_event); + rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2 From dc72c32e1fd872a9a4fdfe645283c9dcd68e556d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 22 Mar 2013 15:04:39 -0700 Subject: printk: Provide a wake_up_klogd() off-case wake_up_klogd() is useless when CONFIG_PRINTK=n because neither printk() nor printk_sched() are in use and there are actually no waiter on log_wait waitqueue. It should be a stub in this case for users like bust_spinlocks(). Otherwise this results in this warning when CONFIG_PRINTK=n and CONFIG_IRQ_WORK=n: kernel/built-in.o In function `wake_up_klogd': (.text.wake_up_klogd+0xb4): undefined reference to `irq_work_queue' To fix this, provide an off-case for wake_up_klogd() when CONFIG_PRINTK=n. There is much more from console_unlock() and other console related code in printk.c that should be moved under CONFIG_PRINTK. But for now, focus on a minimal fix as we passed the merged window already. [akpm@linux-foundation.org: include printk.h in bust_spinlocks.c] Signed-off-by: Frederic Weisbecker Reported-by: James Hogan Cc: James Hogan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 - include/linux/printk.h | 6 ++++ kernel/printk.c | 80 ++++++++++++++++++++++++-------------------------- lib/bust_spinlocks.c | 3 +- 4 files changed, 46 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80d36874689..79fdd80a42d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -390,7 +390,6 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); -extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; diff --git a/include/linux/printk.h b/include/linux/printk.h index 1249a54d17e..822171fcb1c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -134,6 +134,8 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; +extern void wake_up_klogd(void); + void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else @@ -162,6 +164,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } +static inline void wake_up_klogd(void) +{ +} + static inline void log_buf_kexec_setup(void) { } diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335..abbdd9e2ac8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54b95d..f8e0e536739 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -28,5 +29,3 @@ void __attribute__((weak)) bust_spinlocks(int yes) wake_up_klogd(); } } - - -- cgit v1.2.3-70-g09d2 From 2ca067efd82939dfd87827d29d36a265823a4c2f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 22 Mar 2013 15:04:41 -0700 Subject: poweroff: change orderly_poweroff() to use schedule_work() David said: Commit 6c0c0d4d1080 ("poweroff: fix bug in orderly_poweroff()") apparently fixes one bug in orderly_poweroff(), but introduces another. The comments on orderly_poweroff() claim it can be called from any context - and indeed we call it from interrupt context in arch/powerpc/platforms/pseries/ras.c for example. But since that commit this is no longer safe, since call_usermodehelper_fns() is not safe in interrupt context without the UMH_NO_WAIT option. orderly_poweroff() can be used from any context but UMH_WAIT_EXEC is sleepable. Move the "force" logic into __orderly_poweroff() and change orderly_poweroff() to use the global poweroff_work which simply calls __orderly_poweroff(). While at it, remove the unneeded "int argc" and change argv_split() to use GFP_KERNEL. We use the global "bool poweroff_force" to pass the argument, this can obviously affect the previous request if it is pending/running. So we only allow the "false => true" transition assuming that the pending "true" should succeed anyway. If schedule_work() fails after that we know that work->func() was not called yet, it must see the new value. This means that orderly_poweroff() becomes async even if we do not run the command and always succeeds, schedule_work() can only fail if the work is already pending. We can export __orderly_poweroff() and change the non-atomic callers which want the old semantics. Signed-off-by: Oleg Nesterov Reported-by: Benjamin Herrenschmidt Reported-by: David Gibson Cc: Lucas De Marchi Cc: Feng Hong Cc: Kees Cook Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba..39c9c4a2949 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2185,9 +2185,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force) { - int argc; char **argv; static char *envp[] = { "HOME=/", @@ -2196,20 +2195,40 @@ static int __orderly_poweroff(void) }; int ret; - argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - if (argv == NULL) { + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - return -ENOMEM; + __func__, poweroff_cmd); + ret = -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, NULL, NULL); - argv_free(argv); + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } return ret; } +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + /** * orderly_poweroff - Trigger an orderly system poweroff * @force: force poweroff if command execution fails @@ -2219,21 +2238,9 @@ static int __orderly_poweroff(void) */ int orderly_poweroff(bool force) { - int ret = __orderly_poweroff(); - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); -- cgit v1.2.3-70-g09d2 From 751c644b95bb48aaa8825f0c66abbcc184d92051 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Mar 2013 02:27:11 -0700 Subject: pid: Handle the exit of a multi-threaded init. When a multi-threaded init exits and the initial thread is not the last thread to exit the initial thread hangs around as a zombie until the last thread exits. In that case zap_pid_ns_processes needs to wait until there are only 2 hashed pids in the pid namespace not one. v2. Replace thread_pid_vnr(me) == 1 with the test thread_group_leader(me) as suggested by Oleg. Cc: stable@vger.kernel.org Cc: Oleg Nesterov Reported-by: Caj Larsson Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c602..bea15bdf82b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int nr; int rc; struct task_struct *task, *me = current; + int init_pids = thread_group_leader(me) ? 1 : 2; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + if (pid_ns->nr_hashed == init_pids) break; schedule(); } -- cgit v1.2.3-70-g09d2 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb4..a3035223d42 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24..2b93a9a5a1e 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d34204..0f1e4288457 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3-70-g09d2 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae0..d581e45c0a9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26ba..9c7fab1d23f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec73..afd83273e6c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce00932493..b6b215f13b4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03..8e635a18ab5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e4288457..a54f26f82eb 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3-70-g09d2 From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sun, 31 Mar 2013 00:04:40 +0000 Subject: Revert "lockdep: check that no locks held at freeze time" This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d. Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time") causes problems with NFS root filesystems. The failures were noticed on OMAP2 and 3 boards during kernel init: [ BUG: swapper/0/1 still has locks held! ] 3.9.0-rc3-00344-ga937536 #1 Not tainted ------------------------------------- 1 lock held by swapper/0/1: #0: (&type->s_umount_key#13/1){+.+.+.}, at: [] sget+0x248/0x574 stack backtrace: rpc_wait_bit_killable __wait_on_bit out_of_line_wait_on_bit __rpc_execute rpc_run_task rpc_call_sync nfs_proc_get_root nfs_get_root nfs_fs_mount_common nfs_try_mount nfs_fs_mount mount_fs vfs_kern_mount do_mount sys_mount do_mount_root mount_root prepare_namespace kernel_init_freeable kernel_init Although the rootfs mounts, the system is unstable. Here's a transcript from a PM test: http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt Here's what the test log should look like: http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt Mailing list discussion is here: http://lkml.org/lkml/2013/3/4/221 Deal with this for v3.9 by reverting the problem commit, until folks can figure out the right long-term course of action. Signed-off-by: Paul Walmsley Cc: Mandeep Singh Baines Cc: Jeff Layton Cc: Shawn Guo Cc: Cc: Fengguang Wu Cc: Trond Myklebust Cc: Ingo Molnar Cc: Ben Chan Cc: Oleg Nesterov Cc: Tejun Heo Cc: Rafael J. Wysocki Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 4 ++-- include/linux/freezer.h | 3 --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 +++++++++-------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index a975de1ff59..3bd46f76675 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(void); +extern void debug_check_no_locks_held(struct task_struct *task); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(void) +debug_check_no_locks_held(struct task_struct *task) { } #endif diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 043a5cf8b5b..e70df40d84f 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -3,7 +3,6 @@ #ifndef FREEZER_H_INCLUDED #define FREEZER_H_INCLUDED -#include #include #include #include @@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { - if (!(current->flags & PF_NOFREEZE)) - debug_check_no_locks_held(); might_sleep(); if (likely(!freezing(current))) return false; diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca993..60bc027c61c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(); + debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d..8a0efac4f99 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; @@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void) printk("\n"); printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); + printk("[ BUG: lock held at task exit time! ]\n"); print_kernel_ident(); printk("-------------------------------------\n"); - lockdep_print_held_locks(current); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, task_pid_nr(curr)); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task) { - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3-70-g09d2