From 426d31071ac476ea62c62656b242930c17b58c00 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 7 Aug 2010 12:30:03 +0200 Subject: fix printk typo 'faild' Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299..58716e73e2a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -490,7 +490,7 @@ static int register_trace_probe(struct trace_probe *tp) } ret = register_probe_event(tp); if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); + pr_warning("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.3-18-g5258 From 401a8d048eadfbe1b1c1bf53d3b614fcc894c61a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Sep 2010 10:36:00 +0200 Subject: workqueue: cleanup flush/cancel functions Make the following cleanup changes. * Relocate flush/cancel function prototypes and definitions. * Relocate wait_on_cpu_work() and wait_on_work() before try_to_grab_pending(). These will be used to implement flush_work_sync(). * Make all flush/cancel functions return bool instead of int. * Update wait_on_cpu_work() and wait_on_work() to return %true if they actually waited. * Add / update comments. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 175 ++++++++++++++++++++++++++++------------------------- 1 file changed, 94 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f77afd93922..1240b9d94b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2327,16 +2327,24 @@ out_unlock: EXPORT_SYMBOL_GPL(flush_workqueue); /** - * flush_work - block until a work_struct's callback has terminated - * @work: the work which is to be flushed + * flush_work - wait for a work to finish executing the last queueing instance + * @work: the work to flush * - * Returns false if @work has already terminated. + * Wait until @work has finished execution. This function considers + * only the last queueing instance of @work. If @work has been + * enqueued across different CPUs on a non-reentrant workqueue or on + * multiple workqueues, @work might still be executing on return on + * some of the CPUs from earlier queueing. * - * It is expected that, prior to calling flush_work(), the caller has - * arranged for the work to not be requeued, otherwise it doesn't make - * sense to use this function. + * If @work was queued only on a non-reentrant, ordered or unbound + * workqueue, @work is guaranteed to be idle on return if it hasn't + * been requeued since flush started. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. 
*/ -int flush_work(struct work_struct *work) +bool flush_work(struct work_struct *work) { struct worker *worker = NULL; struct global_cwq *gcwq; @@ -2374,13 +2382,49 @@ int flush_work(struct work_struct *work) wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); - return 1; + return true; already_gone: spin_unlock_irq(&gcwq->lock); - return 0; + return false; } EXPORT_SYMBOL_GPL(flush_work); +static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) +{ + struct wq_barrier barr; + struct worker *worker; + + spin_lock_irq(&gcwq->lock); + + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); + + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else + return false; +} + +static bool wait_on_work(struct work_struct *work) +{ + bool ret = false; + int cpu; + + might_sleep(); + + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + + for_each_gcwq_cpu(cpu) + ret |= wait_on_cpu_work(get_gcwq(cpu), work); + return ret; +} + /* * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. @@ -2423,39 +2467,7 @@ static int try_to_grab_pending(struct work_struct *work) return ret; } -static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - } -} - -static void wait_on_work(struct work_struct *work) -{ - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - wait_on_cpu_work(get_gcwq(cpu), work); -} - -static int __cancel_work_timer(struct work_struct *work, +static bool __cancel_work_timer(struct work_struct *work, struct timer_list* timer) { int ret; @@ -2472,42 +2484,60 @@ static int __cancel_work_timer(struct work_struct *work, } /** - * cancel_work_sync - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * Returns true if @work was pending. + * cancel_work_sync - cancel a work and wait for it to finish + * @work: the work to cancel * - * cancel_work_sync() will cancel the work if it is queued. If the work's - * callback appears to be running, cancel_work_sync() will block until it - * has completed. + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself or migrates to + * another workqueue. On return from this function, @work is + * guaranteed to be not pending or executing on any CPU. * - * It is possible to use this function if the work re-queues itself. It can - * cancel the work even if it migrates to another workqueue, however in that - * case it only guarantees that work->func() has completed on the last queued - * workqueue. - * - * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not - * pending, otherwise it goes into a busy-wait loop until the timer expires. + * cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use cancel_delayed_work_sync() instead. 
* - * The caller must ensure that workqueue_struct on which this work was last + * The caller must ensure that the workqueue on which @work was last * queued can't be destroyed before this function returns. + * + * RETURNS: + * %true if @work was pending, %false otherwise. */ -int cancel_work_sync(struct work_struct *work) +bool cancel_work_sync(struct work_struct *work) { return __cancel_work_timer(work, NULL); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** - * cancel_delayed_work_sync - reliably kill off a delayed work. - * @dwork: the delayed work struct + * flush_delayed_work - wait for a dwork to finish executing the last queueing + * @dwork: the delayed work to flush + * + * Delayed timer is cancelled and the pending work is queued for + * immediate execution. Like flush_work(), this function only + * considers the last queueing instance of @dwork. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) + __queue_work(raw_smp_processor_id(), + get_work_cwq(&dwork->work)->wq, &dwork->work); + return flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** + * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish + * @dwork: the delayed work cancel * - * Returns true if @dwork was pending. + * This is cancel_work_sync() for delayed works. * - * It is possible to use this function if @dwork rearms itself via queue_work() - * or queue_delayed_work(). See also the comment for cancel_work_sync(). + * RETURNS: + * %true if @dwork was pending, %false otherwise. */ -int cancel_delayed_work_sync(struct delayed_work *dwork) +bool cancel_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_timer(&dwork->work, &dwork->timer); } @@ -2558,23 +2588,6 @@ int schedule_delayed_work(struct delayed_work *dwork, } EXPORT_SYMBOL(schedule_delayed_work); -/** - * flush_delayed_work - block until a dwork_struct's callback has terminated - * @dwork: the delayed work which is to be flushed - * - * Any timeout is cancelled, and any pending work is run immediately. - */ -void flush_delayed_work(struct delayed_work *dwork) -{ - if (del_timer_sync(&dwork->timer)) { - __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, - &dwork->work); - put_cpu(); - } - flush_work(&dwork->work); -} -EXPORT_SYMBOL(flush_delayed_work); - /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use -- cgit v1.2.3-18-g5258 From baf59022c37d43f202e62d5130e4bac5e825b426 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Sep 2010 10:42:16 +0200 Subject: workqueue: factor out start_flush_work() Factor out start_flush_work() from flush_work(). start_flush_work() has @wait_executing argument which controls whether the barrier is queued only if the work is pending or also if executing. As flush_work() needs to wait for execution too, it uses %true. This commit doesn't cause any behavior difference. start_flush_work() will be used to implement flush_work_sync(). 
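For context, the flush_work_sync() that lands later in this series (see the flush[_delayed]_work_sync patch below) builds on start_flush_work() roughly like this; the sketch mirrors that later hunk and passes @wait_executing == false because execution is waited for separately via wait_on_work():

bool flush_work_sync(struct work_struct *work)
{
        struct wq_barrier barr;
        bool pending, waited;

        /* we'll wait for executions separately, queue barr only if pending */
        pending = start_flush_work(work, &barr, false);

        /* wait for executions to finish */
        waited = wait_on_work(work);

        /* wait for the pending one */
        if (pending) {
                wait_for_completion(&barr.done);
                destroy_work_on_stack(&barr.work);
        }

        return pending || waited;
}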
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 64 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1240b9d94b0..33d31d76870 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2326,35 +2326,17 @@ out_unlock: } EXPORT_SYMBOL_GPL(flush_workqueue); -/** - * flush_work - wait for a work to finish executing the last queueing instance - * @work: the work to flush - * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. - * - * RETURNS: - * %true if flush_work() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work(struct work_struct *work) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, + bool wait_executing) { struct worker *worker = NULL; struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct wq_barrier barr; might_sleep(); gcwq = get_work_gcwq(work); if (!gcwq) - return 0; + return false; spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { @@ -2367,26 +2349,54 @@ bool flush_work(struct work_struct *work) cwq = get_work_cwq(work); if (unlikely(!cwq || gcwq != cwq->gcwq)) goto already_gone; - } else { + } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; cwq = worker->current_cwq; - } + } else + goto already_gone; - insert_wq_barrier(cwq, &barr, work, worker); + insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); lock_map_acquire(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); - - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); return true; already_gone: spin_unlock_irq(&gcwq->lock); return false; } + +/** + * flush_work - wait for a work to finish executing the last queueing instance + * @work: the work to flush + * + * Wait until @work has finished execution. This function considers + * only the last queueing instance of @work. If @work has been + * enqueued across different CPUs on a non-reentrant workqueue or on + * multiple workqueues, @work might still be executing on return on + * some of the CPUs from earlier queueing. + * + * If @work was queued only on a non-reentrant, ordered or unbound + * workqueue, @work is guaranteed to be idle on return if it hasn't + * been requeued since flush started. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. 
+ */ +bool flush_work(struct work_struct *work) +{ + struct wq_barrier barr; + + if (start_flush_work(work, &barr, true)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else + return false; +} EXPORT_SYMBOL_GPL(flush_work); static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -- cgit v1.2.3-18-g5258 From 09383498c5d35262e643bfdbae84826177a3c624 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Sep 2010 10:48:29 +0200 Subject: workqueue: implement flush[_delayed]_work_sync() Implement flush[_delayed]_work_sync(). These are flush functions which also make sure no CPU is still executing the target work from earlier queueing instances. These are similar to cancel[_delayed]_work_sync() except that the target work item is flushed instead of cancelled. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33d31d76870..19e4bc15ee9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2435,6 +2435,41 @@ static bool wait_on_work(struct work_struct *work) return ret; } +/** + * flush_work_sync - wait until a work has finished execution + * @work: the work to flush + * + * Wait until @work has finished execution. On return, it's + * guaranteed that all queueing instances of @work which happened + * before this function is called are finished. In other words, if + * @work hasn't been requeued since this function was called, @work is + * guaranteed to be idle on return. + * + * RETURNS: + * %true if flush_work_sync() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_work_sync(struct work_struct *work) +{ + struct wq_barrier barr; + bool pending, waited; + + /* we'll wait for executions separately, queue barr only if pending */ + pending = start_flush_work(work, &barr, false); + + /* wait for executions to finish */ + waited = wait_on_work(work); + + /* wait for the pending one */ + if (pending) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } + + return pending || waited; +} +EXPORT_SYMBOL_GPL(flush_work_sync); + /* * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. @@ -2538,6 +2573,27 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +/** + * flush_delayed_work_sync - wait for a dwork to finish + * @dwork: the delayed work to flush + * + * Delayed timer is cancelled and the pending work is queued for + * execution immediately. Other than timer handling, its behavior + * is identical to flush_work_sync(). + * + * RETURNS: + * %true if flush_work_sync() waited for the work to finish execution, + * %false if it was already idle. 
+ */ +bool flush_delayed_work_sync(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) + __queue_work(raw_smp_processor_id(), + get_work_cwq(&dwork->work)->wq, &dwork->work); + return flush_work_sync(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work_sync); + /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel -- cgit v1.2.3-18-g5258 From 676cb02dc32adef13d9efb5ea52079e4ede1e3ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jul 2009 23:33:49 +0200 Subject: softirqs: Make wakeup_softirqd static No users outside of kernel/softirq.c Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73..80f6e3bd1d2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -void wakeup_softirqd(void) +static void wakeup_softirqd(void) { /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __get_cpu_var(ksoftirqd); -- cgit v1.2.3-18-g5258 From 99a51792dbb7e6e39b2a4ebcfe202f1dcc7354c4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 4 Sep 2010 18:52:50 -0700 Subject: kernel/pm_qos_params.c: Remove unnecessary casts of private_data Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- kernel/pm_qos_params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 996a4dec5f9..49829c5b4b4 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -394,7 +394,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else return -EINVAL; - pm_qos_req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_req = filp->private_data; pm_qos_update_request(pm_qos_req, value); return count; -- cgit v1.2.3-18-g5258 From db71922217a214e5c9268448e537b54fc1f301ea Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 15 Aug 2010 22:51:10 +0200 Subject: BKL: Explicitly add BKL around get_sb/fill_super This patch is a preparation necessary to remove the BKL from do_new_mount(). It explicitly adds calls to lock_kernel()/unlock_kernel() around get_sb/fill_super operations for filesystems that still uses the BKL. I've read through all the code formerly covered by the BKL inside do_kern_mount() and have satisfied myself that it doesn't need the BKL any more. do_kern_mount() is already called without the BKL when mounting the rootfs and in nfsctl. do_kern_mount() calls vfs_kern_mount(), which is called from various places without BKL: simple_pin_fs(), nfs_do_clone_mount() through nfs_follow_mountpoint(), afs_mntpt_do_automount() through afs_mntpt_follow_link(). Both later functions are actually the filesystems follow_link inode operation. vfs_kern_mount() is calling the specified get_sb function and lets the filesystem do its job by calling the given fill_super function. Therefore I think it is safe to push down the BKL from the VFS to the low-level filesystems get_sb/fill_super operation. 
[arnd: do not add the BKL to those file systems that already don't use it elsewhere] Signed-off-by: Jan Blunck Signed-off-by: Arnd Bergmann Cc: Matthew Wilcox Cc: Christoph Hellwig --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f614..a7ba3bccadc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1430,6 +1430,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *new_root; + lock_kernel(); + /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); @@ -1559,6 +1561,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); + unlock_kernel(); return 0; drop_new_super: @@ -1568,6 +1571,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); + unlock_kernel(); return ret; } -- cgit v1.2.3-18-g5258 From 38d018dba3f725b969f196550d92a6ec1c092428 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Wed, 24 Feb 2010 13:25:34 +0100 Subject: BKL: Remove BKL from cgroup The BKL is only used in remount_fs and get_sb that are both protected by the superblocks s_umount rw_semaphore. Therefore it is safe to remove the BKL entirely. Signed-off-by: Jan Blunck Signed-off-by: Arnd Bergmann --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7ba3bccadc..304d2775994 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include /* TODO: replace with more sophisticated array */ @@ -1222,7 +1221,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; - lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1255,7 +1253,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - unlock_kernel(); return ret; } @@ -1430,8 +1427,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *new_root; - lock_kernel(); - /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); @@ -1561,7 +1556,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - unlock_kernel(); return 0; drop_new_super: @@ -1571,8 +1565,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - unlock_kernel(); - return ret; } -- cgit v1.2.3-18-g5258 From 97bd234701b2b39a0e749c1fe0e44f1d14c94292 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:41:14 +0200 Subject: workqueue: prepare for more tracepoints Define workqueue_work event class and use it for workqueue_execute_end trace point. Also, move trace/events/workqueue.h include downwards such that all struct definitions are visible to it. This is to prepare for more tracepoints and doesn't cause any functional change. 
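The workqueue_work event class itself lives in include/trace/events/workqueue.h, which this 'kernel/'-limited view filters out. As a rough, paraphrased sketch (not the verbatim header; the usual TRACE_SYSTEM and define_trace.h boilerplate is omitted), the shared class and the converted workqueue_execute_end look like:

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

/* one class shared by every tracepoint that takes a work_struct pointer */
DECLARE_EVENT_CLASS(workqueue_work,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field(void *, work)
        ),

        TP_fast_assign(
                __entry->work = work;
        ),

        TP_printk("work struct %p", __entry->work)
);

/* workqueue_execute_end reuses the class instead of a full TRACE_EVENT() */
DEFINE_EVENT(workqueue_work, workqueue_execute_end,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work)
);

Further work tracepoints can then be added as one-line DEFINE_EVENT() instances of the same class.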
Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 19e4bc15ee9..026f778e879 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -42,9 +42,6 @@ #include #include -#define CREATE_TRACE_POINTS -#include - #include "workqueue_sched.h" enum { @@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +#define CREATE_TRACE_POINTS +#include + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) -- cgit v1.2.3-18-g5258 From cdadf0097cdca06c497ffaeb5982e028c6e4ed38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:49:55 +0200 Subject: workqueue: add queue_work and activate_work trace points These two tracepoints allow tracking when and how a work is queued and activated. This patch is based on Frederic's patch to add queue_work trace point. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 026f778e879..cb2ccfbed0c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -997,6 +997,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(cpu, cwq, work); BUG_ON(!list_empty(&work->entry)); @@ -1004,6 +1005,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, work_flags = work_color_to_flags(cwq->work_color); if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); cwq->nr_active++; worklist = gcwq_determine_ins_pos(gcwq, cwq); } else { @@ -1679,6 +1681,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) struct work_struct, entry); struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; -- cgit v1.2.3-18-g5258 From 30310045dd20a286cf3800f063f79b468e132fb1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Oct 2010 11:51:57 +0200 Subject: workqueue: fix HIGHPRI handling in keep_working() The policy function keep_working() didn't check GCWQ_HIGHPRI_PENDING and could return %false with highpri work pending. This could lead to late execution of a highpri work which was delayed due to @max_active throttling if other works are actively consuming CPU cycles. For example, the following could happen. 1. Work W0 which burns CPU cycles. 2. Two works W1 and W2 are queued to a highpri wq w/ @max_active of 1. 3. W1 starts executing and W2 is put to delayed queue. W0 and W1 are both runnable. 4. W1 finishes which puts W2 to pending queue but keep_working() incorrectly returns %false and the worker goes to sleep. 5. W0 finishes and W2 starts execution. With this patch applied, W2 starts execution as soon as W1 finishes. 
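For reference, a hypothetical and deliberately simplified module that sets up the queueing pattern from steps 1-3 could look like the sketch below (all names are invented, and a real reproducer would additionally have to keep W0, W1 and W2 on the same CPU):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

static struct workqueue_struct *hipri_wq;
static struct work_struct w0, w1, w2;

/* W0: burns CPU cycles so the gcwq stays busy */
static void burn_cpu(struct work_struct *work)
{
        mdelay(100);
}

/* W1 and W2: the highpri works */
static void hipri_func(struct work_struct *work)
{
        mdelay(10);
}

static int __init hipri_test_init(void)
{
        /* @max_active == 1: W2 is put on the delayed list while W1 runs */
        hipri_wq = alloc_workqueue("hipri_test", WQ_HIGHPRI, 1);
        if (!hipri_wq)
                return -ENOMEM;

        INIT_WORK(&w0, burn_cpu);
        INIT_WORK(&w1, hipri_func);
        INIT_WORK(&w2, hipri_func);

        schedule_work(&w0);             /* step 1 */
        queue_work(hipri_wq, &w1);      /* step 2 */
        queue_work(hipri_wq, &w2);      /* step 2/3 */
        return 0;
}

static void __exit hipri_test_exit(void)
{
        destroy_workqueue(hipri_wq);
}

module_init(hipri_test_init);
module_exit(hipri_test_exit);
MODULE_LICENSE("GPL");

Without the fix, the worker may go idle after W1 finishes even though W2 is already pending on the highpri list.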
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cb2ccfbed0c..b57a8babdec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -604,7 +604,9 @@ static bool keep_working(struct global_cwq *gcwq) { atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; + return !list_empty(&gcwq->worklist) && + (atomic_read(nr_running) <= 1 || + gcwq->flags & GCWQ_HIGHPRI_PENDING); } /* Do we need a new worker? Called from manager. */ -- cgit v1.2.3-18-g5258 From 6370a6ad3b53df90b4700977f7718118a2cd524a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Oct 2010 15:12:27 +0200 Subject: workqueue: add and use WQ_MEM_RECLAIM flag Add WQ_MEM_RECLAIM flag which currently maps to WQ_RESCUER, mark WQ_RESCUER as internal and replace all external WQ_RESCUER usages to WQ_MEM_RECLAIM. This makes the API users express the intent of the workqueue instead of indicating the internal mechanism used to guarantee forward progress. This is also to make it cleaner to add more semantics to WQ_MEM_RECLAIM. For example, if deemed necessary, memory reclaim workqueues can be made highpri. This patch doesn't introduce any functional change. Signed-off-by: Tejun Heo Cc: Jeff Garzik Cc: Dave Chinner Cc: Steven Whitehouse --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b57a8babdec..2c6871cbcbe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2847,6 +2847,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, struct workqueue_struct *wq; unsigned int cpu; + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. + */ + if (flags & WQ_MEM_RECLAIM) + flags |= WQ_RESCUER; + /* * Unbound workqueues aren't concurrency managed and should be * dispatched to workers immediately. -- cgit v1.2.3-18-g5258 From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 15 Aug 2010 18:52:59 +0200 Subject: llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. 
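As a concrete, hypothetical illustration of what the conversion produces (driver and symbol names invented), before the semantic patch itself: a read() handler that goes through *ppos matches the "read accesses f_pos" rule below and therefore gains default_llseek; a handler that never touches the offset would instead get noop_llseek, and seq_read-based files get seq_lseek.

#include <linux/fs.h>
#include <linux/module.h>

static const char foo_msg[] = "hello\n";

static ssize_t foo_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        /* hands *ppos to a helper, so the file offset is really used */
        return simple_read_from_buffer(buf, count, ppos,
                                       foo_msg, sizeof(foo_msg) - 1);
}

static const struct file_operations foo_fops = {
        .owner  = THIS_MODULE,
        .read   = foo_read,
        .llseek = default_llseek,       /* the line this conversion adds */
};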
===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... 
+.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... 
+.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann Cc: Julia Lawall Cc: Christoph Hellwig --- kernel/configs.c | 1 + kernel/gcov/fs.c | 1 + kernel/kprobes.c | 1 + kernel/pm_qos_params.c | 1 + kernel/profile.c | 1 + kernel/trace/blktrace.c | 2 ++ kernel/trace/ftrace.c | 2 ++ kernel/trace/ring_buffer.c | 1 + kernel/trace/trace_events.c | 6 ++++++ kernel/trace/trace_stack.c | 1 + 10 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index abaee684ecb..b4066b44a99 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf, static const struct file_operations ikconfig_file_ops = { .owner = THIS_MODULE, .read = ikconfig_read_current, + .llseek = default_llseek, }; static int __init ikconfig_init(void) diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index f83972b1656..9bd0934f6c3 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len, static const struct file_operations gcov_reset_fops = { .write = reset_write, .read = reset_read, + .llseek = noop_llseek, }; /* diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae9..8b5ff2655ae 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1992,6 +1992,7 @@ static ssize_t write_enabled_file_bool(struct file *file, static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, + .llseek = default_llseek, }; static int __kprobes debugfs_kprobe_init(void) diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 645e541a45f..a96b850ba08 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = { .write = pm_qos_power_write, .open = pm_qos_power_open, .release = pm_qos_power_release, + .llseek = noop_llseek, }; /* unlocked internal variant */ diff --git a/kernel/profile.c b/kernel/profile.c index b22a899934c..66f841b7fbd 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, static const struct file_operations proc_profile_operations = { .read = read_profile, .write = write_profile, + .llseek = default_llseek, }; #ifdef CONFIG_SMP diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc..2d5f3a75731 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -326,6 +326,7 @@ static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = blk_dropped_open, .read = blk_dropped_read, + .llseek = default_llseek, }; static int blk_msg_open(struct inode *inode, struct file *filp) @@ -365,6 +366,7 @@ static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = blk_msg_open, .write = blk_msg_write, + .llseek = noop_llseek, }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe..5e1ad476309 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, .write = ftrace_profile_write, + .llseek = default_llseek, }; /* used to initialize the real stat files */ @@ -2632,6 +2633,7 @@ static const struct file_operations ftrace_graph_fops = { .read = seq_read, .write = ftrace_graph_write, .release = 
ftrace_graph_release, + .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 492197e2f86..3aea966d16d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3965,6 +3965,7 @@ static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, + .llseek = default_llseek, }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f14632..0369c5e0998 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -951,6 +951,7 @@ static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, .write = event_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { @@ -963,29 +964,34 @@ static const struct file_operations ftrace_event_format_fops = { static const struct file_operations ftrace_event_id_fops = { .open = tracing_open_generic, .read = event_id_read, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_generic, .read = event_filter_read, .write = event_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = tracing_open_generic, .read = subsystem_filter_read, .write = subsystem_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_system_enable_fops = { .open = tracing_open_generic, .read = system_enable_read, .write = system_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, + .llseek = default_llseek, }; static struct dentry *event_trace_events_dir(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a6b7e0e0f3e..4c5dead0c23 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, + .llseek = default_llseek, }; static void * -- cgit v1.2.3-18-g5258 From 31ddd871fc3db73e2024cb3eb3ee5051edf5a80f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 11:14:49 +0200 Subject: workqueue: Clarify that schedule_on_each_cpu is synchronous The documentation for schedule_on_each_cpu() states that it calls a function on each online CPU from keventd. This can easily be interpreted as an asyncronous call because the description does not mention that flush_work is called. Clarify that it is synchronous. tj: rephrased a bit Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2c6871cbcbe..eb5c1972443 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2676,13 +2676,15 @@ int schedule_delayed_work_on(int cpu, EXPORT_SYMBOL(schedule_delayed_work_on); /** - * schedule_on_each_cpu - call a function on each online CPU from keventd + * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * - * Returns zero on success. - * Returns -ve errno on failure. 
- * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) { -- cgit v1.2.3-18-g5258 From daaae6b010ac0f60c9c35e481589966f9f1fcc22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 11:28:15 +0200 Subject: workqueue: remove in_workqueue_context() Commit a25909a4 (lockdep: Add an in_workqueue_context() lockdep-based test function) added in_workqueue_context() but there hasn't been any in-kernel user and the lockdep annotation in workqueue is scheduled to change. Remove the unused function. Signed-off-by: Tejun Heo Cc: Paul E. McKenney --- kernel/workqueue.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eb5c1972443..30acdb74cc2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -310,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_NONE; \ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) -#ifdef CONFIG_LOCKDEP -/** - * in_workqueue_context() - in context of specified workqueue? - * @wq: the workqueue of interest - * - * Checks lockdep state to see if the current task is executing from - * within a workqueue item. This function exists only if lockdep is - * enabled. - */ -int in_workqueue_context(struct workqueue_struct *wq) -{ - return lock_is_held(&wq->lockdep_map); -} -#endif - #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; -- cgit v1.2.3-18-g5258 From 0fc86c7bd924debd0bddee790ecc884604fdcc63 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 11 Sep 2010 20:11:08 +0200 Subject: rtmutex-tester: make it build without BKL The big kernel lock is going away, so make sure that if it is disabled by Kconfig, we do not try to validate it, which would result in compile errors. Signed-off-by: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arjan van de Ven Cc: Andrew Morton --- kernel/rtmutex-tester.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index a56f629b057..66cb89bc5ef 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) } if (!lockwakeup && td->bkl == 4) { +#ifdef CONFIG_LOCK_KERNEL unlock_kernel(); +#endif td->bkl = 0; } return 0; @@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) if (td->bkl) return 0; td->bkl = 1; +#ifdef CONFIG_LOCK_KERNEL lock_kernel(); +#endif td->bkl = 4; return 0; case RTTEST_UNLOCKBKL: if (td->bkl != 4) break; +#ifdef CONFIG_LOCK_KERNEL unlock_kernel(); +#endif td->bkl = 0; return 0; -- cgit v1.2.3-18-g5258 From 01b284f9b6d51cc3f3bcf3b49f16d2601d3ca22d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Sep 2010 20:39:22 +0200 Subject: blktrace: remove the big kernel lock According to Jens, this code does not need the BKL at all, it is sufficiently serialized by bd_mutex. 
Signed-off-by: Arnd Bergmann Cc: Jens Axboe Cc: Steven Rostedt --- kernel/trace/blktrace.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc..5328e8779d4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - lock_kernel(); mutex_lock(&bdev->bd_mutex); switch (cmd) { @@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); return ret; } @@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct block_device *bdev; ssize_t ret = -ENXIO; - lock_kernel(); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1683,8 +1679,7 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); +out: return ret; } @@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = -ENXIO; - lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1753,8 +1747,6 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); out: return ret ? ret : count; } -- cgit v1.2.3-18-g5258 From 747e94ae3d1b4c9bf5380e569f614eb9040b79e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Oct 2010 13:51:48 -0400 Subject: ring-buffer: Make write slow path out of line Gcc inlines the slow path of the ring buffer write which can hurt performance. This patch simply forces the slow path function rb_move_tail() to always be a function. The ring_buffer_benchmark module with reader_disabled=1 shows that this patch changes the time to record an event from 135 ns to 132 ns. (3 ns or 2.22% improvement) Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bca96377fd4..0b88df849a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1823,7 +1823,10 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, struct buffer_page *tail_page, u64 *ts) @@ -1943,7 +1946,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) + if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, tail_page, ts); -- cgit v1.2.3-18-g5258 From e8bc43e84fada397af1b677b07dbf26e6ac78fcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 10:58:02 -0400 Subject: ring-buffer: Pass timestamp by value and not by reference The original code for the ring buffer had locations that modified the timestamp and that change was used by the callers. 
Now, the timestamp is not reused by the callers and there is no reason to pass it by reference. By changing the call to pass by value, lets gcc optimize the code a bit more where it can store the timestamp in a register and not worry about updating the reference. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b88df849a5..c8ce6bde7fa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1829,7 +1829,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 *ts) + struct buffer_page *tail_page, u64 ts) { struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; @@ -1912,8 +1912,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer); - next_page->page->time_stamp = *ts; + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; } out_again: @@ -1932,7 +1932,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) + unsigned type, unsigned long length, u64 ts) { struct buffer_page *tail_page; struct ring_buffer_event *event; @@ -1965,7 +1965,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. */ if (!tail) - tail_page->page->time_stamp = *ts; + tail_page->page->time_stamp = ts; return event; } @@ -2008,7 +2008,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 *ts, u64 *delta) + u64 ts, u64 *delta) { struct ring_buffer_event *event; int ret; @@ -2016,7 +2016,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, WARN_ONCE(*delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)*delta, - (unsigned long long)*ts, + (unsigned long long)ts, (unsigned long long)cpu_buffer->write_stamp); /* @@ -2051,7 +2051,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = 0; } } - cpu_buffer->write_stamp = *ts; + cpu_buffer->write_stamp = ts; /* let the caller know this was the commit */ ret = 1; } else { @@ -2175,7 +2175,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + commit = rb_add_time_stamp(cpu_buffer, ts, &delta); if (commit == -EBUSY) goto out_fail; @@ -2187,7 +2187,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, } get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, ts); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; -- cgit v1.2.3-18-g5258 From f25106aeab7408394b9dd707e5ecf557e269c723 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 12:40:12 -0400 Subject: ring-buffer: Pass delta by value and not by reference The delta between events is passed to the timestamp code by reference and the timestamp code will reset the value. But it can be reset from the caller. No need to pass it in by reference. 
By changing the call to pass by value, lets gcc optimize the code a bit more where it can store the delta in a register and not worry about updating the reference. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c8ce6bde7fa..3af77cd47f2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2008,14 +2008,14 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 ts, u64 *delta) + u64 ts, u64 delta) { struct ring_buffer_event *event; int ret; - WARN_ONCE(*delta > (1ULL << 59), + WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, + (unsigned long long)delta, (unsigned long long)ts, (unsigned long long)cpu_buffer->write_stamp); @@ -2041,8 +2041,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, * and if we can't just make it zero. */ if (rb_event_index(event)) { - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; } else { /* try to discard, since we do not need this */ if (!rb_try_to_discard(cpu_buffer, event)) { @@ -2064,8 +2064,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, ret = 0; } - *delta = 0; - return ret; } @@ -2175,7 +2173,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - commit = rb_add_time_stamp(cpu_buffer, ts, &delta); + commit = rb_add_time_stamp(cpu_buffer, ts, delta); + delta = 0; + if (commit == -EBUSY) goto out_fail; -- cgit v1.2.3-18-g5258 From 69d1b839f7eee347e357b3f6cce7f630cc6ff93d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Oct 2010 18:18:05 -0400 Subject: ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. 
When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++--------------------- 1 file changed, 142 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3af77cd47f2..f50f43107e9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -224,6 +224,9 @@ enum { RB_LEN_TIME_STAMP = 16, }; +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) return length + RB_EVNT_HDR_SIZE; } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { @@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) return 0; } +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - unsigned length = rb_event_length(event); + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; @@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) @@ -1546,6 +1583,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? 
*/ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + /** * ring_buffer_update_event - update event type and data * @event: the even to update @@ -1558,28 +1614,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * data field. */ static void -rb_update_event(struct ring_buffer_event *event, - unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) { - event->type_len = type; - - switch (type) { - - case RINGBUF_TYPE_PADDING: - case RINGBUF_TYPE_TIME_EXTEND: - case RINGBUF_TYPE_TIME_STAMP: - break; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; - case 0: - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - event->array[0] = length; - else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); - break; - default: - BUG(); + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. + */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } /* @@ -1932,12 +1991,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 ts) + unsigned long length, u64 ts, + u64 delta, int add_timestamp) { struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1954,11 +2022,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(event, type, length); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update @@ -1980,7 +2046,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long addr; new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); + old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; @@ -2006,67 +2072,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 ts, u64 delta) -{ - struct ring_buffer_event *event; - int ret; - - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); - - /* - * The delta is too big, we to add a - * new timestamp. - */ - event = __rb_reserve_next(cpu_buffer, - RINGBUF_TYPE_TIME_EXTEND, - RB_LEN_TIME_EXTEND, - ts); - if (!event) - return -EBUSY; - - if (PTR_ERR(event) == -EAGAIN) - return -EAGAIN; - - /* Only a commited time event can update the write stamp */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * If this is the first on the page, then it was - * updated with the page itself. Try to discard it - * and if we can't just make it zero. - */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* try to discard, since we do not need this */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - } - cpu_buffer->write_stamp = ts; - /* let the caller know this was the commit */ - ret = 1; - } else { - /* Try to discard the event */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - } - ret = 0; - } - - return ret; -} - static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); @@ -2111,9 +2116,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta = 0; - int commit = 0; + u64 ts, delta; int nr_loops = 0; + int add_timestamp; rb_start_commit(cpu_buffer); @@ -2134,6 +2139,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, length = rb_calculate_event_length(length); again: + add_timestamp = 0; + delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2172,33 +2180,24 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { - - commit = rb_add_time_stamp(cpu_buffer, ts, delta); - delta = 0; - - if (commit == -EBUSY) - goto out_fail; - - if (commit == -EAGAIN) - goto again; - - RB_WARN_ON(cpu_buffer, commit < 0); + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp); + add_timestamp = 1; } } get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, ts); + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) goto out_fail; - if (!rb_event_is_commit(cpu_buffer, event)) - delta = 0; - - event->time_delta = delta; - return event; out_fail: @@ -2311,12 +2310,28 @@ static void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { + u64 delta; + /* * The event first in the commit queue updates the * time stamp. 
*/ - if (rb_event_is_commit(cpu_buffer, event)) - cpu_buffer->write_stamp += event->time_delta; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, @@ -2356,6 +2371,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; @@ -3043,12 +3061,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written, or from discarded - * commits. The most that we can have is the number on a single page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -3124,14 +3142,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a timestamp is encountered. - * We can get multiple timestamps by nested interrupts or also - * if filtering is on (discarding commits). Since discarding - * commits can be frequent we can get a lot of timestamps. - * But we limit them by not adding timestamps if they begin - * at the start of a page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -3829,7 +3845,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len > (commit - read)) len = (commit - read); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); if (len < size) goto out_unlock; @@ -3851,7 +3868,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, break; event = rb_reader_event(cpu_buffer); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); } while (len > size); /* update bpage */ -- cgit v1.2.3-18-g5258 From 140ff89127c74b1b1c1b0152a36ea3720ccf6bc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Oct 2010 10:50:30 -0400 Subject: ring-buffer: Remove condition to add timestamp in fast path There's a condition to check if we should add a time extend or not in the fast path. 
But this condition is racey (in the sense that we can add a unnecessary time extend, but nothing that can break anything). We later check if the time or event time delta should be zero or have real data in it (not racey), making this first check redundant. This check may help save space once in a while, but really is not worth the hassle to try to save some space that happens at most 134 ms at a time. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f50f43107e9..d9f3e7a8213 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2119,6 +2119,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, u64 ts, delta; int nr_loops = 0; int add_timestamp; + u64 diff; rb_start_commit(cpu_buffer); @@ -2155,29 +2156,13 @@ rb_reserve_next_event(struct ring_buffer *buffer, goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; - /* - * Only the first commit can update the timestamp. - * Yes there is a race here. If an interrupt comes in - * just after the conditional and it traces too, then it - * will also check the deltas. More than one timestamp may - * also be made. But only the entry that did the actual - * commit will be something other than zero. - */ - if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer))) { - u64 diff; - - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (unlikely(ts < cpu_buffer->write_stamp)) - goto get_event; + /* make sure this diff is calculated here */ + barrier(); + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), @@ -2189,7 +2174,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, } } - get_event: event = __rb_reserve_next(cpu_buffer, length, ts, delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) -- cgit v1.2.3-18-g5258 From d9abde2138e0a00a0d7e44676928efa0ef629d48 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 13:17:08 -0400 Subject: ring-buffer: Micro-optimize with some strategic inlining By using inline and noinline, we are able to make the fast path of recording an event 4% faster. 
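As a hedged illustration of that inline/noinline split outside the kernel, with invented names (recursion_enter, report_overflow) standing in for the real trace_recursive_lock()/trace_recursive_fail() pair; only the cold-path-out-of-line pattern is the point:

    #include <stdio.h>

    /* Cold failure path: keep it out of line so the hot path stays small. */
    static __attribute__((noinline)) void report_overflow(int depth)
    {
        fprintf(stderr, "recursion too deep: %d\n", depth);
    }

    /* Hot path: a cheap check that almost always succeeds, inlined at the caller. */
    static inline int recursion_enter(int *depth, int max)
    {
        if (++*depth < max)
            return 0;
        report_overflow(*depth);
        return -1;
    }

    int main(void)
    {
        int depth = 0;

        for (int i = 0; i < 20; i++) {
            if (recursion_enter(&depth, 16)) {
                puts("bailing out");
                break;
            }
        }
        return 0;
    }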
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9f3e7a8213..f5007d0d932 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2078,7 +2078,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2193,13 +2193,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void) { - current->trace_recursion++; - - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) - return 0; - /* Disable all tracing before we do anything else */ tracing_off_permanent(); @@ -2211,10 +2207,21 @@ static int trace_recursive_lock(void) in_nmi()); WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + trace_recursive_fail(); + return -1; } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void) { WARN_ON_ONCE(!current->trace_recursion); -- cgit v1.2.3-18-g5258 From b8b2663bd7c9da04ac804659b9f617c199d0252c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Oct 2010 13:23:25 -0400 Subject: ring-buffer: Remove unused macro RB_TIMESTAMPS_PER_PAGE With the binding of time extends to events we no longer need to use the macro RB_TIMESTAMPS_PER_PAGE. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5007d0d932..ad25490f8b4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -441,9 +441,6 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) - int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; -- cgit v1.2.3-18-g5258 From dd49a38cf30944be27892c10b1c0e5b3fa73bcb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Oct 2010 21:51:26 -0400 Subject: tracing: Do not limit the size of the number of CPU buffers The tracing per_cpu buffers were limited to 999 CPUs for a mear savings in stack space of a char array. Up the array to 30 characters which is more than enough to hold a 64 bit number. 
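A quick standalone check of that sizing, assuming a 64-bit long as on the machines in question; cpu_dir here is just a local copy of the idea, not the kernel code:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
        char cpu_dir[30];            /* same size the patch below settles on */
        long cpu = LONG_MAX;         /* worst case: "cpu" + 19 digits + NUL */

        int n = snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
        printf("\"%s\" needs %d bytes, buffer has %zu\n",
               cpu_dir, n + 1, sizeof(cpu_dir));
        return 0;
    }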
Reported-by: Robin Holt Suggested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 001bcd2ccf4..82d9b8106cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); struct dentry *d_cpu; - /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ - char cpu_dir[7]; + char cpu_dir[30]; /* 30 characters should be more than enough */ - if (cpu > 999 || cpu < 0) - return; - - sprintf(cpu_dir, "cpu%ld", cpu); + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); -- cgit v1.2.3-18-g5258 From f4bc6bb2d562703eafc895c37e7be20906de139d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Oct 2010 15:00:13 +0200 Subject: tracing: Cleanup the convoluted softirq tracepoints With the addition of trace_softirq_raise() the softirq tracepoint got even more convoluted. Why the tracepoints take two pointers to assign an integer is beyond my comprehension. But adding an extra case which treats the first pointer as an unsigned long when the second pointer is NULL including the back and forth type casting is just horrible. Convert the softirq tracepoints to take a single unsigned int argument for the softirq vector number and fix the call sites. Signed-off-by: Thomas Gleixner LKML-Reference: Acked-by: Peter Zijlstra Acked-by: mathieu.desnoyers@efficios.com Cc: Frederic Weisbecker Cc: Steven Rostedt --- kernel/softirq.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73..b3cb1dc1579 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -212,18 +212,20 @@ restart: do { if (pending & 1) { + unsigned int vec_nr = h - softirq_vec; int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - trace_softirq_entry(h, softirq_vec); + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); h->action(h); - trace_softirq_exit(h, softirq_vec); + trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" + printk(KERN_ERR "huh, entered softirq %u %s %p" "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); + " exited with %08x?\n", vec_nr, + softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3-18-g5258 From b2b5ce022acf5e9f52f7b78c5579994fdde191d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Oct 2010 15:24:15 +0200 Subject: sched, cgroup: Fixup broken cgroup movement Dima noticed that we fail to correct the ->vruntime of sleeping tasks when we move them between cgroups. 
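The patch below handles this by rebasing a sleeping task's vruntime from the old cfs_rq's min_vruntime onto the new one's. A hedged sketch of just that arithmetic, with made-up numbers and a simplified struct:

    #include <stdio.h>

    typedef unsigned long long u64;

    struct cfs_rq { u64 min_vruntime; };

    /* Carry a sleeping task's vruntime from one runqueue's clock to another's. */
    static u64 move_vruntime(u64 vruntime, const struct cfs_rq *from,
                             const struct cfs_rq *to)
    {
        vruntime -= from->min_vruntime;   /* make it relative to the old rq */
        vruntime += to->min_vruntime;     /* re-anchor it on the new rq */
        return vruntime;
    }

    int main(void)
    {
        struct cfs_rq old_rq = { .min_vruntime = 1000000 };
        struct cfs_rq new_rq = { .min_vruntime = 5000000 };
        u64 v = 1004200;                  /* sleeping task, absolute on the old rq */

        printf("vruntime %llu -> %llu\n", v, move_vruntime(v, &old_rq, &new_rq));
        return 0;
    }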
Reported-by: Dima Zavin Signed-off-by: Peter Zijlstra Tested-by: Mike Galbraith LKML-Reference: <1287150604.29097.1513.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- kernel/sched_fair.c | 25 +++++++++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5998222f901..3fe253e6a6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8498,12 +8498,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - set_task_rq(tsk, task_cpu(tsk)); - #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk, on_rq); + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else #endif + set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 74cccfae87a..3acc2a487c1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3866,13 +3866,26 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + */ + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); if (!on_rq) - place_entity(cfs_rq, &p->se, 1); + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } #endif @@ -3924,7 +3937,7 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, + .task_move_group = task_move_group_fair, #endif }; -- cgit v1.2.3-18-g5258 From 9ffcfa6f1f63eeac15555b745c292eb9f59130f6 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 20 Oct 2010 15:25:01 +0200 Subject: perf_events: Revert: Fix transaction recovery in group_sched_in() This patch reverts commit 8e5fc1a (perf_events: Fix transaction recovery in group_sched_in()) because it had one flaw in case the group could never be scheduled. It would cause time_enabled to get negative. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cbeeeb7.0aefd80a.6e40.0e2f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 76 +++++++++-------------------------------------------- 1 file changed, 13 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f309e8014c7..39afdb07d75 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event) return event->cpu == -1 || event->cpu == smp_processor_id(); } -static int -__event_sched_out(struct perf_event *event, +static void +event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event, } if (event->state != PERF_EVENT_STATE_ACTIVE) - return 0; + return; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } + event->tstamp_stopped = ctx->time; event->pmu->del(event, 0); event->oncpu = -1; @@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - return 1; -} - -static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - int ret; - - ret = __event_sched_out(event, cpuctx, ctx); - if (ret) - event->tstamp_stopped = ctx->time; } static void @@ -664,7 +652,7 @@ retry: } static int -__event_sched_in(struct perf_event *event, +event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -684,6 +672,8 @@ __event_sched_in(struct perf_event *event, return -EAGAIN; } + event->tstamp_running += ctx->time - event->tstamp_stopped; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -694,35 +684,6 @@ __event_sched_in(struct perf_event *event, return 0; } -static inline int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - int ret = __event_sched_in(event, cpuctx, ctx); - if (ret) - return ret; - event->tstamp_running += ctx->time - event->tstamp_stopped; - return 0; -} - -static void -group_commit_event_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event; - u64 now = ctx->time; - - group_event->tstamp_running += now - group_event->tstamp_stopped; - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - event->tstamp_running += now - event->tstamp_stopped; - } -} - static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, @@ -736,13 +697,7 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu); - /* - * use __event_sched_in() to delay updating tstamp_running - * until the transaction is committed. 
In case of failure - * we will keep an unmodified tstamp_running which is a - * requirement to get correct timing information - */ - if (__event_sched_in(group_event, cpuctx, ctx)) { + if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); return -EAGAIN; } @@ -751,31 +706,26 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (__event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!pmu->commit_txn(pmu)) { - /* commit tstamp_running */ - group_commit_event_sched_in(group_event, cpuctx, ctx); + if (!pmu->commit_txn(pmu)) return 0; - } + group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * - * use __event_sched_out() to avoid updating tstamp_stopped - * because the event never actually ran */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) break; - __event_sched_out(event, cpuctx, ctx); + event_sched_out(event, cpuctx, ctx); } - __event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); pmu->cancel_txn(pmu); -- cgit v1.2.3-18-g5258 From d7842da470f244d258f21c5f72cd8388b3541d04 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 20 Oct 2010 15:25:01 +0200 Subject: perf_events: Fix for transaction recovery in group_sched_in() This new version (see commit 8e5fc1a) is much simpler and ensures that in case of error in group_sched_in() during event_sched_in(), the events up to the failed event go through regular event_sched_out(). But the failed event and the remaining events in the group have their timings adjusted as if they had also gone through event_sched_in() and event_sched_out(). This ensures timing uniformity across all events in a group. This also takes care of the tstamp_stopped problem in case the group could never be scheduled. The tstamp_stopped is updated as if the event had actually run. 
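A hedged sketch of that undo loop, with invented stand-ins for the perf structures (the real walk over the sibling list is in the diff that follows):

    #include <stdio.h>

    typedef unsigned long long u64;

    struct event {
        const char *name;
        u64 tstamp_running;
        u64 tstamp_stopped;
    };

    /* Stand-in for event_sched_out(): note when the event stopped. */
    static void event_sched_out(struct event *e, u64 now)
    {
        e->tstamp_stopped = now;
    }

    /* Undo a partially scheduled group: real sched_out up to the event that
     * failed, simulated timings for the failed event and everything after it. */
    static void undo_group(struct event *ev, int n, int failed, u64 now)
    {
        int simulate = 0;

        for (int i = 0; i < n; i++) {
            if (i == failed)
                simulate = 1;

            if (simulate) {
                ev[i].tstamp_running += now - ev[i].tstamp_stopped;
                ev[i].tstamp_stopped = now;
            } else {
                event_sched_out(&ev[i], now);
            }
        }
    }

    int main(void)
    {
        struct event ev[3] = {
            { "cycles",       100, 40 },
            { "instructions", 100, 40 },
            { "baclears",     100, 40 },
        };

        undo_group(ev, 3, 1, 200);   /* event 1 failed to schedule at time 200 */
        for (int i = 0; i < 3; i++)
            printf("%-13s running=%llu stopped=%llu\n",
                   ev[i].name, ev[i].tstamp_running, ev[i].tstamp_stopped);
        return 0;
    }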
With this patch, the following now reports correct time_enabled, in case the NMI watchdog is active: $ task -e unhalted_core_cycles,instructions_retired,baclears,baclears noploop 1 noploop for 1 seconds 0 unhalted_core_cycles (100.00% scaling, ena=997,552,872, run=0) 0 instructions_retired (100.00% scaling, ena=997,552,872, run=0) 0 baclears (100.00% scaling, ena=997,552,872, run=0) 0 baclears (100.00% scaling, ena=997,552,872, run=0) And the older test case also works: $ task -einstructions_retired,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5 1680885 instructions_retired (69.39% scaling, ena=950756, run=291006) 10735 baclears (69.39% scaling, ena=950756, run=291006) 10735 baclears (69.39% scaling, ena=950756, run=291006) 0 unhalted_core_cycles (100.00% scaling, ena=817932, run=0) 0 baclears (100.00% scaling, ena=817932, run=0) 0 baclears (100.00% scaling, ena=817932, run=0) Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cbeeebc.8ee7d80a.5a28.0d5f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 39afdb07d75..517d827f498 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -691,6 +691,8 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -719,11 +721,27 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. + * + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - break; - event_sched_out(event, cpuctx, ctx); + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } } event_sched_out(group_event, cpuctx, ctx); -- cgit v1.2.3-18-g5258 From 2656c36699677238edc9ec1fea79039f1fddbcb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2010 14:47:57 +0200 Subject: genirq: Warn if enable_irq is called before irq is set up The recent changes in the genirq core unearthed a bug in arch/um which called enable_irq() before the interrupt was set up. Warn and return instead of crashing the machine with a NULL pointer dereference. 
Signed-off-by: Thomas Gleixner Cc: Richard Weinberger --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5fa36..5f92acc5f95 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -324,6 +324,10 @@ void enable_irq(unsigned int irq) if (!desc) return; + if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); -- cgit v1.2.3-18-g5258 From 16cdc628c3aed47d02205135b7e2f01e0064f566 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 6 Aug 2010 11:47:14 -0500 Subject: debug_core: move all watch dog syncs to a single function Move the various clock and watch dog syncs to a single function in advance of adding another sync for the rcu stall detector. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index de407c78178..c812857d0b8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -470,6 +470,12 @@ static void dbg_cpu_switch(int cpu, int next_cpu) kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; } +static void dbg_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); +} + static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) { unsigned long flags; @@ -523,8 +529,7 @@ return_normal: if (trace_on) tracing_on(); atomic_dec(&cpu_in_kgdb[cpu]); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); + dbg_touch_watchdogs(); local_irq_restore(flags); return 0; } @@ -541,8 +546,7 @@ return_normal: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); + dbg_touch_watchdogs(); local_irq_restore(flags); goto acquirelock; @@ -659,8 +663,7 @@ kgdb_restore: tracing_on(); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); + dbg_touch_watchdogs(); local_irq_restore(flags); return kgdb_info[cpu].ret_state; -- cgit v1.2.3-18-g5258 From fb70b5888b70b0b50f738fbfc019445493112eb1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 13 Aug 2010 12:44:04 -0500 Subject: debug_core: stop rcu warnings on kernel resume When returning from the kernel debugger reset the rcu jiffies_stall value to prevent the rcu stall detector from sending NMI events which invoke a stack dump for each cpu in the system. 
Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index c812857d0b8..5a3b04d2049 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -474,6 +475,7 @@ static void dbg_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); } static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) -- cgit v1.2.3-18-g5258 From f7030bbc446430ecd12c9ad02cf0ea94934e5f91 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 Oct 2010 10:20:14 -0500 Subject: kdb: Allow kernel loadable modules to add kdb shell functions In order to allow kernel modules to dynamically add a command to the kdb shell the kdb_register, kdb_register_repeat, kdb_unregister, and kdb_printf need to be exported as GPL symbols. Any kernel module that adds a dynamic kdb shell function should only need to include linux/kdb.h. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- kernel/debug/kdb/kdb_main.c | 4 ++++ kernel/debug/kdb/kdb_private.h | 39 --------------------------------------- 3 files changed, 5 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index c9b7f4f90bb..96fdaac46a8 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...) return r; } - +EXPORT_SYMBOL_GPL(kdb_printf); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index caf057a3de0..5448990a299 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2783,6 +2783,8 @@ int kdb_register_repeat(char *cmd, return 0; } +EXPORT_SYMBOL_GPL(kdb_register_repeat); + /* * kdb_register - Compatibility register function for commands that do @@ -2805,6 +2807,7 @@ int kdb_register(char *cmd, return kdb_register_repeat(cmd, func, usage, help, minlen, KDB_REPEAT_NONE); } +EXPORT_SYMBOL_GPL(kdb_register); /* * kdb_unregister - This function is used to unregister a kernel @@ -2833,6 +2836,7 @@ int kdb_unregister(char *cmd) /* Couldn't find it. */ return 1; } +EXPORT_SYMBOL_GPL(kdb_unregister); /* Initialize the kdb command table. */ static void __init kdb_inittab(void) diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index be775f7e81e..1921e6e4c0b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -15,29 +15,6 @@ #include #include "../debug_core.h" -/* Kernel Debugger Error codes. Must not overlap with command codes. */ -#define KDB_NOTFOUND (-1) -#define KDB_ARGCOUNT (-2) -#define KDB_BADWIDTH (-3) -#define KDB_BADRADIX (-4) -#define KDB_NOTENV (-5) -#define KDB_NOENVVALUE (-6) -#define KDB_NOTIMP (-7) -#define KDB_ENVFULL (-8) -#define KDB_ENVBUFFULL (-9) -#define KDB_TOOMANYBPT (-10) -#define KDB_TOOMANYDBREGS (-11) -#define KDB_DUPBPT (-12) -#define KDB_BPTNOTFOUND (-13) -#define KDB_BADMODE (-14) -#define KDB_BADINT (-15) -#define KDB_INVADDRFMT (-16) -#define KDB_BADREG (-17) -#define KDB_BADCPUNUM (-18) -#define KDB_BADLENGTH (-19) -#define KDB_NOBP (-20) -#define KDB_BADADDR (-21) - /* Kernel Debugger Command codes. Must not overlap with error codes. 
*/ #define KDB_CMD_GO (-1001) #define KDB_CMD_CPU (-1002) @@ -93,17 +70,6 @@ */ #define KDB_MAXBPT 16 -/* Maximum number of arguments to a function */ -#define KDB_MAXARGS 16 - -typedef enum { - KDB_REPEAT_NONE = 0, /* Do not repeat this command */ - KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ - KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ -} kdb_repeat_t; - -typedef int (*kdb_func_t)(int, const char **); - /* Symbol table format returned by kallsyms. */ typedef struct __ksymtab { unsigned long value; /* Address of symbol */ @@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag); extern int kallsyms_symbol_complete(char *prefix_name, int max_len); /* Exported Symbols for kernel loadable modules to use. */ -extern int kdb_register(char *, kdb_func_t, char *, char *, short); -extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, - short, kdb_repeat_t); -extern int kdb_unregister(char *); - extern int kdb_getarea_size(void *, unsigned long, size_t); extern int kdb_putarea_size(unsigned long, void *, size_t); -- cgit v1.2.3-18-g5258 From e3bda3ac33d3bf3e5a4049e2cabe82d3caaffc26 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 Oct 2010 10:20:14 -0500 Subject: kdb,ftdump: Remove reference to internal kdb include Now that include/linux/kdb.h properly exports all the functions required to dynamically add a kdb shell command, the reference to the private kdb header can be removed. Signed-off-by: Jason Wessel --- kernel/trace/trace_kdb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 7b8ecd751d9..3c5c5dfea0b 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -13,7 +13,6 @@ #include #include -#include "../debug/kdb/kdb_private.h" #include "trace.h" #include "trace_output.h" -- cgit v1.2.3-18-g5258 From 75d14edee5689716b55afe467acfc13206a31f95 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 Oct 2010 10:20:14 -0500 Subject: kdb: Fix oops in kdb_unregister Nothing should try to use kdb_commands directly as sometimes it is null. Instead, use the for_each_kdbcmd() iterator. This particular problem dates back to the initial kdb merge (2.6.35), but at that point nothing was dynamically unregistering commands from the kdb shell. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5448990a299..4226f32517d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2826,7 +2826,7 @@ int kdb_unregister(char *cmd) /* * find the command. */ - for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { + for_each_kdbcmd(kp, i) { if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { kp->cmd_name = NULL; return 0; -- cgit v1.2.3-18-g5258 From 91b152aa85bbcf076e269565394c31964f940371 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 23 Aug 2010 09:20:14 -0500 Subject: kdb,kgdb: fix sparse fixups Fix the following sparse warnings: kdb_main.c:328:5: warning: symbol 'kdbgetu64arg' was not declared. Should it be static? kgdboc.c:246:12: warning: symbol 'kgdboc_early_init' was not declared. Should it be static? 
kgdb.c:652:26: warning: incorrect type in argument 1 (different address spaces) kgdb.c:652:26: expected void const *ptr kgdb.c:652:26: got struct perf_event *[noderef] *pev The one in kgdb.c required the (void * __force) because of the return code from register_wide_hw_breakpoint looking like: return (void __percpu __force *)ERR_PTR(err); Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_private.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 1921e6e4c0b..35d69ed1dfb 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -105,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); +extern int kdbgetu64arg(const char *, u64 *); extern char *kdbgetenv(const char *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); @@ -216,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); -#ifdef CONFIG_KALLSYMS -extern const char *kdb_walk_kallsyms(loff_t *pos); -#else /* ! CONFIG_KALLSYMS */ -static inline const char *kdb_walk_kallsyms(loff_t *pos) -{ - return NULL; -} -#endif /* ! CONFIG_KALLSYMS */ extern char *kdb_getstr(char *, size_t, char *); /* Defines for kdb_symbol_print */ -- cgit v1.2.3-18-g5258 From c1bb9a9c1911036549c5cdfb23f32d7d20ffdc5a Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Mon, 13 Sep 2010 06:58:00 -0500 Subject: debug_core: disable hw_breakpoints on all cores in kgdb_cpu_enter() The slave cpus do not have the hw breakpoints disabled upon entry to the debug_core and as a result could cause unrecoverable recursive faults on badly placed breakpoints, or get out of sync with the arch specific hw breakpoint operations. This patch addresses the problem by invoking kgdb_disable_hw_debug() earlier in kgdb_enter_cpu for each cpu that enters the debug core. The hw breakpoint dis/enable flow should be: master_debug_cpu slave_debug_cpu \ / kgdb_cpu_enter | kgdb_disable_hw_debug --> uninstall pre-enabled hw_breakpoint | do add/rm dis/enable operates to hw_breakpoints on master_debug_cpu.. 
| correct_hw_break --> correct/install the enabled hw_breakpoint | leave_kgdb Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5a3b04d2049..bb949772480 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -485,6 +485,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) int error; int i, cpu; int trace_on = 0; + + kgdb_disable_hw_debug(ks->linux_regs); + acquirelock: /* * Interrupts will be restored by the 'trap return' code, except when @@ -569,8 +572,6 @@ return_normal: if (dbg_io_ops->pre_exception) dbg_io_ops->pre_exception(); - kgdb_disable_hw_debug(ks->linux_regs); - /* * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active @@ -661,6 +662,8 @@ kgdb_restore: else kgdb_sstep_pid = 0; } + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); /* Free kgdb_active */ -- cgit v1.2.3-18-g5258 From dfee3a7b92208b30f77876068aece9ea571270c2 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 21 May 2010 08:46:00 -0500 Subject: debug_core: refactor locking for master/slave cpus For quite some time there have been problems with memory barriers and various races with NMI on multi processor systems using the kernel debugger. The algorithm for entering the kernel debug core and resuming kernel execution was racy and had several known edge case problems with attempting to debug something on a heavily loaded system using breakpoints that are hit repeatedly and quickly. The prior "locking" design entry worked as follows: * The atomic counter kgdb_active was used with atomic exchange in order to elect a master cpu out of all the cpus that may have taken a debug exception. * The master cpu increments all elements of passive_cpu_wait[]. * The master cpu issues the round up cpus message. * Each "slave cpu" that enters the debug core increments its own element in cpu_in_kgdb[]. * Each "slave cpu" spins on passive_cpu_wait[] until it becomes 0. * The master cpu debugs the system. The new scheme removes the two arrays of atomic counters and replaces them with 2 single counters. One counter is used to count the number of cpus waiting to become a master cpu (because one or more hit an exception). The second counter is use to indicate how many cpus have entered as slave cpus. The new entry logic works as follows: * One or more cpus enters via kgdb_handle_exception() and increments the masters_in_kgdb. Each cpu attempts to get the spin lock called dbg_master_lock. * The master cpu sets kgdb_active to the current cpu. * The master cpu takes the spinlock dbg_slave_lock. * The master cpu asks to round up all the other cpus. * Each slave cpu that is not already in kgdb_handle_exception() will enter and increment slaves_in_kgdb. Each slave will now spin try_locking on dbg_slave_lock. * The master cpu waits for the sum of masters_in_kgdb and slaves_in_kgdb to be equal to the sum of the online cpus. * The master cpu debugs the system. In the new design the kgdb_active can only be changed while holding dbg_master_lock. Stress testing has not turned up any further entry/exit races that existed in the prior locking design. The prior locking design suffered from atomic variables not being truly atomic (in the capacity as used by kgdb) along with memory barrier races. 
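A rough userspace analogue of that election, one thread per "cpu"; masters_in, slaves_in and release_slaves are invented names, and the kernel naturally uses raw spinlocks, per-cpu state and an NMI round-up rather than plain C11 atomics:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NCPUS 4

    static atomic_flag master_lock = ATOMIC_FLAG_INIT;
    static atomic_int masters_in, slaves_in, release_slaves;

    static void *debug_entry(void *arg)
    {
        long cpu = (long)(intptr_t)arg;

        if (!atomic_flag_test_and_set(&master_lock)) {
            /* Won the race: this thread plays the master cpu. */
            atomic_fetch_add(&masters_in, 1);
            while (atomic_load(&masters_in) + atomic_load(&slaves_in) != NCPUS)
                ;                              /* wait for every cpu to check in */
            printf("cpu %ld is master, %d slaves rounded up\n", cpu, NCPUS - 1);
            atomic_store(&release_slaves, 1);  /* open the slave gate */
            while (atomic_load(&slaves_in) != 0)
                ;                              /* wait for the slaves to leave */
            atomic_fetch_sub(&masters_in, 1);
            atomic_flag_clear(&master_lock);
        } else {
            /* Lost the race: check in as a slave, spin until the master is done. */
            atomic_fetch_add(&slaves_in, 1);
            while (!atomic_load(&release_slaves))
                ;
            atomic_fetch_sub(&slaves_in, 1);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NCPUS];

        for (long i = 0; i < NCPUS; i++)
            pthread_create(&t[i], NULL, debug_entry, (void *)(intptr_t)i);
        for (int i = 0; i < NCPUS; i++)
            pthread_join(t[i], NULL);
        return 0;
    }

The master is simply whoever wins the trylock; everyone else only bumps a counter and spins, so there is a single place where the active-master state can change.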
Signed-off-by: Jason Wessel Acked-by: Dongdong Deng --- kernel/debug/debug_core.c | 105 ++++++++++++++++++++++++---------------------- kernel/debug/debug_core.h | 1 + 2 files changed, 56 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bb949772480..26dbdc37d21 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -110,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { */ atomic_t kgdb_active = ATOMIC_INIT(-1); EXPORT_SYMBOL_GPL(kgdb_active); +static DEFINE_RAW_SPINLOCK(dbg_master_lock); +static DEFINE_RAW_SPINLOCK(dbg_slave_lock); /* * We use NR_CPUs not PERCPU, in case kgdb is used to debug early * bootup code (which might not have percpu set up yet): */ -static atomic_t passive_cpu_wait[NR_CPUS]; -static atomic_t cpu_in_kgdb[NR_CPUS]; +static atomic_t masters_in_kgdb; +static atomic_t slaves_in_kgdb; static atomic_t kgdb_break_tasklet_var; atomic_t kgdb_setting_breakpoint; @@ -478,14 +480,23 @@ static void dbg_touch_watchdogs(void) rcu_cpu_stall_reset(); } -static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, + int exception_state) { unsigned long flags; int sstep_tries = 100; int error; - int i, cpu; + int cpu; int trace_on = 0; + int online_cpus = num_online_cpus(); + kgdb_info[ks->cpu].enter_kgdb++; + kgdb_info[ks->cpu].exception_state |= exception_state; + + if (exception_state == DCPU_WANT_MASTER) + atomic_inc(&masters_in_kgdb); + else + atomic_inc(&slaves_in_kgdb); kgdb_disable_hw_debug(ks->linux_regs); acquirelock: @@ -500,14 +511,15 @@ acquirelock: kgdb_info[cpu].task = current; kgdb_info[cpu].ret_state = 0; kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - atomic_inc(&cpu_in_kgdb[cpu]); - if (exception_level == 1) + /* Make sure the above info reaches the primary CPU */ + smp_mb(); + + if (exception_level == 1) { + if (raw_spin_trylock(&dbg_master_lock)) + atomic_xchg(&kgdb_active, cpu); goto cpu_master_loop; + } /* * CPU will loop if it is a slave or request to become a kgdb @@ -519,10 +531,12 @@ cpu_loop: kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; goto cpu_master_loop; } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { - if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) + if (raw_spin_trylock(&dbg_master_lock)) { + atomic_xchg(&kgdb_active, cpu); break; + } } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { - if (!atomic_read(&passive_cpu_wait[cpu])) + if (!raw_spin_is_locked(&dbg_slave_lock)) goto return_normal; } else { return_normal: @@ -533,7 +547,11 @@ return_normal: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); - atomic_dec(&cpu_in_kgdb[cpu]); + kgdb_info[cpu].exception_state &= + ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); + kgdb_info[cpu].enter_kgdb--; + smp_mb__before_atomic_dec(); + atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); return 0; @@ -551,6 +569,7 @@ return_normal: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); + raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -576,10 +595,8 @@ return_normal: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step) { - for (i = 0; i < 
NR_CPUS; i++) - atomic_inc(&passive_cpu_wait[i]); - } + if (!kgdb_single_step) + raw_spin_lock(&dbg_slave_lock); #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ @@ -590,10 +607,9 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - for_each_online_cpu(i) { - while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } + while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + + atomic_read(&slaves_in_kgdb)) != online_cpus) + cpu_relax(); /* * At this point the primary processor is completely @@ -634,24 +650,11 @@ cpu_master_loop: if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); - atomic_dec(&cpu_in_kgdb[ks->cpu]); - if (!kgdb_single_step) { - for (i = NR_CPUS-1; i >= 0; i--) - atomic_dec(&passive_cpu_wait[i]); - /* - * Wait till all the CPUs have quit from the debugger, - * but allow a CPU that hit an exception and is - * waiting to become the master to remain in the debug - * core. - */ - for_each_online_cpu(i) { - while (kgdb_do_roundup && - atomic_read(&cpu_in_kgdb[i]) && - !(kgdb_info[i].exception_state & - DCPU_WANT_MASTER)) - cpu_relax(); - } + raw_spin_unlock(&dbg_slave_lock); + /* Wait till all the CPUs have quit from the debugger. */ + while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) + cpu_relax(); } kgdb_restore: @@ -666,8 +669,15 @@ kgdb_restore: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); + + kgdb_info[cpu].exception_state &= + ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); + kgdb_info[cpu].enter_kgdb--; + smp_mb__before_atomic_dec(); + atomic_dec(&masters_in_kgdb); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -686,7 +696,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; - int ret; ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; @@ -697,11 +706,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (kgdb_reenter_check(ks)) return 0; /* Ouch, double exception ! 
*/ - kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; - ret = kgdb_cpu_enter(ks, regs); - kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | - DCPU_IS_SLAVE); - return ret; + if (kgdb_info[ks->cpu].enter_kgdb != 0) + return 0; + + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } int kgdb_nmicallback(int cpu, void *regs) @@ -714,12 +722,9 @@ int kgdb_nmicallback(int cpu, void *regs) ks->cpu = cpu; ks->linux_regs = regs; - if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != -1 && - atomic_read(&kgdb_active) != cpu) { - kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; - kgdb_cpu_enter(ks, regs); - kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; + if (kgdb_info[ks->cpu].enter_kgdb == 0 && + raw_spin_is_locked(&dbg_master_lock)) { + kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); return 0; } #endif diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index c5d753d80f6..3494c28a7e7 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -40,6 +40,7 @@ struct debuggerinfo_struct { int exception_state; int ret_state; int irq_depth; + int enter_kgdb; }; extern struct debuggerinfo_struct kgdb_info[]; -- cgit v1.2.3-18-g5258 From 495363d380b4f4745bd8677912688654afc44020 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 21 May 2010 08:46:00 -0500 Subject: kdb,debug_core: adjust master cpu switch logic against new debug_core locking The kdb shell needs to enforce switching back to the original CPU that took the exception before restoring normal kernel execution. Resuming from a different CPU than what took the original exception will cause problems with spin locks that are freed from the a different processor than had taken the lock. The special logic in dbg_cpu_switch() can go away entirely with because the state of what cpus want to be masters or slaves will remain unchanged between entry and exit of the debug_core exception context. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 16 ++-------------- kernel/debug/kdb/kdb_debugger.c | 3 +-- kernel/debug/kdb/kdb_main.c | 12 ++++++------ 3 files changed, 9 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 26dbdc37d21..fec596da9bd 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -460,19 +460,6 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } -static void dbg_cpu_switch(int cpu, int next_cpu) -{ - /* Mark the cpu we are switching away from as a slave when it - * holds the kgdb_active token. This must be done so that the - * that all the cpus wait in for the debug core will not enter - * again as the master. 
*/ - if (cpu == atomic_read(&kgdb_active)) { - kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; - kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; - } - kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; -} - static void dbg_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); @@ -638,7 +625,8 @@ cpu_master_loop: if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; } else if (error == DBG_SWITCH_CPU_EVENT) { - dbg_cpu_switch(cpu, dbg_switch_cpu); + kgdb_info[dbg_switch_cpu].exception_state |= + DCPU_NEXT_MASTER; goto cpu_loop; } else { kgdb_info[cpu].ret_state = error; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index bf6e8270e95..dd0b1b7dd02 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks) } /* Set initial kdb state variables */ KDB_STATE_CLEAR(KGDB_TRANS); - kdb_initial_cpu = ks->cpu; + kdb_initial_cpu = atomic_read(&kgdb_active); kdb_current_task = kgdb_info[ks->cpu].task; kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; /* Remove any breakpoints as needed by kdb and clear single step */ @@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks) ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); } - kdb_initial_cpu = ks->cpu; if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { KDB_STATE_CLEAR(SSBPT); KDB_STATE_CLEAR(DOING_SS); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4226f32517d..d7bda21a106 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv) int nextarg; long offset; + if (raw_smp_processor_id() != kdb_initial_cpu) { + kdb_printf("go must execute on the entry cpu, " + "please use \"cpu %d\" and then execute go\n", + kdb_initial_cpu); + return KDB_BADCPUNUM; + } if (argc == 1) { - if (raw_smp_processor_id() != kdb_initial_cpu) { - kdb_printf("go
must be issued from the " - "initial cpu, do cpu %d first\n", - kdb_initial_cpu); - return KDB_ARGCOUNT; - } nextarg = 1; diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); -- cgit v1.2.3-18-g5258 From aa7b250c252cc8e6b1daf0e1eada5eba42a1a68d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 21 Oct 2010 22:17:19 -0700 Subject: tracing: Fix 'faild' -> 'failed' typo Signed-off-by: Joe Perches Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jiri Kosina LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 544301d29de..b8d2852baa4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -648,7 +648,7 @@ static int register_trace_probe(struct trace_probe *tp) } ret = register_probe_event(tp); if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); + pr_warning("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.3-18-g5258 From d4a6f3c32c39132318454e77d59ab14b06f6eb02 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sun, 24 Oct 2010 16:28:47 +0600 Subject: sched_stat: Update sched_info_queue/dequeue() code comments Remove some sched_info_queue(), sched_info_dequeue() code comment. We no longer belongs to the era of O(1) and we don't use active or expired array anymore. Signed-off-by: Rakib Mullick Cc: Oleg Nesterov Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6f..48ddf431db0 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) } /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * Though we are interested in knowing how long it was from the *first* time a + * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a cpu, we call this routine * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. @@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t) } /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. 
It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. -- cgit v1.2.3-18-g5258 From 43948f50276eca010a22726860dfe9a4e8130136 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 25 Oct 2010 22:18:01 +0900 Subject: kprobes: Remove redundant text_mutex lock in optimize Remove text_mutex locking in optimize_all_kprobes, because this function doesn't modify text. It simply queues probes on optimization list for kprobe_optimizer worker thread. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Namhyung Kim Cc: Jason Baron Cc: Peter Zijlstra LKML-Reference: <20101025131801.19160.70939.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ec4210c6501..7c44133f51e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; -static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +/* This protects kprobe_table and optimizing_list */ +static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) } #ifdef CONFIG_SYSCTL +/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; @@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void) return; kprobes_allow_optimization = true; - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } - mutex_unlock(&text_mutex); printk(KERN_INFO "Kprobes globally optimized\n"); } +/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; -- cgit v1.2.3-18-g5258 From 5260562754c0aa4b95eebb1f851eaccce7286365 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Oct 2010 23:41:11 +0100 Subject: MN10300: Fix the PERCPU() alignment to allow for workqueues In the MN10300 arch, we occasionally see an assertion being tripped in alloc_cwqs() at the following line: /* just in case, make sure it's actually aligned */ ---> BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; The values are: wa->cpu_wq.v => 0x902776e0 align => 0x100 and align is calculated by the following: const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); This is because the pointer in question (wq->cpu_wq.v) loses some of its lower bits to control flags, and so the object it points to must be sufficiently aligned to avoid the need to use those bits for pointing to things. Currently, 4 control bits and 4 colour bits are used in normal circumstances, plus a debugging bit if debugging is set. This requires the cpu_workqueue_struct struct to be at least 256 bytes aligned (or 512 bytes aligned with debugging). PERCPU() alignment on MN13000, however, is only 32 bytes as set in vmlinux.lds.S. So we set this to PAGE_SIZE (4096) to match most other arches and stick a comment in alloc_cwqs() for anyone else who triggers the assertion. 
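For illustration, a minimal userspace sketch (not kernel code; the names and constants below are made up for the example) of why an object whose address doubles as storage for N low flag bits must come from an allocation aligned to 1 << N:

/* Userspace sketch: with 8 flag/colour bits packed into the low bits of a
 * pointer, the pointed-to object must be 256-byte aligned so that masking
 * the flags off recovers the original address. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FLAG_BITS 8                     /* 4 control + 4 colour bits */
#define FLAG_MASK ((1UL << FLAG_BITS) - 1)

int main(void)
{
    void *obj = aligned_alloc(1 << FLAG_BITS, 1 << FLAG_BITS);
    uintptr_t packed;

    assert(((uintptr_t)obj & FLAG_MASK) == 0);  /* the BUG_ON() equivalent */
    packed = (uintptr_t)obj | 0x5;              /* stash some flags */
    printf("object %p, flags %#lx\n",
           (void *)(packed & ~FLAG_MASK),
           (unsigned long)(packed & FLAG_MASK));
    free(obj);
    return 0;
}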
Reported-by: Akira Takeuchi Signed-off-by: David Howells Acked-by: Mark Salter Cc: Tejun Heo Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 30acdb74cc2..e5ff2cbaadc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2791,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq) } } - /* just in case, make sure it's actually aligned */ + /* just in case, make sure it's actually aligned + * - this is affected by PERCPU() alignment in vmlinux.lds.S + */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; } -- cgit v1.2.3-18-g5258 From 7e360c38abe2c70eae3ba5a8a17f17671d8b77c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 09:32:55 +0200 Subject: fs: allow for more than 2^31 files Andrew, Could you please review this patch, you probably are the right guy to take it, because it crosses fs and net trees. Note : /proc/sys/fs/file-nr is a read-only file, so this patch doesnt depend on previous patch (sysctl: fix min/max handling in __do_proc_doulongvec_minmax()) Thanks ! [PATCH V4] fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. 
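As a rough userspace illustration of the overflow (the figures mirror the report above; this is not the kernel code itself, and the int expression is shown by converting a wider result, which is roughly what happens in practice):

#include <stdio.h>

int main(void)
{
    int max_files = 1503238553;        /* old int files_stat.max_files on the 16TB box */
    long intended = 2L * max_files;    /* the limit the code meant to apply */
    int wrapped = (int)intended;       /* roughly what "2 * get_max_files()" yields */

    printf("intended limit : %ld\n", intended);
    printf("as a 32-bit int: %d\n", wrapped);  /* negative, so the '>' test always fires */
    return 0;
}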
Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Al Viro --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770..694b140852c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", -- cgit v1.2.3-18-g5258 From cffbc8aa334f55c9ed42d25202eb3ebf3a97c195 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 23 Oct 2010 05:03:02 -0400 Subject: fs: Convert nr_inodes and nr_unused to per-cpu counters The number of inodes allocated does not need to be tied to the addition or removal of an inode to/from a list. If we are not tied to a list lock, we could update the counters when inodes are initialised or destroyed, but to do that we need to convert the counters to be per-cpu (i.e. independent of a lock). This means that we have the freedom to change the list/locking implementation without needing to care about the counters. Based on a patch originally from Eric Dumazet. [AV: cleaned up a bit, fixed build breakage on weird configs Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140852c..99a510cbfbb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,14 +1340,14 @@ static struct ctl_table fs_table[] = { .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "file-nr", -- cgit v1.2.3-18-g5258 From 7de9c6ee3ecffd99e1628e81a5ea5468f7581a1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 11:11:40 -0400 Subject: new helper: ihold() Clones an existing reference to inode; caller must already hold one. 
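The hunk below only converts the futex call site; the helper itself is added in fs/inode.c, outside this excerpt. It is essentially a checked reference bump, roughly of this shape (a sketch for context, not the verbatim definition):

/* Sketch of an ihold()-style helper: take an additional reference on an
 * inode the caller already holds one on. Reconstructed for illustration;
 * see fs/inode.c for the real definition. */
void ihold(struct inode *inode)
{
    /* going from 0 to 1 here would mean the caller held no reference */
    WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}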
Signed-off-by: Al Viro --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a118bf160e0..6c683b37f2c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); + ihold(key->shared.inode); break; case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); -- cgit v1.2.3-18-g5258 From 85fe4025c616a7c0ed07bc2fc8c5371b07f3888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 11:19:54 -0400 Subject: fs: do not assign default i_ino in new_inode Instead of always assigning an increasing inode number in new_inode move the call to assign it into those callers that actually need it. For now callers that need it is estimated conservatively, that is the call is added to all filesystems that do not assign an i_ino by themselves. For a few more filesystems we can avoid assigning any inode number given that they aren't user visible, and for others it could be done lazily when an inode number is actually needed, but that's left for later patches. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b69b8d0313..9270d532ec3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -777,6 +777,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); -- cgit v1.2.3-18-g5258 From 312d3ca856d369bb04d0443846b85b4cdde6fa8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Oct 2010 05:36:23 -0400 Subject: fs: use percpu counter for nr_dentry and nr_dentry_unused The nr_dentry stat is a globally touched cacheline and atomic operation twice over the lifetime of a dentry. It is used for the benfit of userspace only. Turn it into a per-cpu counter and always decrement it in d_free instead of doing various batching operations to reduce lock hold times in the callers. Based on an earlier patch from Nick Piggin . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a510cbfbb..8b77ff5c502 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1377,7 +1377,7 @@ static struct ctl_table fs_table[] = { .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_dentry, }, { .procname = "overflowuid", -- cgit v1.2.3-18-g5258 From a9cea017411c95ec789092971f9baaef1f826883 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:13 -0600 Subject: resources: add a default alignf to simplify find_resource() This removes a test from find_resource(), which is getting cluttered. No functional change. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5de..7dc8ad24f91 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -357,6 +357,14 @@ int __weak page_is_ram(unsigned long pfn) return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +static resource_size_t simple_align_resource(void *data, + const struct resource *avail, + resource_size_t size, + resource_size_t align) +{ + return avail->start; +} + /* * Find empty slot in the resource tree given range and alignment. */ @@ -391,8 +399,8 @@ static int find_resource(struct resource *root, struct resource *new, if (tmp.end > max) tmp.end = max; tmp.start = ALIGN(tmp.start, align); - if (alignf) - tmp.start = alignf(alignf_data, &tmp, size, align); + + tmp.start = alignf(alignf_data, &tmp, size, align); if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { new->start = tmp.start; new->end = tmp.start + size - 1; @@ -428,6 +436,9 @@ int allocate_resource(struct resource *root, struct resource *new, { int err; + if (!alignf) + alignf = simple_align_resource; + write_lock(&resource_lock); err = find_resource(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) -- cgit v1.2.3-18-g5258 From 5d6b1fa301b13cc651ee717a9b518124dea2f814 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:18 -0600 Subject: resources: factor out resource_clip() to simplify find_resource() This factors out the min/max clipping to simplify find_resource(). No functional change. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7dc8ad24f91..26e9f254692 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -365,6 +365,15 @@ static resource_size_t simple_align_resource(void *data, return avail->start; } +static void resource_clip(struct resource *res, resource_size_t min, + resource_size_t max) +{ + if (res->start < min) + res->start = min; + if (res->end > max) + res->end = max; +} + /* * Find empty slot in the resource tree given range and alignment. */ @@ -394,10 +403,8 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = this->start - 1; else tmp.end = root->end; - if (tmp.start < min) - tmp.start = min; - if (tmp.end > max) - tmp.end = max; + + resource_clip(&tmp, min, max); tmp.start = ALIGN(tmp.start, align); tmp.start = alignf(alignf_data, &tmp, size, align); -- cgit v1.2.3-18-g5258 From 6909ba14c25b4db6be2ff89f4fa0fac2d70151a0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:23 -0600 Subject: resources: ensure callback doesn't allocate outside available space The alignment callback returns a proposed location, which may have been adjusted to avoid ISA aliases or for other architecture-specific reasons. We already had a check ("tmp.start < tmp.end") to make sure the callback doesn't return an area that extends past the available area. This patch reworks the check to make sure it doesn't return an area that extends either below or above the available area. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 26e9f254692..89d50412508 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -374,6 +374,11 @@ static void resource_clip(struct resource *res, resource_size_t min, res->end = max; } +static bool resource_contains(struct resource *res1, struct resource *res2) +{ + return res1->start <= res2->start && res1->end >= res2->end; +} + /* * Find empty slot in the resource tree given range and alignment. */ @@ -387,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - struct resource tmp = *new; + struct resource tmp = *new, alloc; tmp.start = root->start; /* @@ -407,10 +412,11 @@ static int find_resource(struct resource *root, struct resource *new, resource_clip(&tmp, min, max); tmp.start = ALIGN(tmp.start, align); - tmp.start = alignf(alignf_data, &tmp, size, align); - if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { - new->start = tmp.start; - new->end = tmp.start + size - 1; + alloc.start = alignf(alignf_data, &tmp, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&tmp, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; return 0; } if (!this) -- cgit v1.2.3-18-g5258 From a1862e31079149a52b6223776228c3aee493d4a7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:28 -0600 Subject: resources: handle overflow when aligning start of available area If tmp.start is near ~0, ALIGN(tmp.start) may overflow, which would make us think there's more available space than there really is. We would likely return something that conflicts with a previous resource, which would cause a failure when allocate_resource() requests the newly- allocated region. 
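A small userspace sketch of the wrap-around (ALIGN() here is a local stand-in for the kernel macro); this is why the patch only accepts the candidate when the aligned start still lies at or above tmp.start:

#include <stdint.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
    uint64_t start = 0xffffffffffffff00ULL;   /* tmp.start near ~0 */
    uint64_t aligned = ALIGN(start, 0x1000);  /* wraps around to 0 */

    printf("start   = %#llx\n", (unsigned long long)start);
    printf("aligned = %#llx\n", (unsigned long long)aligned);
    if (aligned < start)
        printf("overflow: the aligned start no longer lies in the gap\n");
    return 0;
}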
Reference: https://bugzilla.redhat.com/show_bug.cgi?id=646027 Reported-by: Fabrice Bellet Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 89d50412508..e15b922d4ba 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -392,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - struct resource tmp = *new, alloc; + struct resource tmp = *new, avail, alloc; tmp.start = root->start; /* @@ -410,14 +410,19 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = root->end; resource_clip(&tmp, min, max); - tmp.start = ALIGN(tmp.start, align); - alloc.start = alignf(alignf_data, &tmp, size, align); - alloc.end = alloc.start + size - 1; - if (resource_contains(&tmp, &alloc)) { - new->start = alloc.start; - new->end = alloc.end; - return 0; + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } } if (!this) break; -- cgit v1.2.3-18-g5258 From e7f8567db9a7f6b3151b0b275e245c1cef0d9c70 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:33 -0600 Subject: resources: support allocating space within a region from the top down Allocate space from the top of a region first, then work downward, if an architecture desires this. When we allocate space from a resource, we look for gaps between children of the resource. Previously, we always looked at gaps from the bottom up. For example, given this: [mem 0xbff00000-0xf7ffffff] PCI Bus 0000:00 [mem 0xbff00000-0xbfffffff] gap -- available [mem 0xc0000000-0xdfffffff] PCI Bus 0000:02 [mem 0xe0000000-0xf7ffffff] gap -- available we attempted to allocate from the [mem 0xbff00000-0xbfffffff] gap first, then the [mem 0xe0000000-0xf7ffffff] gap. With this patch an architecture can choose to allocate from the top gap [mem 0xe0000000-0xf7ffffff] first. We can't do this across the board because iomem_resource.end is initialized to 0xffffffff_ffffffff on 64-bit architectures, and most machines can't address the entire 64-bit physical address space. Therefore, we only allocate top-down if the arch requests it by clearing "resource_alloc_from_bottom". Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e15b922d4ba..716b6804077 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); +/* + * By default, we allocate free space bottom-up. The architecture can request + * top-down by clearing this flag. The user can override the architecture's + * choice with the "resource_alloc_from_bottom" kernel boot option, but that + * should only be a debugging tool. 
+ */ +int resource_alloc_from_bottom = 1; + +static __init int setup_alloc_from_bottom(char *s) +{ + printk(KERN_INFO + "resource: allocating from bottom-up; please report a bug\n"); + resource_alloc_from_bottom = 1; + return 0; +} +early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -379,8 +396,75 @@ static bool resource_contains(struct resource *res1, struct resource *res2) return res1->start <= res2->start && res1->end >= res2->end; } +/* + * Find the resource before "child" in the sibling list of "root" children. + */ +static struct resource *find_sibling_prev(struct resource *root, struct resource *child) +{ + struct resource *this; + + for (this = root->child; this; this = this->sibling) + if (this->sibling == child) + return this; + + return NULL; +} + +/* + * Find empty slot in the resource tree given range and alignment. + * This version allocates from the end of the root resource first. + */ +static int find_resource_from_top(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), + void *alignf_data) +{ + struct resource *this; + struct resource tmp, avail, alloc; + + tmp.start = root->end; + tmp.end = root->end; + + this = find_sibling_prev(root, NULL); + for (;;) { + if (this) { + if (this->end < root->end) + tmp.start = this->end + 1; + } else + tmp.start = root->start; + + resource_clip(&tmp, min, max); + + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } + } + + if (!this || this->start == root->start) + break; + + tmp.end = this->start - 1; + this = find_sibling_prev(root, this); + } + return -EBUSY; +} + /* * Find empty slot in the resource tree given range and alignment. + * This version allocates from the beginning of the root resource first. */ static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -396,14 +480,15 @@ static int find_resource(struct resource *root, struct resource *new, tmp.start = root->start; /* - * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to tmp->end below would cause an underflow. + * Skip past an allocated resource that starts at 0, since the + * assignment of this->start - 1 to tmp->end below would cause an + * underflow. 
*/ if (this && this->start == 0) { tmp.start = this->end + 1; this = this->sibling; } - for(;;) { + for (;;) { if (this) tmp.end = this->start - 1; else @@ -424,8 +509,10 @@ static int find_resource(struct resource *root, struct resource *new, return 0; } } + if (!this) break; + tmp.start = this->end + 1; this = this->sibling; } @@ -458,7 +545,10 @@ int allocate_resource(struct resource *root, struct resource *new, alignf = simple_align_resource; write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + if (resource_alloc_from_bottom) + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + else + err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); -- cgit v1.2.3-18-g5258 From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. 
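To make the bookkeeping concrete, a hedged sketch of the pattern (the helper name is hypothetical; only the mm->oom_disable_count field and OOM_SCORE_ADJ_MIN come from this patch):

/* Hypothetical helper illustrating what the counter buys us: the oom
 * killer can decide "killing tasks attached to this mm cannot free
 * memory" without walking the whole tasklist. */
static bool mm_has_oom_disabled_user(struct mm_struct *mm)
{
    return atomic_read(&mm->oom_disable_count) > 0;
}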
[rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ kernel/fork.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fd..894179a32ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408..e87aaaaf513 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } -- cgit v1.2.3-18-g5258 From 61ecdb801ef2cd28e32442383106d7837d76deac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:47 -0700 Subject: mm: strictly nested kmap_atomic() Ensure kmap_atomic() usage is strictly nested Signed-off-by: Peter Zijlstra Reviewed-by: Rik van Riel Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ac7eb109f19..9e3581f4619 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) src = kmap_atomic(s_page, KM_USER0); dst = kmap_atomic(d_page, KM_USER1); do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel @@ -2273,8 +2273,8 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) memcpy(buf, kaddr1, PAGE_SIZE); memcpy(kaddr1, kaddr2, PAGE_SIZE); memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); kunmap_atomic(kaddr2, KM_USER1); + kunmap_atomic(kaddr1, KM_USER0); } /** -- cgit v1.2.3-18-g5258 From 3ecb01df3261d3b1f02ccfcf8384e2a255d2a1d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: use clear_page()/copy_page() in favor of memset()/memcpy() on whole pages After all that's what they are intended for. Signed-off-by: Jan Beulich Cc: Miklos Szeredi Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- kernel/power/snapshot.c | 14 +++++++------- kernel/power/swap.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d673..b55045bc756 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap(page); /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); + clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); if (mchunk > mbytes) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9e3581f4619..0dac75ea445 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) */ safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); } else { safe_copy_page(page_address(d_page), s_page); @@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); + clear_page(buffer); pack_pfns(buffer, &orig_bm); } else { struct page *page; @@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) void *kaddr; kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); + copy_page(buffer, kaddr); kunmap_atomic(kaddr, KM_USER0); handle->buffer = buffer; } else { @@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void) void *dst; dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); last_highmem_page = NULL; } @@ -2270,9 +2270,9 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) kaddr1 = kmap_atomic(p1, KM_USER0); kaddr2 = kmap_atomic(p2, KM_USER1); - 
memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); kunmap_atomic(kaddr2, KM_USER1); kunmap_atomic(kaddr1, KM_USER0); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 916eaa79039..a0e4a86ccf9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (bio_chain) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ @@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = write_page(handle->cur, handle->cur_swap, NULL); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } @@ -910,7 +910,7 @@ int swsusp_check(void) hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); + clear_page(swsusp_header); error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) -- cgit v1.2.3-18-g5258 From ca1cab37d91cbe8a8333732540d43cabb54cfa85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:34 -0700 Subject: workqueues: s/ON_STACK/ONSTACK/ Silly though it is, completions and wait_queue_heads use foo_ONSTACK (COMPLETION_INITIALIZER_ONSTACK, DECLARE_COMPLETION_ONSTACK, __WAIT_QUEUE_HEAD_INIT_ONSTACK and DECLARE_WAIT_QUEUE_HEAD_ONSTACK) so I guess workqueues should do the same thing. s/INIT_WORK_ON_STACK/INIT_WORK_ONSTACK/ s/INIT_DELAYED_WORK_ON_STACK/INIT_DELAYED_WORK_ONSTACK/ Cc: Peter Zijlstra Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e5ff2cbaadc..90db1bd1a97 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * checks and call back into the fixup functions where we * might deadlock. */ - INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); -- cgit v1.2.3-18-g5258 From 571428be550fbe37160596995e96ad398873fcbd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:43 -0700 Subject: kernel/user.c: add lock release annotation on free_user() free_user() releases uidhash_lock but was missing annotation. Add it. This removes following sparse warnings: include/linux/spinlock.h:339:9: warning: context imbalance in 'free_user' - unexpected unlock kernel/user.c:120:6: warning: context imbalance in 'free_uid' - wrong count at exit Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Dhaval Giani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736..2c7d8d5914b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) * upon function exit. 
*/ static void free_user(struct user_struct *up, unsigned long flags) + __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); -- cgit v1.2.3-18-g5258 From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770..694b140852c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", -- cgit v1.2.3-18-g5258 From ca51c5a76345b28c6f1b742f9f5f0a6fc9afd9ca Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: kernel/stop_machine.c: fix unused variable warning kernel/stop_machine.c: In function `cpu_stopper_thread': kernel/stop_machine.c:265: warning: unused variable `ksym_buf' ksym_buf[] is unused if WARN_ON() is a no-op. 
Signed-off-by: Rakib Mullick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 090c28812ce..3cb4a9a8ae1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -262,7 +262,7 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN]; + char ksym_buf[KSYM_NAME_LEN] __maybe_unused; __set_current_state(TASK_RUNNING); -- cgit v1.2.3-18-g5258 From 4ce6494dbd8909718840bb88d5a699ef6ce5c212 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: stop_machine: convert cpu notifier to return encapsulate errno value In commit e6bde73b07edeb703d4c89c1daabc09c303de11f ("cpu-hotplug: return better errno on cpu hotplug failure"), the cpu notifier can return an encapsulated errno value. This converts the cpu notifier to return an encapsulated errno value for stop_machine(). Signed-off-by: Akinobu Mita Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3cb4a9a8ae1..2df820b03be 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); kthread_bind(p, cpu); sched_set_stop_task(cpu, p); @@ -372,7 +372,7 @@ static int __init cpu_stop_init(void) /* start one for the boot cpu */ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, bcpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); -- cgit v1.2.3-18-g5258 From 6c095efd82e8f6a98515426a733110f91cf0a709 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: printk: fixup declaration of kmsg_reasons Move redundant 'const' after '*' to make pointer itself const Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2531017795f..d18933a9281 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1511,7 +1511,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char const *kmsg_reasons[] = { +static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", -- cgit v1.2.3-18-g5258 From 8155c02a44a95562e1ae0999360eb31288d7195a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:47 -0700 Subject: printk: add lock context annotation acquire_console_semaphore_for_printk() releases logbuf_lock but was missing proper annotation. Add it. 
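For reference, the annotation informs sparse (not the compiler) that the function exits with the named lock dropped; an illustrative pattern with made-up names, assuming the usual split-lock arrangement:

/* Illustrative only: the lock is taken and released in different
 * functions, so each side carries a context annotation for sparse. */
static void take_it(spinlock_t *lock)
    __acquires(lock)
{
    spin_lock(lock);
}

static void drop_it(spinlock_t *lock)
    __releases(lock)
{
    spin_unlock(lock);
}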
Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index d18933a9281..6ff26151f4f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -647,6 +647,7 @@ static inline int can_use_console(unsigned int cpu) * released but interrupts still disabled. */ static int acquire_console_semaphore_for_printk(unsigned int cpu) + __releases(&logbuf_lock) { int retval = 0; -- cgit v1.2.3-18-g5258 From 674dff6507d3f9b110219ea125cf5e1213c9acef Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:48 -0700 Subject: printk: change type of 'boot_delay' to int * get_option() takes its 2nd arg as int * so passing boot_delay to it caused following warnings from sparse: kernel/printk.c:223:27: warning: incorrect type in argument 2 (different signedness) kernel/printk.c:223:27: expected int *pint kernel/printk.c:223:27: got unsigned int static [toplevel] * Since boot_delay can't grow more than 10,000 changing it to 'int *' will not produce any problem. Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6ff26151f4f..b2ebaee8c37 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY -static unsigned int boot_delay; /* msecs delay after each printk during bootup */ +static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) -- cgit v1.2.3-18-g5258 From f5d87d851d76a390d0fab2f77bd1d563d69ee586 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:49 -0700 Subject: printk: declare printk_ratelimit_state in ratelimit.h Adding declaration of printk_ratelimit_state in ratelimit.h removes potential build breakage and following sparse warning: kernel/printk.c:1426:1: warning: symbol 'printk_ratelimit_state' was not declared. Should it be static? [akpm@linux-foundation.org: remove unneeded ifdef] Signed-off-by: Namhyung Kim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 694b140852c..48d9d689498 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,8 +161,6 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif -extern struct ratelimit_state printk_ratelimit_state; - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3-18-g5258 From ee2f154a598e96df2ebb01648a7699373bc085c7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Oct 2010 14:17:25 -0700 Subject: docbook: add more wait/wake/completion to device-drivers docbook Add more wait, wake, and completion interfaces to the device-drivers docbook. Fix kernel-doc notation in the added files. 
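For context, kernel-doc comments differ from plain comments only in their opening marker and parameter lines; a made-up example of the expected shape:

/**
 * sample_finish_wait - clean up after waiting on a queue (made-up name)
 * @q:    waitqueue that was waited on
 * @mode: runstate of the waiter to be woken
 *
 * The double-asterisk opener and the "@arg:" lines are what the docbook
 * tooling extracts; a comment opened with a single asterisk is skipped,
 * which is why the hunks below flip the opener and fix the @mode name.
 */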
Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/wait.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index c4bd3d825f3..b0310eb6cc1 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -/* +/** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor @@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/* +/** * abort_exclusive_wait - abort exclusive waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor - * @state: runstate of the waiter to be woken + * @mode: runstate of the waiter to be woken * @key: key to identify a wait bit queue or %NULL * * Sets current thread back to running state and removes -- cgit v1.2.3-18-g5258 From abbce906d05ec37289cd0c3b4e35b2db26eab19b Mon Sep 17 00:00:00 2001 From: MichaÅ‚ MirosÅ‚aw Date: Mon, 20 Sep 2010 01:58:08 +0200 Subject: (trivial) Fix compiler warning in kernel/modules.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with CONFIG_KALLSYMS=n gives following warning: /mnt/src/linux-git/kernel/module.c: In function ‘post_relocation’: /mnt/src/linux-git/kernel/module.c:2534:2: warning: passing argument 2 of ‘add_kallsyms’ discards qualifiers from pointer target type /mnt/src/linux-git/kernel/module.c:2038:13: note: expected ‘struct load_info *’ but argument is of type ‘const struct load_info *’ Signed-off-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2df46301a7a..437a74a7524 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info) { } -static void add_kallsyms(struct module *mod, struct load_info *info) +static void add_kallsyms(struct module *mod, const struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ -- cgit v1.2.3-18-g5258 From 3a5f65df5a0fcbaa35e5417c0420d691fee4ac56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Oct 2010 17:28:36 +0100 Subject: Typedef SMP call function pointer Typedef the pointer to the function to be called by smp_call_function() and friends: typedef void (*smp_call_func_t)(void *info); as it is used in a fair number of places. Signed-off-by: David Howells cc: linux-arch@vger.kernel.org --- kernel/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7e..12ed8b013e2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); * * Returns 0 on success, else a negative status code. 
*/ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, +int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { struct call_single_data d = { @@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single); * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, - void (*func)(void *info), void *info, int wait) + smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; @@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *), void *info, bool wait) + smp_call_func_t func, void *info, bool wait) { struct call_function_data *data; unsigned long flags; @@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function(void (*func)(void *), void *info, int wait) +int smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); -- cgit v1.2.3-18-g5258 From d5de4ddb1bc430289bede76c0d87cabee93f749a Mon Sep 17 00:00:00 2001 From: Tomasz Buchert Date: Wed, 27 Oct 2010 15:33:32 -0700 Subject: cgroup_freezer: unnecessary test in cgroup_freezing_or_frozen() The root freezer_state is always CGROUP_THAWED so we can remove the special case from the code. The test itself can be handy and is extracted to static function. Signed-off-by: Tomasz Buchert Cc: Matt Helsley Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88..321e2a007d2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_freezing_or_frozen(struct task_struct *task) +static inline int __cgroup_freezing_or_frozen(struct task_struct *task) { - struct freezer *freezer; - enum freezer_state state; + enum freezer_state state = task_freezer(task)->state; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); +} +int cgroup_freezing_or_frozen(struct task_struct *task) +{ + int result; task_lock(task); - freezer = task_freezer(task); - if (!freezer->css.cgroup->parent) - state = CGROUP_THAWED; /* root cgroup can't be frozen */ - else - state = freezer->state; + result = __cgroup_freezing_or_frozen(task); task_unlock(task); - - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); + return result; } /* -- cgit v1.2.3-18-g5258 From 0bdba580ab052a21e3eda2764ed22d9ee962392b Mon Sep 17 00:00:00 2001 From: Tomasz Buchert Date: Wed, 27 Oct 2010 15:33:33 -0700 Subject: cgroup_freezer: fix can_attach() to prohibit moving from/to freezing/frozen cgroups It is possible to move a task from its cgroup even if this group is 'FREEZING'. This results in a nasty bug - the moved task will become frozen OUTSIDE its original cgroup and will remain in a permanent 'D' state. This patch allows to migrate the task only between THAWED cgroups. This behavior was observed and easily reproduced on a single core laptop. Notice that reproducibility depends highly on the machine used. 
Program and instructions how to reproduce the bug can be fetched from: http://pentium.hopto.org/~thinred/repos/linux-misc/freezer_bug.c Signed-off-by: Tomasz Buchert Cc: Matt Helsley Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 321e2a007d2..c287627bed3 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -173,24 +173,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss, /* * Anything frozen can't move or be moved to/from. - * - * Since orig_freezer->state == FROZEN means that @task has been - * frozen, so it's sufficient to check the latter condition. */ - if (is_task_frozen_enough(task)) + freezer = cgroup_freezer(new_cgroup); + if (freezer->state != CGROUP_THAWED) return -EBUSY; - freezer = cgroup_freezer(new_cgroup); - if (freezer->state == CGROUP_FROZEN) + rcu_read_lock(); + if (__cgroup_freezing_or_frozen(task)) { + rcu_read_unlock(); return -EBUSY; + } + rcu_read_unlock(); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (is_task_frozen_enough(c)) { + if (__cgroup_freezing_or_frozen(c)) { rcu_read_unlock(); return -EBUSY; } -- cgit v1.2.3-18-g5258 From 2d3cbf8bc852ac1bc3d098186143c5973f87b753 Mon Sep 17 00:00:00 2001 From: Tomasz Buchert Date: Wed, 27 Oct 2010 15:33:34 -0700 Subject: cgroup_freezer: update_freezer_state() does incorrect state transitions There are 4 state transitions possible for a freezer. Only FREEZING -> FROZEN transaction is done lazily. This patch allows update_freezer_state only to perform this transaction and renames the function to update_if_frozen. Moreover is_task_frozen_enough function is removed and its every occurence is replaced with frozen(). Therefore for a group to become FROZEN every task must be frozen. The previous version could trigger a following bug: When cgroup is in the process of freezing (but none of its tasks are frozen yet), update_freezer_state() (called from freezer_read or freezer_write) would incorrectly report that a group is 'THAWED' (because nfrozen = 0), allowing the transaction FREEZING -> THAWED without writing anything to 'freezer.state'. This is incorrect according to the documentation. This could result in a 'THAWED' cgroup with frozen tasks inside. 
A code to reproduce this bug is available here: http://pentium.hopto.org/~thinred/repos/linux-misc/freezer_bug2.c [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tomasz Buchert Cc: Matt Helsley Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index c287627bed3..e7bebb7c6c3 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -153,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } -/* Task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* * caller must hold freezer->lock */ -static void update_freezer_state(struct cgroup *cgroup, +static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; unsigned int nfrozen = 0, ntotal = 0; + enum freezer_state old_state = freezer->state; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (is_task_frozen_enough(task)) + if (frozen(task)) nfrozen++; } - /* - * Transition to FROZEN when no new tasks can be added ensures - * that we never exist in the FROZEN state while there are unfrozen - * tasks. - */ - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - else if (nfrozen > 0) - freezer->state = CGROUP_FREEZING; - else - freezer->state = CGROUP_THAWED; + if (old_state == CGROUP_THAWED) { + BUG_ON(nfrozen > 0); + } else if (old_state == CGROUP_FREEZING) { + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + } else { /* old_state == CGROUP_FROZEN */ + BUG_ON(nfrozen != ntotal); + } + cgroup_iter_end(cgroup, &it); } @@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, if (state == CGROUP_FREEZING) { /* We change from FREEZING to FROZEN lazily if the cgroup was * only partially frozen when we exitted write. */ - update_freezer_state(cgroup, freezer); + update_if_frozen(cgroup, freezer); state = freezer->state; } spin_unlock_irq(&freezer->lock); @@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) continue; - if (is_task_frozen_enough(task)) + if (frozen(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; @@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); - update_freezer_state(cgroup, freezer); + update_if_frozen(cgroup, freezer); if (goal_state == freezer->state) goto out; -- cgit v1.2.3-18-g5258 From 97978e6d1f2da0073416870410459694fbdbfd9b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:35 -0700 Subject: cgroup: add clone_children control file The ns_cgroup is a control group interacting with the namespaces. When a new namespace is created, a corresponding cgroup is automatically created too. 
The cgroup name is the pid of the process who did 'unshare' or the child of 'clone'. This cgroup is tied with the namespace because it prevents a process to escape the control group and use the post_clone callback, so the child cgroup inherits the values of the parent cgroup. Unfortunately, the more we use this cgroup and the more we are facing problems with it: (1) when a process unshares, the cgroup name may conflict with a previous cgroup with the same pid, so unshare or clone return -EEXIST (2) the cgroup creation is out of control because there may have an application creating several namespaces where the system will automatically create several cgroups in his back and let them on the cgroupfs (eg. a vrf based on the network namespace). (3) the mix of (1) and (2) force an administrator to regularly check and clean these cgroups. This patchset removes the ns_cgroup by adding a new flag to the cgroup and the cgroupfs mount option. It enables the copy of the parent cgroup when a child cgroup is created. We can then safely remove the ns_cgroup as this flag brings a compatibility. We have now to manually create and add the task to a cgroup, which is consistent with the cgroup framework. This patch: Sent as an answer to a previous thread around the ns_cgroup. https://lists.linux-foundation.org/pipermail/containers/2009-June/018627.html It adds a control file 'clone_children' for a cgroup. This control file is a boolean specifying if the child cgroup should be a clone of the parent cgroup or not. The default value is 'false'. This flag makes the child cgroup to call the post_clone callback of all the subsystem, if it is available. At present, the cpuset is the only one which had implemented the post_clone callback. The option can be set at mount time by specifying the 'clone_children' mount option. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. 
Biederman Acked-by: Paul Menage Reviewed-by: Li Zefan Cc: Jamal Hadi Salim Cc: Matt Helsley Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9270d532ec3..4b218a46ddd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +static int clone_children(const struct cgroup *cgrp) +{ + return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +} + /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (clone_children(&root->top_cgroup)) + seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); @@ -1050,6 +1057,7 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + bool clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1097,6 +1105,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); + } else if (!strcmp(token, "clone_children")) { + opts->clone_children = true; } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1355,6 +1365,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); + if (opts->clone_children) + set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -3173,6 +3185,23 @@ fail: return ret; } +static u64 cgroup_clone_children_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return clone_children(cgrp); +} + +static int cgroup_clone_children_write(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + if (val) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + else + clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -3203,6 +3232,11 @@ static struct cftype files[] = { .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, }; static struct cftype cft_release_agent = { @@ -3332,6 +3366,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + if (clone_children(parent)) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); @@ -3346,6 +3383,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } /* At error, ->destroy() callback has to free assigned ID. 
*/ + if (clone_children(parent) && ss->post_clone) + ss->post_clone(ss, cgrp); } cgroup_lock_hierarchy(root); -- cgit v1.2.3-18-g5258 From 32a8cf235e2f192eb002755076994525cdbaa35a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:37 -0700 Subject: cgroup: make the mount options parsing more accurate Current behavior: ================= (1) When we mount a cgroup, we can specify the 'all' option which means to enable all the cgroup subsystems. This is the default option when no option is specified. (2) If we want to mount a cgroup with a subset of the supported cgroup subsystems, we have to specify a subsystems name list for the mount option. (3) If we specify another option like 'noprefix' or 'release_agent', the actual code wants the 'all' or a subsystem name option specified also. Not critical but a bit not friendly as we should assume (1) in this case. (4) Logically, the 'all' option is mutually exclusive with a subsystem name, but this is not detected. In other words: succeed : mount -t cgroup -o all,freezer cgroup /cgroup => is it 'all' or 'freezer' ? fails : mount -t cgroup -o noprefix cgroup /cgroup => succeed if we do '-o noprefix,all' The following patches consolidate a bit the mount options check. New behavior: ============= (1) untouched (2) untouched (3) the 'all' option will be by default when specifying other than a subsystem name option (4) raises an error In other words: fails : mount -t cgroup -o all,freezer cgroup /cgroup succeed : mount -t cgroup -o noprefix cgroup /cgroup For the sake of lisibility, the if ... then ... else ... if ... indentation when parsing the options has been changed to: if ... then ... continue fi Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Reviewed-by: Li Zefan Reviewed-by: Paul Menage Cc: Eric W. 
Biederman Cc: Jamal Hadi Salim Cc: Matt Helsley Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 90 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4b218a46ddd..3e6517e51fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1074,7 +1074,8 @@ struct cgroup_sb_opts { */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { - char *token, *o = data ?: "all"; + char *token, *o = data; + bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; int i; bool module_pin_failed = false; @@ -1090,24 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) while ((token = strsep(&o, ",")) != NULL) { if (!*token) return -EINVAL; - if (!strcmp(token, "all")) { - /* Add all non-disabled subsystems */ - opts->subsys_bits = 0; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (!ss->disabled) - opts->subsys_bits |= 1ul << i; - } - } else if (!strcmp(token, "none")) { + if (!strcmp(token, "none")) { /* Explicitly have no subsystems */ opts->none = true; - } else if (!strcmp(token, "noprefix")) { + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); - } else if (!strcmp(token, "clone_children")) { + continue; + } + if (!strcmp(token, "clone_children")) { opts->clone_children = true; - } else if (!strncmp(token, "release_agent=", 14)) { + continue; + } + if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; @@ -1115,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - } else if (!strncmp(token, "name=", 5)) { + continue; + } + if (!strncmp(token, "name=", 5)) { const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1137,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) GFP_KERNEL); if (!opts->name) return -ENOMEM; - } else { - struct cgroup_subsys *ss; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - ss = subsys[i]; - if (ss == NULL) - continue; - if (!strcmp(token, ss->name)) { - if (!ss->disabled) - set_bit(i, &opts->subsys_bits); - break; - } - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + + continue; + } + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (strcmp(token, ss->name)) + continue; + if (ss->disabled) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + set_bit(i, &opts->subsys_bits); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise 'all, 'none' and a subsystem name options were not + * specified, let's default to 'all' + */ + if (all_ss || (!all_ss && !one_ss && !opts->none)) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->disabled) + continue; + set_bit(i, &opts->subsys_bits); } } 
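Taken together with the 'clone_children' patch above, the intended usage looks roughly like the following; the mount point and the choice of the cpuset subsystem are assumptions for the sake of the example, not mandated by the patches:

  # mount -t cgroup -o cpuset,clone_children cgroup /cgroup
  # cat /cgroup/cgroup.clone_children      # 1, set by the mount option
  # mkdir /cgroup/child                    # cpuset's post_clone copies the parent's cpus and mems

Under the stricter parsing introduced here, 'mount -t cgroup -o all,cpuset cgroup /cgroup' now fails with EINVAL, because 'all' and an explicit subsystem name are mutually exclusive.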
-- cgit v1.2.3-18-g5258 From f4a2589feaef0a9b737a3e582b37ee96695bb25f Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Wed, 27 Oct 2010 15:33:37 -0700 Subject: cgroups: add check for strcpy destination string overflow Function "strcpy" is used without check for maximum allowed source string length and could cause destination string overflow. Check for string length is added before using "strcpy". Function now is return error if source string length is more than a maximum. akpm: presently considered NotABug, but add the check for general future-safeness and robustness. Signed-off-by: Evgeny Kuznetsov Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3e6517e51fd..5cf366965d0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1922,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (strlen(buffer) >= PATH_MAX) + return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); -- cgit v1.2.3-18-g5258 From 45531757b45cae0ce64c5aff08c2534d5a0fa3e7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:38 -0700 Subject: cgroup: notify ns_cgroup deprecated The ns_cgroup will be removed very soon. Let's warn, for this version, ns_cgroup is deprecated. Make ns_cgroup and clone_children exclusive. If the clone_children is set and the ns_cgroup is mounted, let's fail with EINVAL when the ns_cgroup subsys is created (a printk will help the user to understand why the creation fails). Update the feature remove schedule file with the deprecated ns_cgroup. Signed-off-by: Daniel Lezcano Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ns_cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 2a5dfec8efe..2c98ad94ba0 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, return ERR_PTR(-EPERM); if (!cgroup_is_descendant(cgroup, current)) return ERR_PTR(-EPERM); + if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { + printk("ns_cgroup can't be created with parent " + "'clone_children' set.\n"); + return ERR_PTR(-EINVAL); + } + + printk_once("ns_cgroup deprecated: consider using the " + "'clone_children' flag without the ns_cgroup.\n"); ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) -- cgit v1.2.3-18-g5258 From c4b5ed250eebf854d40f27b43362c80f115cb57a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:44 -0700 Subject: ptrace: annotate lock context change on exit_ptrace() exit_ptrace() releases and regrabs tasklist_lock but was missing proper annotation. Add it. Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a..4afd9b86cc0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) * and reacquire the lock. 
*/ void exit_ptrace(struct task_struct *tracer) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct task_struct *p, *n; LIST_HEAD(ptrace_dead); -- cgit v1.2.3-18-g5258 From 4abf986960ecda6a87fc2f795aacf888a2f0127e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:45 -0700 Subject: ptrace: change signature of sys_ptrace() and friends Since userspace API of ptrace syscall defines @addr and @data as void pointers, it would be more appropriate to define them as unsigned long in kernel. Therefore related functions are changed also. 'unsigned long' is typically used in other places in kernel as an opaque data type and that using this helps cleaning up a lot of warnings from sparse. Suggested-by: Arnd Bergmann Signed-off-by: Namhyung Kim Acked-by: Arnd Bergmann Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4afd9b86cc0..06981a8b271 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -404,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds return copied; } -static int ptrace_setoptions(struct task_struct *child, long data) +static int ptrace_setoptions(struct task_struct *child, unsigned long data) { child->ptrace &= ~PT_TRACE_MASK; @@ -483,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) #define is_sysemu_singlestep(request) 0 #endif -static int ptrace_resume(struct task_struct *child, long request, long data) +static int ptrace_resume(struct task_struct *child, long request, + unsigned long data) { if (!valid_signal(data)) return -EIO; @@ -560,7 +561,7 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, #endif int ptrace_request(struct task_struct *child, long request, - long addr, long data) + unsigned long addr, unsigned long data) { int ret = -EIO; siginfo_t siginfo; @@ -693,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid) #define arch_ptrace_attach(child) do { } while (0) #endif -SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) +SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, + unsigned long, data) { struct task_struct *child; long ret; @@ -734,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) return ret; } -int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, + unsigned long data) { unsigned long tmp; int copied; @@ -745,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) return put_user(tmp, (unsigned long __user *)data); } -int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, + unsigned long data) { int copied; -- cgit v1.2.3-18-g5258 From 9fed81dc40f5a1ac2783bcc78d4029873be72894 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:46 -0700 Subject: ptrace: cleanup ptrace_request() Use new 'datavp' and 'datalp' variables to remove unnecesary castings. 
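For reference, the userspace prototype that the in-kernel signatures above now mirror passes @addr and @data as pointers; a minimal, illustrative call (error handling omitted, 'pid' and 'addr' assumed to be set up by the caller):

  #include <sys/ptrace.h>

  /* long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); */
  long word = ptrace(PTRACE_PEEKDATA, pid, (void *)addr, NULL);

Treating the values as unsigned long inside the kernel keeps them opaque in the same way, which is what clears up the sparse warnings mentioned above.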
Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 06981a8b271..ea7ce0215cd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -565,6 +565,8 @@ int ptrace_request(struct task_struct *child, long request, { int ret = -EIO; siginfo_t siginfo; + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = datavp; switch (request) { case PTRACE_PEEKTEXT: @@ -581,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setoptions(child, data); break; case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, (unsigned long __user *) data); + ret = put_user(child->ptrace_message, datalp); break; case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) - ret = copy_siginfo_to_user((siginfo_t __user *) data, - &siginfo); + ret = copy_siginfo_to_user(datavp, &siginfo); break; case PTRACE_SETSIGINFO: - if (copy_from_user(&siginfo, (siginfo_t __user *) data, - sizeof siginfo)) + if (copy_from_user(&siginfo, datavp, sizeof siginfo)) ret = -EFAULT; else ret = ptrace_setsiginfo(child, &siginfo); @@ -624,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request, } mmput(mm); - ret = put_user(tmp, (unsigned long __user *) data); + ret = put_user(tmp, datalp); break; } #endif @@ -653,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_SETREGSET: { struct iovec kiov; - struct iovec __user *uiov = (struct iovec __user *) data; + struct iovec __user *uiov = datavp; if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) return -EFAULT; -- cgit v1.2.3-18-g5258 From b8ed374e202e23caaf9bd77dcadc9de6447faaa8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:06 -0700 Subject: signals: annotate lock_task_sighand() lock_task_sighand() grabs sighand->siglock in case of returning non-NULL but unlock_task_sighand() releases it unconditionally. This leads sparse to complain about the lock context imbalance. Rename and wrap lock_task_sighand() using __cond_lock() macro to make sparse happy. Suggested-by: Eric Dumazet Signed-off-by: Namhyung Kim Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 919562c3d6b..e921409b85a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p) return count; } -struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) +struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) { struct sighand_struct *sighand; -- cgit v1.2.3-18-g5258 From b84011508360d6885a9d95a235ec77d56f133377 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:07 -0700 Subject: signals: annotate lock context change on ptrace_stop() ptrace_stop() releases and regrabs current->sighand->siglock but was missing proper annotation. Add it. 
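The sparse annotations added in this series all follow the same pattern; a generic sketch with placeholder names rather than the actual locks from the patches:

  static void drop_and_retake(void)
  	__releases(&some_lock)
  	__acquires(&some_lock)
  {
  	spin_unlock(&some_lock);
  	/* ... work or sleep that must not hold the lock ... */
  	spin_lock(&some_lock);
  }

The pair of annotations tells sparse that the function exits holding the same lock it entered with, which silences the 'context imbalance' warnings these patches are addressing.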
Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e921409b85a..4e3cff10fdc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1618,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk) * is gone, we keep current->exit_code unless clear_code. */ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) + __releases(¤t->sighand->siglock) + __acquires(¤t->sighand->siglock) { if (arch_ptrace_stop_needed(exit_code, info)) { /* -- cgit v1.2.3-18-g5258 From 9b1bf12d5d51bca178dea21b04a0805e29d60cf1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 27 Oct 2010 15:34:08 -0700 Subject: signals: move cred_guard_mutex from task_struct to signal_struct Oleg Nesterov pointed out we have to prevent multiple-threads-inside-exec itself and we can reuse ->cred_guard_mutex for it. Yes, concurrent execve() has no worth. Let's move ->cred_guard_mutex from task_struct to signal_struct. It naturally prevent multiple-threads-inside-exec. Signed-off-by: KOSAKI Motohiro Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cred.c | 4 +--- kernel/fork.c | 2 ++ kernel/ptrace.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe..6a1aa004e37 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_guard_mutex + * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_guard_mutex); - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/fork.c b/kernel/fork.c index e87aaaaf513..3b159c5991b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + mutex_init(&sig->cred_guard_mutex); + return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ea7ce0215cd..99bbaa3e5b0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task) * under ptrace. */ retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->cred_guard_mutex)) + if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) goto out; task_lock(task); @@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task) unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out: return retval; } -- cgit v1.2.3-18-g5258 From d16e15f5b029fc7d03540ba0e5fb23b0abb0ebe0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:34:10 -0700 Subject: exit: add lock context annotation on find_new_reaper() find_new_reaper() releases and regrabs tasklist_lock but was missing proper annotations. Add it. 
This remove following sparse warning: warning: context imbalance in 'find_new_reaper' - unexpected unlock Signed-off-by: Namhyung Kim Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 894179a32ec..b194febf579 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -703,6 +703,8 @@ static void exit_mm(struct task_struct * tsk) * space. */ static struct task_struct *find_new_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; -- cgit v1.2.3-18-g5258 From 478735e38887077ac77a9756121b6ce0cb956e2f Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 27 Oct 2010 15:34:15 -0700 Subject: /proc/stat: fix scalability of irq sum of all cpu In /proc/stat, the number of per-IRQ event is shown by making a sum each irq's events on all cpus. But we can make use of kstat_irqs(). kstat_irqs() do the same calculation, If !CONFIG_GENERIC_HARDIRQ, it's not a big cost. (Both of the number of cpus and irqs are small.) If a system is very big and CONFIG_GENERIC_HARDIRQ, it does for_each_irq() for_each_cpu() - look up a radix tree - read desc->irq_stat[cpu] This seems not efficient. This patch adds kstat_irqs() for CONFIG_GENRIC_HARDIRQ and change the calculation as for_each_irq() look up radix tree for_each_cpu() - read desc->irq_stat[cpu] This reduces cost. A test on (4096cpusp, 256 nodes, 4592 irqs) host (by Jack Steiner) %time cat /proc/stat > /dev/null Before Patch: 2.459 sec After Patch : .561 sec [akpm@linux-foundation.org: unexport kstat_irqs, coding-style tweaks] [akpm@linux-foundation.org: fix unused variable 'per_irq_sum'] Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Jack Steiner Acked-by: Jack Steiner Cc: Yinghai Lu Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9d917ff7267..9988d03797f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) struct irq_desc *desc = irq_to_desc(irq); return desc ? desc->kstat_irqs[cpu] : 0; } + +#ifdef CONFIG_GENERIC_HARDIRQS +unsigned int kstat_irqs(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + int cpu; + int sum = 0; + + if (!desc) + return 0; + for_each_possible_cpu(cpu) + sum += desc->kstat_irqs[cpu]; + return sum; +} +#endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3-18-g5258 From 85893120699f8bae8caa12a8ee18ab5fceac978e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 27 Oct 2010 15:34:43 -0700 Subject: delayacct: align to 8 byte boundary on 64-bit systems prepare_reply() sets up an skb for the response. The payload contains: +--------------------------------+ | genlmsghdr - 4 bytes | +--------------------------------+ | NLA header - 4 bytes | /* Aggregate header */ +-+------------------------------+ | | NLA header - 4 bytes | /* PID header */ | +------------------------------+ | | pid/tgid - 4 bytes | | +------------------------------+ | | NLA header - 4 bytes | /* stats header */ | + -----------------------------+ <- oops. 
aligned on 4 byte boundary | | struct taskstats - 328 bytes | +-+------------------------------+ The start of the taskstats struct must be 8 byte aligned on IA64 (and other systems with 8 byte alignment rules for 64-bit types) or runtime alignment warnings will be issued. This patch pads the pid/tgid field out to sizeof(long), which forces the alignment of taskstats. The getdelays userspace code is ok with this since it assumes 32-bit pid/tgid and then honors that header's length field. An array is used to avoid exposing kernel memory contents to userspace in the response. Signed-off-by: Jeff Mahoney Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792b..5a651aa63d6 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -360,6 +360,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) struct nlattr *na, *ret; int aggr; + /* If we don't pad, we end up with alignment on a 4 byte boundary. + * This causes lots of runtime warnings on systems requiring 8 byte + * alignment */ + u32 pids[2] = { pid, 0 }; + int pid_size = ALIGN(sizeof(pid), sizeof(long)); + aggr = (type == TASKSTATS_TYPE_PID) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; @@ -367,7 +373,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) na = nla_nest_start(skb, aggr); if (!na) goto err; - if (nla_put(skb, type, sizeof(pid), &pid) < 0) + if (nla_put(skb, type, pid_size, pids) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); if (!ret) -- cgit v1.2.3-18-g5258 From 9323312592cca636d7c2580dc85fa4846efa86a2 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 27 Oct 2010 15:34:44 -0700 Subject: taskstats: separate taskstats commands Move each taskstats command into a single function. This makes the code more readable and makes it easier to add new commands. 
Signed-off-by: Michael Holzheu Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 118 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5a651aa63d6..9970cae04f1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -430,39 +430,46 @@ err: return rc; } -static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +static int cmd_attr_register_cpumask(struct genl_info *info) { - int rc; - struct sk_buff *rep_skb; - struct taskstats *stats; - size_t size; cpumask_var_t mask; + int rc; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - goto free_return_rc; - if (rc == 0) { - rc = add_del_listener(info->snd_pid, mask, REGISTER); - goto free_return_rc; - } + goto out; + rc = add_del_listener(info->snd_pid, mask, REGISTER); +out: + free_cpumask_var(mask); + return rc; +} +static int cmd_attr_deregister_cpumask(struct genl_info *info) +{ + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) - goto free_return_rc; - if (rc == 0) { - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); -free_return_rc: - free_cpumask_var(mask); - return rc; - } + goto out; + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +out: free_cpumask_var(mask); + return rc; +} + +static int cmd_attr_pid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 pid; + int rc; - /* - * Size includes space for nested attributes - */ size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); @@ -471,33 +478,64 @@ free_return_rc: return rc; rc = -EINVAL; - if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { - u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); - if (!stats) - goto err; - - rc = fill_pid(pid, NULL, stats); - if (rc < 0) - goto err; - } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { - u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); - if (!stats) - goto err; - - rc = fill_tgid(tgid, NULL, stats); - if (rc < 0) - goto err; - } else + pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); + if (!stats) + goto err; + + rc = fill_pid(pid, NULL, stats); + if (rc < 0) + goto err; + return send_reply(rep_skb, info); +err: + nlmsg_free(rep_skb); + return rc; +} + +static int cmd_attr_tgid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 tgid; + int rc; + + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); + if (rc < 0) + return rc; + + rc = -EINVAL; + tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); + if (!stats) goto err; + rc = fill_tgid(tgid, NULL, stats); + if (rc < 0) + goto err; return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; } +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) + return 
cmd_attr_register_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) + return cmd_attr_deregister_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) + return cmd_attr_pid(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) + return cmd_attr_tgid(info); + else + return -EINVAL; +} + static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; -- cgit v1.2.3-18-g5258 From 3d9e0cf1fe007b88db55d43dfdb6839e1a029ca5 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 27 Oct 2010 15:34:44 -0700 Subject: taskstats: split fill_pid function Separate the finding of a task_struct by pid or tgid from filling the taskstats data. This makes the code more readable. Signed-off-by: Michael Holzheu Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9970cae04f1..c8231fb1570 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *tsk, - struct taskstats *stats) +static void fill_stats(struct task_struct *tsk, struct taskstats *stats) { - int rc = 0; - - if (!tsk) { - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return -ESRCH; - } else - get_task_struct(tsk); - memset(stats, 0, sizeof(*stats)); /* * Each accounting subsystem adds calls to its functions to @@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); +} - /* Define err: label here if needed */ - put_task_struct(tsk); - return rc; +static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) +{ + struct task_struct *tsk; + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return -ESRCH; + fill_stats(tsk, stats); + put_task_struct(tsk); + return 0; } -static int fill_tgid(pid_t tgid, struct task_struct *first, - struct taskstats *stats) +static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) { - struct task_struct *tsk; + struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; @@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, * leaders who are already counted with the dead tasks */ rcu_read_lock(); - if (!first) - first = find_task_by_vpid(tgid); + first = find_task_by_vpid(tgid); if (!first || !lock_task_sighand(first, &flags)) goto out; @@ -268,7 +263,6 @@ out: return rc; } - static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; @@ -483,7 +477,7 @@ static int cmd_attr_pid(struct genl_info *info) if (!stats) goto err; - rc = fill_pid(pid, NULL, stats); + rc = fill_stats_for_pid(pid, stats); if (rc < 0) goto err; return send_reply(rep_skb, info); @@ -513,7 +507,7 @@ static int cmd_attr_tgid(struct genl_info *info) if (!stats) goto err; - rc = fill_tgid(tgid, NULL, stats); + rc = fill_stats_for_tgid(tgid, stats); if (rc < 0) goto err; return send_reply(rep_skb, info); @@ -599,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!stats) goto err; - rc = fill_pid(-1, tsk, stats); - if (rc < 0) - goto err; + fill_stats(tsk, stats); /* * 
Doesn't matter if tsk is the leader or the last group member leaving -- cgit v1.2.3-18-g5258 From d57af9b2142f31a39dcfdeb30776baadfc802827 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 27 Oct 2010 15:34:45 -0700 Subject: taskstats: use real microsecond granularity for CPU times The taskstats interface uses microsecond granularity for the user and system time values. The conversion from cputime to the taskstats values uses the cputime_to_msecs primitive which effectively limits the granularity to milliseconds. Add the cputime_to_usecs primitive for architectures that have better, more precise CPU time values. Remove cputime_to_msecs primitive because there are no more users left. Signed-off-by: Michael Holzheu Acked-by: Balbir Singh Cc: Luck Tony Cc: Shailabh Nagar Cc: Martin Schwidefsky Cc: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Thomas Gleixner Cc: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf..24dc60d9fa1 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_ppid = pid_alive(tsk) ? rcu_dereference(tsk->real_parent)->tgid : 0; rcu_read_unlock(); - stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; - stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; - stats->ac_utimescaled = - cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; - stats->ac_stimescaled = - cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; + stats->ac_utime = cputime_to_usecs(tsk->utime); + stats->ac_stime = cputime_to_usecs(tsk->stime); + stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); + stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; -- cgit v1.2.3-18-g5258 From 5de1cb2d0f1c1e5475d2bedf65b76828f8cdde22 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 27 Oct 2010 15:34:52 -0700 Subject: kernel/resource.c: handle reinsertion of an already-inserted resource If the same resource is inserted to the resource tree (maybe not on purpose), a dead loop will be created. In this situation, The kernel does not report any warning or error :( The command below will show a endless print. 
#cat /proc/iomem [akpm@linux-foundation.org: add WARN_ON()] Signed-off-by: Huang Shijie Cc: Jesse Barnes Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5de..9c9841cb690 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -453,6 +453,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou if (first == parent) return first; + if (WARN_ON(first == new)) /* duplicated insertion */ + return first; if ((first->start > new->start) || (first->end < new->end)) break; -- cgit v1.2.3-18-g5258 From 61d8e11e519ee7912ab59610fba1aaf08e3c1d84 Mon Sep 17 00:00:00 2001 From: Zimny Lech Date: Wed, 27 Oct 2010 15:34:53 -0700 Subject: Remove duplicate includes from many files Signed-off-by: Zimny Lech Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_kprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8d2852baa4..2dec9bcde8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include "trace.h" -- cgit v1.2.3-18-g5258 From b842f8faf6c7dc2005c6a70631c1a91bac02f180 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 1 Oct 2010 17:23:41 -0400 Subject: jump label: Fix module __init section race Jump label uses is_module_text_address() to ensure that the module __init sections are valid before updating them. However, between the check for a valid module __init section and the subsequent jump label update, the module's __init section could be freed out from under us. We fix this potential race by adding a notifier callback to the MODULE_STATE_LIVE state. This notifier is called *after* the __init section has been run but before it is going to be freed. In the callback, the jump label code zeros the key value for any __init jump code within the module, and we add a check for a non-zero key value when we update jump labels. In this way we require no additional data structures. Thanks to Mathieu Desnoyers for pointing out this race condition. Reported-by: Mathieu Desnoyers Cc: Masami Hiramatsu Signed-off-by: Jason Baron LKML-Reference: [ Renamed remove_module_init() to remove_jump_label_module_init() as suggested by Masami Hiramatsu. 
] Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7be868bf25c..be9e105345e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -168,7 +168,8 @@ void jump_label_update(unsigned long key, enum jump_label_type type) count = e_module->nr_entries; iter = e_module->table; while (count--) { - if (kernel_text_address(iter->code)) + if (iter->key && + kernel_text_address(iter->code)) arch_jump_label_transform(iter, type); iter++; } @@ -366,6 +367,39 @@ static void remove_jump_label_module(struct module *mod) } } +static void remove_jump_label_module_init(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod != mod) + continue; + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (within_module_init(iter->code, mod)) + iter->key = 0; + iter++; + } + } + } + } +} + static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -386,6 +420,11 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, remove_jump_label_module(mod); mutex_unlock(&jump_label_mutex); break; + case MODULE_STATE_LIVE: + mutex_lock(&jump_label_mutex); + remove_jump_label_module_init(mod); + mutex_unlock(&jump_label_mutex); + break; } return ret; } -- cgit v1.2.3-18-g5258 From 91bad2f8d3057482b9afb599f14421b007136960 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 1 Oct 2010 17:23:48 -0400 Subject: jump label: Fix deadlock b/w jump_label_mutex vs. text_mutex register_kprobe() downs the 'text_mutex' and then calls jump_label_text_reserved(), which downs the 'jump_label_mutex'. However, the jump label code takes those mutexes in the reverse order. Fix by requiring the caller of jump_label_text_reserved() to do the jump label locking via the newly added: jump_label_lock(), jump_label_unlock(). Currently, kprobes is the only user of jump_label_text_reserved(). 
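The lock ordering that results from this fix can be summarised with a schematic caller; this is an outline of the rule, not the literal register_kprobe() code:

  jump_label_lock();                 /* jump_label_mutex is now the outer lock */
  mutex_lock(&text_mutex);
  if (jump_label_text_reserved(addr, addr))
  	goto out;                  /* caller already holds jump_label_mutex, as required */
  /* ... modify or reserve kernel text ... */
  out:
  	mutex_unlock(&text_mutex);
  	jump_label_unlock();

A later patch in this series adjusts register_kprobe() again so that jump_label_lock() is taken outside the preempt-disabled region.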
Reported-by: Ingo Molnar Acked-by: Masami Hiramatsu Signed-off-by: Jason Baron LKML-Reference: <759032c48d5e30c27f0bba003d09bffa8e9f28bb.1285965957.git.jbaron@redhat.com> Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 33 +++++++++++++++++++++------------ kernel/kprobes.c | 6 ++++++ 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index be9e105345e..12cce78e956 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -39,6 +39,16 @@ struct jump_label_module_entry { struct module *mod; }; +void jump_label_lock(void) +{ + mutex_lock(&jump_label_mutex); +} + +void jump_label_unlock(void) +{ + mutex_unlock(&jump_label_mutex); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type) struct jump_label_module_entry *e_module; int count; - mutex_lock(&jump_label_mutex); + jump_label_lock(); entry = get_jump_label_entry((jump_label_t)key); if (entry) { count = entry->nr_entries; @@ -175,7 +185,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type) } } } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); } static int addr_conflict(struct jump_entry *entry, void *start, void *end) @@ -232,6 +242,7 @@ out: * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ @@ -242,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end) struct jump_entry *iter_stop = __start___jump_table; int conflict = 0; - mutex_lock(&jump_label_mutex); iter = iter_start; while (iter < iter_stop) { if (addr_conflict(iter, start, end)) { @@ -257,7 +267,6 @@ int jump_label_text_reserved(void *start, void *end) conflict = module_conflict(start, end); #endif out: - mutex_unlock(&jump_label_mutex); return conflict; } @@ -268,7 +277,7 @@ static __init int init_jump_label(void) struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = build_jump_label_hashtable(__start___jump_table, __stop___jump_table); iter = iter_start; @@ -276,7 +285,7 @@ static __init int init_jump_label(void) arch_jump_label_text_poke_early(iter->code); iter++; } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); return ret; } early_initcall(init_jump_label); @@ -409,21 +418,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = add_jump_label_module(mod); if (ret) remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; case MODULE_STATE_GOING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; case MODULE_STATE_LIVE: - mutex_lock(&jump_label_mutex); + jump_label_lock(); remove_jump_label_module_init(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; } return ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 99865c33a60..9437e14f36b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1146,13 +1146,16 @@ int __kprobes register_kprobe(struct kprobe *p) return ret; preempt_disable(); + jump_label_lock(); if 
(!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { preempt_enable(); + jump_label_unlock(); return -EINVAL; } + jump_label_unlock(); /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1187,6 +1190,8 @@ int __kprobes register_kprobe(struct kprobe *p) INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + jump_label_lock(); /* needed to call jump_label_text_reserved() */ + get_online_cpus(); /* For avoiding text_mutex deadlock. */ mutex_lock(&text_mutex); @@ -1214,6 +1219,7 @@ int __kprobes register_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); put_online_cpus(); + jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) -- cgit v1.2.3-18-g5258 From f7e835710ab5f6e43933c983f38f2d2e262b718c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 26 Jul 2010 13:23:11 +0400 Subject: convert cgroup and cpuset Signed-off-by: Al Viro --- kernel/cgroup.c | 11 +++++------ kernel/cpuset.c | 13 ++++++------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cf366965d0..66a416b42c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1460,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } -static int cgroup_get_sb(struct file_system_type *fs_type, +static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct cgroup_sb_opts opts; struct cgroupfs_root *root; @@ -1596,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_parsed_module_refcounts(opts.subsys_bits); } - simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - return 0; + return dget(sb->s_root); drop_new_super: deactivate_locked_super(sb); @@ -1608,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - return ret; + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) { @@ -1658,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) { static struct file_system_type cgroup_fs_type = { .name = "cgroup", - .get_sb = cgroup_get_sb, + .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51b143e2a07..4349935c2ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); * users. 
If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static int cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) +static struct dentry *cpuset_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, void *data) { struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - int ret = -ENODEV; + struct dentry *ret = ERR_PTR(-ENODEV); if (cgroup_fs) { char mountopts[] = "cpuset,noprefix," "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->get_sb(cgroup_fs, flags, - unused_dev_name, mountopts, mnt); + ret = cgroup_fs->mount(cgroup_fs, flags, + unused_dev_name, mountopts); put_filesystem(cgroup_fs); } return ret; @@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type, static struct file_system_type cpuset_fs_type = { .name = "cpuset", - .get_sb = cpuset_get_sb, + .mount = cpuset_mount, }; /* -- cgit v1.2.3-18-g5258 From de31c3ca8179d7c21def7ecb56e4fec0c8659d36 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Oct 2010 10:38:58 -0400 Subject: jump label: Fix error with preempt disable holding mutex Kprobes and jump label were having a race between mutexes that was fixed by reordering the jump label. But this reordering moved the jump label mutex into a preempt disable location. This patch does a little fiddling to move the grabbing of the jump label mutex from inside the preempt disable section and still keep the order correct between the mutex and the kprobes lock. Reported-by: Ingo Molnar Acked-by: Masami Hiramatsu Cc: Jason Baron Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9437e14f36b..9737a76e106 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1145,17 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p) if (ret) return ret; - preempt_disable(); jump_label_lock(); + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { - preempt_enable(); - jump_label_unlock(); - return -EINVAL; - } - jump_label_unlock(); + jump_label_text_reserved(p->addr, p->addr)) + goto fail_with_jump_label; /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1169,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p) * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } + if (unlikely(!try_module_get(probed_mod))) + goto fail_with_jump_label; + /* * If the module freed .init.text, we couldn't insert * kprobes in there. 
@@ -1180,11 +1175,11 @@ int __kprobes register_kprobe(struct kprobe *p) if (within_module_init((unsigned long)p->addr, probed_mod) && probed_mod->state != MODULE_STATE_COMING) { module_put(probed_mod); - preempt_enable(); - return -EINVAL; + goto fail_with_jump_label; } } preempt_enable(); + jump_label_unlock(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -1226,6 +1221,11 @@ out: module_put(probed_mod); return ret; + +fail_with_jump_label: + preempt_enable(); + jump_label_unlock(); + return -EINVAL; } EXPORT_SYMBOL_GPL(register_kprobe); -- cgit v1.2.3-18-g5258 From 95bcd683fb694a3e2d0538bf486430a0dfbb4111 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Oct 2010 11:02:43 -0400 Subject: jump label: Make arch_jump_label_text_poke_early() optional Some archs do not need to do anything special for jump labels on startup (like MIPS). This patch adds a weak function stub for arch_jump_label_text_poke_early(); Cc: Jason Baron Cc: David Miller Cc: David Daney Suggested-by: Thomas Gleixner LKML-Reference: <1286218615-24011-2-git-send-email-ddaney@caviumnetworks.com> LKML-Reference: <20101015201037.703989993@goodmis.org> Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 12cce78e956..3b79bd93833 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -270,6 +270,13 @@ out: return conflict; } +/* + * Not all archs need this. + */ +void __weak arch_jump_label_text_poke_early(jump_label_t addr) +{ +} + static __init int init_jump_label(void) { int ret; -- cgit v1.2.3-18-g5258 From 931ea24819f2bd40cca2dc214558bfcc3c91549e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 29 Oct 2010 08:04:16 -0500 Subject: kdb: fix per_cpu command to remove supress mask Rusty pointed out that the per_cpu command uses up lots of space on the stack and the cpu supress mask is probably not needed. This patch removes the need for the supress mask as well as fixing up the following problems with the kdb per_cpu command: * The per_cpu command should allow an address as an argument * When you have more data than can be displayed on one screen allow the user to break out of the print loop. 
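A hedged usage sketch of the reworked command; the symbol and address below are only examples and the printed output is not reproduced here:

  [0]kdb> per_cpu runqueues            # one line per online cpu
  [0]kdb> per_cpu 0xffff88000a3c8000   # an arbitrary address is now accepted as well

Because the zero-suppression bookkeeping is gone, every online cpu is printed, and the CMD_INTERRUPT flag check lets the user break out of a long dump.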
Reported-by: Rusty Russell Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 46 +++++++++++---------------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d7bda21a106..9755ac05e44 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv) */ static int kdb_per_cpu(int argc, const char **argv) { - char buf[256], fmtstr[64]; - kdb_symtab_t symtab; - cpumask_t suppress = CPU_MASK_NONE; - int cpu, diag; - unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + char fmtstr[64]; + int cpu, diag, nextarg = 1; + unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; if (argc < 1 || argc > 3) return KDB_ARGCOUNT; - snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); - if (!kdbgetsymval(buf, &symtab)) { - kdb_printf("%s is not a per_cpu variable\n", argv[1]); - return KDB_BADADDR; - } + diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); + if (diag) + return diag; + if (argc >= 2) { diag = kdbgetularg(argv[2], &bytesperword); if (diag) @@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv) #define KDB_PCU(cpu) 0 #endif #endif - for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (whichcpu != ~0UL && whichcpu != cpu) continue; - addr = symtab.sym_start + KDB_PCU(cpu); + addr = symaddr + KDB_PCU(cpu); diag = kdb_getword(&val, addr, bytesperword); if (diag) { kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " "read, diag=%d\n", cpu, addr, diag); continue; } -#ifdef CONFIG_SMP - if (!val) { - cpu_set(cpu, suppress); - continue; - } -#endif /* CONFIG_SMP */ kdb_printf("%5d ", cpu); kdb_md_line(fmtstr, addr, bytesperword == KDB_WORD_SIZE, 1, bytesperword, 1, 1, 0); } - if (cpus_weight(suppress) == 0) - return 0; - kdb_printf("Zero suppressed cpu(s):"); - for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); - cpu = next_cpu(cpu, suppress)) { - kdb_printf(" %d", cpu); - if (cpu == num_possible_cpus() - 1 || - next_cpu(cpu, suppress) != cpu + 1) - continue; - while (cpu < num_possible_cpus() && - next_cpu(cpu, suppress) == cpu + 1) - ++cpu; - kdb_printf("-%d", cpu); - } - kdb_printf("\n"); - #undef KDB_PCU - return 0; } -- cgit v1.2.3-18-g5258 From 578bd4dfcda63d2ef15f025f1d5d55c0e56b9660 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 29 Oct 2010 13:14:41 -0500 Subject: kdb: Fix early debugging crash regression The kdb_current legally be equal to NULL in the early boot of the x86 arch. The problem pcan be observed by booting with the kernel arguments: earlyprintk=vga ekgdboc=kbd kgdbwait The kdb shell will oops on entry and recursively fault because it cannot get past the final stage of shell initialization. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 9755ac05e44..37755d62192 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, /* special case below */ } else { kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", - kdb_current, kdb_current->pid); + kdb_current, kdb_current ? 
kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); #endif -- cgit v1.2.3-18-g5258 From d7ba979d45272385ce0fdf141d922e61ff48e07b Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Wed, 18 Aug 2010 06:02:00 -0500 Subject: debug_core,x86,blackfin: Clean up hw debug disable API The kgdb_disable_hw_debug() was an architecture specific function for disabling all hardware breakpoints on a per cpu basis when entering the debug core. This patch will remove the weak function kdbg_disable_hw_debug() and change it into a call back which lives with the rest of hw breakpoint call backs in struct kgdb_arch. Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index fec596da9bd..cefd4a11f6d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) return 0; } -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. - * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - /* * Some architectures need cache flushes when we set/clear a * breakpoint: @@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, atomic_inc(&masters_in_kgdb); else atomic_inc(&slaves_in_kgdb); - kgdb_disable_hw_debug(ks->linux_regs); + + if (arch_kgdb_ops.disable_hw_break) + arch_kgdb_ops.disable_hw_break(regs); acquirelock: /* -- cgit v1.2.3-18-g5258 From d29be158a68254f58cf1fbf60ce1e89557a321aa Mon Sep 17 00:00:00 2001 From: Miloslav Trmac Date: Thu, 16 Sep 2010 18:14:11 -0400 Subject: Audit: add support to match lsm labels on user audit messages Add support for matching by security label (e.g. SELinux context) of the sender of an user-space audit record. The audit filter code already allows user space to configure such filters, but they were ignored during evaluation. This patch implements evaluation of these filters. For example, after application of this patch, PAM authentication logs caused by cron can be disabled using auditctl -a user,never -F subj_type=crond_t Signed-off-by: Miloslav Trmac Acked-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditfilter.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eb7675499fb..add2819af71 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, case AUDIT_LOGINUID: result = audit_comparator(cb->loginuid, f->op, f->val); break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) + result = security_audit_rule_match(cb->sid, + f->type, + f->op, + f->lsm_rule, + NULL); + break; } if (!result) -- cgit v1.2.3-18-g5258 From b8800aa5d9c7e4e2869321c77b80f322a0d9663a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 20 Oct 2010 17:23:50 -0700 Subject: audit: make functions static I was doing some namespace checks and found some simple stuff in audit that could be cleaned up. 
Make some functions static, and put const on make_reply payload arg. Signed-off-by: Stephen Hemminger Signed-off-by: Al Viro --- kernel/audit.c | 6 +++--- kernel/audit.h | 5 +---- kernel/audit_watch.c | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d96045789b5..a300931fc45 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -506,7 +506,7 @@ int audit_send_list(void *_dest) } struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, - int multi, void *payload, int size) + int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; @@ -555,8 +555,8 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the pid. * No failure notifications. */ -void audit_send_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size) +static void audit_send_reply(int pid, int seq, int type, int done, int multi, + const void *payload, int size) { struct sk_buff *skb; struct task_struct *tsk; diff --git a/kernel/audit.h b/kernel/audit.h index f7206db4e13..91e7071c4d2 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size); -extern void audit_send_reply(int pid, int seq, int type, - int done, int multi, - void *payload, int size); + const void *payload, int size); extern void audit_panic(const char *message); struct audit_netlink_list { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f0c9b2e7542..d2e3c786646 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -60,7 +60,7 @@ struct audit_parent { }; /* fsnotify handle. */ -struct fsnotify_group *audit_watch_group; +static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ @@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch) } } -void audit_remove_watch(struct audit_watch *watch) +static void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); audit_put_parent(watch->parent); -- cgit v1.2.3-18-g5258 From f7a998a9491f2da1d3e44d150aa611d10093da4f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Oct 2010 02:18:32 -0400 Subject: in untag_chunk() we need to do alloc_chunk() a bit earlier ... while we are not holding spinlocks. 
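[editor's note: the constraint being worked around below, spelled out: alloc_chunk() presumably performs a sleeping (GFP_KERNEL) allocation, which must not happen while a spinlock such as entry->lock is held; the fix pre-allocates before taking the lock and throws the chunk away if it turns out not to be needed. The general pattern, as a sketch with invented names ("foo", "foo_lock", new_is_needed()); allocation-failure fallback omitted:]

        /* sketch of "allocate before locking, discard if unused" */
        struct foo *new = kmalloc(sizeof(*new), GFP_KERNEL);    /* may sleep, so done unlocked */

        spin_lock(&foo_lock);
        if (!new_is_needed()) {
                spin_unlock(&foo_lock);
                kfree(new);             /* kfree(NULL) is a no-op, so no extra check needed */
                return;
        }
        /* ... install "new" while the lock is held ... */
        spin_unlock(&foo_lock);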
Signed-off-by: Al Viro --- kernel/audit_tree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 7f18d3a4527..37b2bea170c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -223,7 +223,7 @@ static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); struct fsnotify_mark *entry = &chunk->mark; - struct audit_chunk *new; + struct audit_chunk *new = NULL; struct audit_tree *owner; int size = chunk->count - 1; int i, j; @@ -232,9 +232,14 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); + if (size) + new = alloc_chunk(size); + spin_lock(&entry->lock); if (chunk->dead || !entry->i.inode) { spin_unlock(&entry->lock); + if (new) + free_chunk(new); goto out; } @@ -255,9 +260,9 @@ static void untag_chunk(struct node *p) goto out; } - new = alloc_chunk(size); if (!new) goto Fallback; + fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); -- cgit v1.2.3-18-g5258 From 3c80fe4ac9cfb13b1bfa4edf1544e8b656716694 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 14:19:31 +0000 Subject: audit: Call tty_audit_push_task() outside preempt disabled While auditing all tasklist_lock read_lock sites I stumbled over the following call chain: audit_prepare_user_tty() read_lock(&tasklist_lock); tty_audit_push_task(); mutex_lock(&buf->mutex); --> buf->mutex is locked with preemption disabled. Solve this by acquiring a reference to the task struct under rcu_read_lock and call tty_audit_push_task outside of the preempt disabled region. Move all code which needs to be protected by sighand lock into tty_audit_push_task() and use lock/unlock_sighand as we do not hold tasklist_lock. Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: Eric Paris Cc: Oleg Nesterov Signed-off-by: Al Viro --- kernel/audit.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a300931fc45..8429afea37b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) struct task_struct *tsk; int err; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - err = -ESRCH; - if (!tsk) - goto out; - err = 0; - - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->audit_tty) - err = -EPERM; - spin_unlock_irq(&tsk->sighand->siglock); - if (err) - goto out; - - tty_audit_push_task(tsk, loginuid, sessionid); -out: - read_unlock(&tasklist_lock); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + err = tty_audit_push_task(tsk, loginuid, sessionid); + put_task_struct(tsk); return err; } -- cgit v1.2.3-18-g5258 From 207032051a5ed38df332729ba42e98e9a1e60434 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 14:19:35 +0000 Subject: audit: Do not send uninitialized data for AUDIT_TTY_GET audit_receive_msg() sends uninitialized data for AUDIT_TTY_GET when the task was not found. Send reply only when task was found. 
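[editor's note: the bug class here is replying with an on-stack structure that is only conditionally filled in; when the task lookup fails, the reply is still sent over netlink and leaks whatever happened to be on the kernel stack. Distilled shape of the problem and of the fix applied below, with invented names (struct reply, fill_reply(), send_reply()):]

        struct reply s;                 /* on-stack, not zeroed */
        int err = fill_reply(&s);       /* may fail and leave s untouched */

        /* before: reply sent unconditionally, possibly exposing stack data */
        /* after:  reply sent only when s was actually initialized */
        if (!err)
                send_reply(&s, sizeof(s));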
Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8429afea37b..57f4038694d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -884,8 +884,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) spin_unlock_irq(&tsk->sighand->siglock); } read_unlock(&tasklist_lock); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, - &s, sizeof(s)); + + if (!err) + audit_send_reply(NETLINK_CB(skb).pid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { -- cgit v1.2.3-18-g5258 From ab263f47c9781a644de8b28013434b645082922e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 14:19:41 +0000 Subject: audit: Use rcu for task lookup protection Protect the task lookups in audit_receive_msg() with rcu_read_lock() instead of tasklist_lock and use lock/unlock_sighand to protect against the exit race. Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: Eric Paris Cc: Oleg Nesterov Signed-off-by: Al Viro --- kernel/audit.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 57f4038694d..77770a034d5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -873,17 +873,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk; + unsigned long flags; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); if (!err) audit_send_reply(NETLINK_CB(skb).pid, seq, @@ -893,22 +892,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_SET: { struct audit_tty_status *s; struct task_struct *tsk; + unsigned long flags; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); break; } default: -- cgit v1.2.3-18-g5258 From 120a795da07c9a02221ca23464c28a7c6ad7de1d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Oct 2010 02:54:44 -0400 Subject: audit mmap Normal syscall audit doesn't catch 5th argument of syscall. It also doesn't catch the contents of userland structures pointed to be syscall argument, so for both old and new mmap(2) ABI it doesn't record the descriptor we are mapping. For old one it also misses flags. 
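[editor's note: this log is limited to kernel/ (see the diffstat), so only the context-recording side of the new hook appears below; the user-facing wrapper and the mmap syscall call sites are assumed to live in headers and mm code outside this view. Audit hooks of this kind are conventionally exposed as a thin inline that short-circuits when the task has no active audit context; a sketch of what that wrapper is assumed to look like:]

        /* assumed header-side wrapper; the real change is outside kernel/ and not shown here */
        static inline void audit_mmap_fd(int fd, int flags)
        {
                if (unlikely(!audit_dummy_context()))
                        __audit_mmap_fd(fd, flags);
        }

[the resulting record then carries the otherwise-missing information as "fd=<n> flags=0x<hex>", per the audit_log_format() call added to show_special() below.]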
Signed-off-by: Al Viro --- kernel/auditsc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1b31c130d03..f49a0318c2e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -241,6 +241,10 @@ struct audit_context { pid_t pid; struct audit_cap_data cap; } capset; + struct { + int fd; + int flags; + } mmap; }; int fds[2]; @@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); break; } + case AUDIT_MMAP: { + audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, + context->mmap.flags); + break; } } audit_log_end(ab); } @@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid, context->type = AUDIT_CAPSET; } +void __audit_mmap_fd(int fd, int flags) +{ + struct audit_context *context = current->audit_context; + context->mmap.fd = fd; + context->mmap.flags = flags; + context->type = AUDIT_MMAP; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value -- cgit v1.2.3-18-g5258 From 408af87a397a8ddef56ad39a79481f592aa1ac1a Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 4 Nov 2010 21:44:41 +0100 Subject: Clean up relay_alloc_page_array() slightly by using vzalloc rather than vmalloc and memset We can optimize kernel/relay.c::relay_alloc_page_array() slightly by using vzalloc. The patch makes these changes: - use vzalloc instead of vmalloc+memset. - remove redundant local variable 'array'. - declare local 'pa_size' as const. Cuts down nicely on both source and object-code size. Signed-off-by: Jesper Juhl Acked-by: Pekka Enberg Acked-by: Mathieu Desnoyers Signed-off-by: Linus Torvalds --- kernel/relay.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb92..859ea5a9605 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - struct page **array; - size_t pa_size = n_pages * sizeof(struct page *); - - if (pa_size > PAGE_SIZE) { - array = vmalloc(pa_size); - if (array) - memset(array, 0, pa_size); - } else { - array = kzalloc(pa_size, GFP_KERNEL); - } - return array; + const size_t pa_size = n_pages * sizeof(struct page *); + if (pa_size > PAGE_SIZE) + return vzalloc(pa_size); + return kzalloc(pa_size, GFP_KERNEL); } /* -- cgit v1.2.3-18-g5258 From e0a70217107e6f9844628120412cb27bb4cea194 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 5 Nov 2010 16:53:42 +0100 Subject: posix-cpu-timers: workaround to suppress the problems with mt exec posix-cpu-timers.c correctly assumes that the dying process does posix_cpu_timers_exit_group() and removes all !CPUCLOCK_PERTHREAD timers from signal->cpu_timers list. But, it also assumes that timer->it.cpu.task is always the group leader, and thus the dead ->task means the dead thread group. This is obviously not true after de_thread() changes the leader. After that almost every posix_cpu_timer_ method has problems. It is not simple to fix this bug correctly. First of all, I think that timer->it.cpu should use struct pid instead of task_struct. Also, the locking should be reworked completely. In particular, tasklist_lock should not be used at all. This all needs a lot of nontrivial and hard-to-test changes. 
Change __exit_signal() to do posix_cpu_timers_exit_group() when the old leader dies during exec. This is not the fix, just the temporary hack to hide the problem for 2.6.37 and stable. IOW, this is obviously wrong but this is what we currently have anyway: cpu timers do not work after mt exec. In theory this change adds another race. The exiting leader can detach the timers which were attached to the new leader. However, the window between de_thread() and release_task() is small, we can pretend that sys_timer_create() was called before de_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b194febf579..21aa7b3001f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk) tty = sig->tty; sig->tty = NULL; } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + /* * If there is any task waiting for the group exit * then notify it: -- cgit v1.2.3-18-g5258 From 433039e97f672b81e6c8f6daef385dcf035c6e29 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 5 Nov 2010 16:17:39 -0700 Subject: watchdog: Fix section mismatch and potential undefined behavior. Commit d9ca07a05ce1 ("watchdog: Avoid kernel crash when disabling watchdog") introduces a section mismatch. Now that we reference no_watchdog from non-__init code it can no longer be __initdata. Signed-off-by: David Daney Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d..6e3c41a4024 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __initdata no_watchdog; +static int no_watchdog; /* boot commands */ -- cgit v1.2.3-18-g5258 From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. 
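[editor's note: the suggested blktrace follow-up would be small; in blk_fill_rwbs() it would amount to emitting extra characters for the flush/FUA bits alongside the existing REQ_SYNC/REQ_META handling, roughly as sketched here. This is only an illustration of the suggestion in the message above; the flag letters are placeholders, not a committed interface:]

        if (rw & REQ_FLUSH)
                rwbs[i++] = 'F';        /* placeholder letter for flush */
        if (rw & REQ_FUA)
                rwbs[i++] = 'U';        /* placeholder letter for FUA */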
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) -- cgit v1.2.3-18-g5258 From eed01528a45dc4138e9a08064b4b6cc1a9426899 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 26 Oct 2010 16:08:01 +0200 Subject: perf_events: Fix time tracking in samples This patch corrects time tracking in samples. Without this patch both time_enabled and time_running are bogus when user asks for PERF_SAMPLE_READ. One uses PERF_SAMPLE_READ to sample the values of other counters in each sample. Because of multiplexing, it is necessary to know both time_enabled, time_running to be able to scale counts correctly. In this second version of the patch, we maintain a shadow copy of ctx->time which allows us to compute ctx->time without calling update_context_time() from NMI context. We avoid the issue that update_context_time() must always be called with ctx->lock held. We do not keep shadow copies of the other event timings because if the lead event is overflowing then it is active and thus it's been scheduled in via event_sched_in() in which case neither tstamp_stopped, tstamp_running can be modified. This timing logic only applies to samples when PERF_SAMPLE_READ is used. Note that this patch does not address timing issues related to sampling inheritance between tasks. This will be addressed in a future patch. 
With this patch, the libpfm4 example task_smpl now reports correct counts (shown on 2.4GHz Core 2): $ task_smpl -p 2400000000 -e unhalted_core_cycles:u,instructions_retired:u,baclears noploop 5 noploop for 5 seconds IIP:0x000000004006d6 PID:5596 TID:5596 TIME:466,210,211,430 STREAM_ID:33 PERIOD:2,400,000,000 ENA=1,010,157,814 RUN=1,010,157,814 NR=3 2,400,000,254 unhalted_core_cycles:u (33) 2,399,273,744 instructions_retired:u (34) 53,340 baclears (35) Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cc6e14b.1e07e30a.256e.5190@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f498..cb6c0d2af68 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -674,6 +674,8 @@ event_sched_in(struct perf_event *event, event->tstamp_running += ctx->time - event->tstamp_stopped; + event->shadow_ctx_time = ctx->time - ctx->timestamp; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -3396,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[4]; @@ -3404,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + + values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + + values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) @@ -3421,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. */ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; @@ -3431,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; + values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; + values[n++] = running; if (leader != event) leader->pmu->read(leader); @@ -3459,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, } } +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. 
+ * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event); + perf_output_read_group(handle, event, enabled, running); else - perf_output_read_one(handle, event); + perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, -- cgit v1.2.3-18-g5258 From 834b40380e93e36f1c9b48ec1d280cebe3d7bd8c Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Thu, 11 Nov 2010 14:05:14 -0800 Subject: kernel/range.c: fix clean_sort_range() for the case of full array clean_sort_range() should return a number of nonempty elements of range array, but if the array is full clean_sort_range() returns 0. The problem is that the number of nonempty elements is evaluated by finding the first empty element of the array. If there is no such element it returns an initial value of local variable nr_range that is zero. The fix is trivial: it changes initial value of nr_range to size of the array. The bug can lead to loss of information regarding all ranges, since typically returned value of clean_sort_range() is considered as an actual number of ranges in the array after a series of add/subtract operations. Found by Analytical Verification project of Linux Verification Center (linuxtesting.org), thanks to Alexander Kolosov. Signed-off-by: Alexey Khoroshilov Cc: Yinghai Lu Cc: "H. Peter Anvin" Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb..37fa9b99ad5 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2) int clean_sort_range(struct range *range, int az) { - int i, j, k = az - 1, nr_range = 0; + int i, j, k = az - 1, nr_range = az; for (i = 0; i < k; i++) { if (range[i].end) -- cgit v1.2.3-18-g5258 From 38715258aa2e8cd94bd4aafadc544e5104efd551 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 11 Nov 2010 14:05:16 -0800 Subject: latencytop: fix per task accumulator Per task latencytop accumulator prematurely terminates due to erroneous placement of latency_record_count. It should be incremented whenever a new record is allocated instead of increment on every latencytop event. Also fix search iterator to only search known record events instead of blindly searching all pre-allocated space. 
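[editor's note: the essence of the latencytop fix below, restated as a sketch using the names from the patch: the merge scan must walk only the records that are in use, and latency_record_count must advance only when a new slot is actually consumed, not on every event:]

        /* sketch of the corrected bookkeeping in __account_scheduler_latency() */
        for (i = 0; i < tsk->latency_record_count; i++) {
                /* try to fold the new sample into an existing record;
                 * on a match, update it and return */
        }
        if (tsk->latency_record_count >= LT_SAVECOUNT)
                goto out_unlock;                        /* table full: drop the sample */
        i = tsk->latency_record_count++;                /* counter grows only here */
        memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));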
Signed-off-by: Ken Chen Reviewed-by: Arjan van de Ven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d41..17110a4a4fc 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) account_global_scheduler_latency(tsk, &lat); - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT; i++) { + for (i = 0; i < tsk->latency_record_count; i++) { struct latency_record *mylat; int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) } } + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + /* Allocated a new one: */ - i = tsk->latency_record_count; + i = tsk->latency_record_count++; memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: -- cgit v1.2.3-18-g5258 From eaf06b241b091357e72b76863ba16e89610d31bd Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Thu, 11 Nov 2010 14:05:18 -0800 Subject: Restrict unprivileged access to kernel syslog The kernel syslog contains debugging information that is often useful during exploitation of other vulnerabilities, such as kernel heap addresses. Rather than futilely attempt to sanitize hundreds (or thousands) of printk statements and simultaneously cripple useful debugging functionality, it is far simpler to create an option that prevents unprivileged users from reading the syslog. This patch, loosely based on grsecurity's GRKERNSEC_DMESG, creates the dmesg_restrict sysctl. When set to "0", the default, no restrictions are enforced. When set to "1", only users with CAP_SYS_ADMIN can read the kernel syslog via dmesg(8) or other mechanisms. 
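[editor's note: concretely, the patch below adds a kernel.dmesg_restrict sysctl to kern_table, so an administrator can enable the restriction at runtime by writing 1 to /proc/sys/kernel/dmesg_restrict, with CONFIG_SECURITY_DMESG_RESTRICT selecting the default. In this kernel/-limited view the enforcement itself is not visible here; it initially sits in the capabilities code, and the follow-up commit below open-codes it in do_syslog(), where it reduces to:]

        /* simplified from the do_syslog() change in the follow-up patch below */
        if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
                return -EPERM;          /* unprivileged reads of the log buffer are refused */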
[akpm@linux-foundation.org: explain the config option in kernel.txt] Signed-off-by: Dan Rosenberg Acked-by: Ingo Molnar Acked-by: Eugene Teo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 ++++++ kernel/sysctl.c | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c37..38e7d5868d6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void) } #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799..b65bf634035 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -703,6 +703,15 @@ static struct ctl_table kern_table[] = { .extra2 = &ten_thousand, }, #endif + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ngroups_max", .data = &ngroups_max, -- cgit v1.2.3-18-g5258 From 12b3052c3ee8f508b2c7ee4ddd63ed03423409d8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 15 Nov 2010 18:36:29 -0500 Subject: capabilities/syslog: open code cap_syslog logic to fix build failure The addition of CONFIG_SECURITY_DMESG_RESTRICT resulted in a build failure when CONFIG_PRINTK=n. This is because the capabilities code which used the new option was built even though the variable in question didn't exist. The patch here fixes this by moving the capabilities checks out of the LSM and into the caller. All (known) LSMs should have been calling the capabilities hook already so it actually makes the code organization better to eliminate the hook altogether. Signed-off-by: Eric Paris Acked-by: James Morris Signed-off-by: Linus Torvalds --- kernel/printk.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 38e7d5868d6..9a2264fc42c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -274,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) char c; int error = 0; - error = security_syslog(type, from_file); + /* + * If this is from /proc/kmsg we only do the capabilities checks + * at open time. + */ + if (type == SYSLOG_ACTION_OPEN || !from_file) { + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + error = security_syslog(type); if (error) return error; -- cgit v1.2.3-18-g5258