From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:36 +0900 Subject: x86: Panic on detection of stack overflow Currently, only a message is printed when a stack overflow is detected, which is not sufficient for systems that require high reliability. In general the overflow may corrupt data, and additional corruption may occur from reading the corrupted data unless the system stops. This patch adds the sysctl parameter kernel.panic_on_stackoverflow and, according to the parameter, causes a panic on detection of an overflow of the kernel, IRQ, or exception stacks (user stack overflows are excluded). It is disabled by default. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae271964385..f487f257e05 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "bootloader_type", .data = &bootloader_type, -- cgit v1.2.3-70-g09d2 From 6e736be7f282fff705db7c34a15313281b372a76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:38 +0100 Subject: block: make ioc get/put interface more conventional and fix race on allocation Ignoring copy_io() during fork, io_context can be allocated from two places - current_io_context() and set_task_ioprio(). The former is always called from the local task while the latter can be called from a different task. The synchronization between them is peculiar and dubious. * current_io_context() doesn't grab task_lock() and assumes that if it saw %NULL ->io_context, it would stay that way until allocation and assignment are complete. It has smp_wmb() between alloc/init and assignment. * set_task_ioprio() grabs task_lock() for assignment and does smp_read_barrier_depends() between "ioc = task->io_context" and "if (ioc)". Unfortunately, this doesn't achieve anything - the latter is not a dependent load of the former. i.e., if ioc itself were being dereferenced ("ioc->xxx") it would mean something (though it is unclear what), but as the code currently stands, the dependent read barrier is a noop. As only one of the two test-assignment sequences is task_lock() protected, task_lock() can't do much about the race between the two. Nothing prevents current_io_context() and set_task_ioprio() from each allocating its own ioc for the same task and overwriting the other's. Also, set_task_ioprio() can race with an exiting task and create a new ioc after exit_io_context() is finished. ioc get/put doesn't have any reason to be complex. The only hot path is accessing the existing ioc of %current, which is simple to achieve given that ->io_context is never destroyed as long as the task is alive. All other paths can happily go through task_lock() like all other task sub-structures without impacting anything. This patch updates ioc get/put so that it becomes more conventional. * alloc_io_context() is replaced with get_task_io_context(). This is the only interface which can acquire access to the ioc of another task.
On return, the caller has an explicit reference to the object which should be put using put_io_context() afterwards. * The functionality of current_io_context() remains the same but when creating a new ioc, it shares the code path with get_task_io_context() and always goes through task_lock(). * get_io_context() now means incrementing ref on an ioc which the caller already has access to (be that an explicit refcnt or implicit %current one). * PF_EXITING inhibits creation of new io_context and once exit_io_context() is finished, it's guaranteed that both ioc acquisition functions return %NULL. * All users are updated. Most are trivial but smp_read_barrier_depends() removal from cfq_get_io_context() needs a bit of explanation. I suppose the original intention was to ensure ioc->ioprio is visible when set_task_ioprio() allocates new io_context and installs it; however, this wouldn't have worked because set_task_ioprio() doesn't have wmb between init and install. There are other problems with this which will be fixed in another patch. * While at it, use NUMA_NO_NODE instead of -1 for wildcard node specification. -v2: Vivek spotted contamination from debug patch. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++-- block/blk-ioc.c | 99 +++++++++++++++++++++++++++++++---------------- block/blk.h | 1 + block/cfq-iosched.c | 18 ++++----- fs/ioprio.c | 21 ++-------- include/linux/iocontext.h | 4 +- kernel/fork.c | 8 ++-- 7 files changed, 91 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8f630cec906..4b001dcd85b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1645,11 +1645,12 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; - task_lock(tsk); - ioc = tsk->io_context; - if (ioc) + /* we don't lose anything even if ioc allocation fails */ + ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { ioc->cgroup_changed = 1; - task_unlock(tsk); + put_io_context(ioc); + } } void blkio_policy_register(struct blkio_policy_type *blkiop) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 8bebf06bac7..b13ed96776c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -16,6 +16,19 @@ */ static struct kmem_cache *iocontext_cachep; +/** + * get_io_context - increment reference count to io_context + * @ioc: io_context to get + * + * Increment reference count to @ioc. 
+ */ +void get_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + atomic_long_inc(&ioc->refcount); +} +EXPORT_SYMBOL(get_io_context); + static void cfq_dtor(struct io_context *ioc) { if (!hlist_empty(&ioc->cic_list)) { @@ -71,6 +84,9 @@ void exit_io_context(struct task_struct *task) { struct io_context *ioc; + /* PF_EXITING prevents new io_context from being attached to @task */ + WARN_ON_ONCE(!(current->flags & PF_EXITING)); + task_lock(task); ioc = task->io_context; task->io_context = NULL; @@ -82,7 +98,9 @@ void exit_io_context(struct task_struct *task) put_io_context(ioc); } -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +static struct io_context *create_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node, + bool take_ref) { struct io_context *ioc; @@ -98,6 +116,20 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->cic_list); + /* try to install, somebody might already have beaten us to it */ + task_lock(task); + + if (!task->io_context && !(task->flags & PF_EXITING)) { + task->io_context = ioc; + } else { + kmem_cache_free(iocontext_cachep, ioc); + ioc = task->io_context; + } + + if (ioc && take_ref) + get_io_context(ioc); + + task_unlock(task); return ioc; } @@ -114,46 +146,47 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) */ struct io_context *current_io_context(gfp_t gfp_flags, int node) { - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } + might_sleep_if(gfp_flags & __GFP_WAIT); - return ret; + if (current->io_context) + return current->io_context; + + return create_task_io_context(current, gfp_flags, node, false); } +EXPORT_SYMBOL(current_io_context); -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. +/** + * get_task_io_context - get io_context of a task + * @task: task of interest + * @gfp_flags: allocation flags, used if allocation is necessary + * @node: allocation node, used if allocation is necessary + * + * Return io_context of @task. If it doesn't exist, it is created with + * @gfp_flags and @node. The returned io_context has its reference count + * incremented. * - * This is always called in the context of the task which submitted the I/O. + * This function always goes through task_lock() and it's better to use + * current_io_context() + get_io_context() for %current. */ -struct io_context *get_io_context(gfp_t gfp_flags, int node) +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { - struct io_context *ioc = NULL; - - /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. 
- */ - do { - ioc = current_io_context(gfp_flags, node); - if (unlikely(!ioc)) - break; - } while (!atomic_long_inc_not_zero(&ioc->refcount)); + struct io_context *ioc; - return ioc; + might_sleep_if(gfp_flags & __GFP_WAIT); + + task_lock(task); + ioc = task->io_context; + if (likely(ioc)) { + get_io_context(ioc); + task_unlock(task); + return ioc; + } + task_unlock(task); + + return create_task_io_context(task, gfp_flags, node, true); } -EXPORT_SYMBOL(get_io_context); +EXPORT_SYMBOL(get_task_io_context); static int __init blk_ioc_init(void) { diff --git a/block/blk.h b/block/blk.h index aae4d88fc52..fc3c41b2fd2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,6 +122,7 @@ static inline int blk_should_fake_timeout(struct request_queue *q) } #endif +void get_io_context(struct io_context *ioc); struct io_context *current_io_context(gfp_t gfp_flags, int node); int ll_back_merge_fn(struct request_queue *q, struct request *req, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ec3f5e8ba56..d42d89ccce1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,6 +14,7 @@ #include #include #include +#include "blk.h" #include "cfq.h" /* @@ -3194,13 +3195,13 @@ static struct cfq_io_context * cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct io_context *ioc = NULL; - struct cfq_io_context *cic; + struct cfq_io_context *cic = NULL; might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_mask, cfqd->queue->node); + ioc = current_io_context(gfp_mask, cfqd->queue->node); if (!ioc) - return NULL; + goto err; cic = cfq_cic_lookup(cfqd, ioc); if (cic) @@ -3211,10 +3212,10 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err; if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) - goto err_free; - + goto err; out: - smp_read_barrier_depends(); + get_io_context(ioc); + if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); @@ -3223,10 +3224,9 @@ out: cfq_ioc_set_cgroup(ioc); #endif return cic; -err_free: - cfq_cic_free(cic); err: - put_io_context(ioc); + if (cic) + cfq_cic_free(cic); return NULL; } diff --git a/fs/ioprio.c b/fs/ioprio.c index f79dab83e17..998ec239d1e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -48,28 +48,13 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - task_lock(task); - do { - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); - if (ioc) - break; - - ioc = alloc_io_context(GFP_ATOMIC, -1); - if (!ioc) { - err = -ENOMEM; - break; - } - task->io_context = ioc; - } while (1); - - if (!err) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { ioc->ioprio = ioprio; ioc->ioprio_changed = 1; + put_io_context(ioc); } - task_unlock(task); return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 8a6ecb66346..28bb621ef5a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -78,8 +78,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -struct io_context *get_io_context(gfp_t gfp_flags, int node); -struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..5bcfc739bb7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -870,6 
+870,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; + struct io_context *new_ioc; if (!ioc) return 0; @@ -881,11 +882,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; } else if (ioprio_valid(ioc->ioprio)) { - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); - if (unlikely(!tsk->io_context)) + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) return -ENOMEM; - tsk->io_context->ioprio = ioc->ioprio; + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3-70-g09d2 From b2efa05265d62bc29f3a64400fad4b44340eedb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:39 +0100 Subject: block, cfq: unlink cfq_io_context's immediately A cic is the association between an io_context and a request_queue. It is linked from both ioc and q and should be destroyed when either one goes away. As ioc and q both have their own locks, locking becomes a bit complex - both orders work for removal from one side but not from the other. Currently, cfq tries to circumvent this locking order issue with RCU. ioc->lock nests inside queue_lock, but the radix tree and cic's are also protected by RCU, allowing either side to walk their lists without grabbing a lock. This rather unconventional use of RCU quickly devolves into extremely fragile convolution, e.g. the following is from cfqd going away too soon after ioc and q exits raced.

general protection fault: 0000 [#1] PREEMPT SMP
CPU 2
Modules linked in:
[   88.503444] Pid: 599, comm: hexdump Not tainted 3.1.0-rc10-work+ #158 Bochs Bochs
RIP: 0010:[]  [] cfq_exit_single_io_context+0x58/0xf0
...
Call Trace:
 [] call_for_each_cic+0x5a/0x90
 [] cfq_exit_io_context+0x15/0x20
 [] exit_io_context+0x100/0x140
 [] do_exit+0x579/0x850
 [] do_group_exit+0x5b/0xd0
 [] sys_exit_group+0x17/0x20
 [] system_call_fastpath+0x16/0x1b

The only real hot path here is cic lookup during request initialization, and avoiding extra locking there requires very confined use of RCU. This patch makes cic removal from both ioc and request_queue perform double-locking and unlink immediately. * From the q side, the change is almost trivial as ioc->lock nests inside queue_lock. It just needs to grab each ioc->lock as it walks cic_list and unlink it. * From the ioc side, it's a bit more difficult because of the inverted lock order. ioc needs its lock to walk its cic_list but can't grab the matching queue_lock, and so needs to perform unlock-relock dancing. Unlinking is now wholly done from put_io_context() and the fast path is optimized by using the queue_lock the caller already holds, which is by far the most common case. If the ioc accessed multiple devices, it tries with trylock. In the unlikely case of fast path failure, it falls back to the full double-locking dance from a workqueue. Double-locking isn't the prettiest thing in the world, but it's *far* simpler and more understandable than the RCU trick, without adding any meaningful overhead. This still leaves a lot of now-unnecessary RCU logic. Future patches will trim it. -v2: Vivek pointed out that cic->q was being dereferenced after cic->release() was called. Updated to use local variable @this_q instead.
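As an editorial aside, the trylock-or-defer pattern above is subtle, so here is a minimal, self-contained userspace sketch of the same idea using pthreads. Every name in it (outer_lock, inner_lock, put_ref_fast_path, release_slow_path, deferred) is a hypothetical stand-in, not kernel API; the authoritative implementation is put_io_context()/ioc_release_fn() in the diff below.

	#include <pthread.h>
	#include <stdio.h>

	/* Lock order is outer -> inner (cf. queue_lock -> ioc->lock). */
	static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER;
	static int deferred;	/* stands in for schedule_work(&ioc->release_work) */

	/* Slow path: free to take both locks in the proper order. */
	static void release_slow_path(void)
	{
		pthread_mutex_lock(&outer_lock);
		pthread_mutex_lock(&inner_lock);
		puts("released via slow path");
		pthread_mutex_unlock(&inner_lock);
		pthread_mutex_unlock(&outer_lock);
	}

	/* Fast path: already holds the inner lock, so the outer lock may
	 * only be trylock'd; on failure the release is punted to a context
	 * that can take the locks in the proper order. */
	static void put_ref_fast_path(void)
	{
		pthread_mutex_lock(&inner_lock);
		if (pthread_mutex_trylock(&outer_lock) == 0) {
			puts("released via fast path");
			pthread_mutex_unlock(&outer_lock);
		} else {
			deferred = 1;
		}
		pthread_mutex_unlock(&inner_lock);
	}

	int main(void)
	{
		put_ref_fast_path();
		if (deferred)
			release_slow_path();
		return 0;
	}

The asymmetry is the whole point: the common single-device case never leaves the fast path, and the deferred slow path only runs when trylock actually fails.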
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-ioc.c | 166 ++++++++++++++++++++++++++++++++++++++-------- block/cfq-iosched.c | 44 +++--------- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 + include/linux/iocontext.h | 12 ++-- kernel/fork.c | 2 +- 7 files changed, 159 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dc00835aab6..27886935804 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1649,7 +1649,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc); + put_io_context(ioc, NULL); } } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 6f59fbad93d..fb23965595d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,55 +29,164 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -static void cfq_dtor(struct io_context *ioc) +/* + * Releasing ioc may nest into another put_io_context() leading to nested + * fast path release. As the ioc's can't be the same, this is okay but + * makes lockdep whine. Keep track of nesting and use it as subclass. + */ +#ifdef CONFIG_LOCKDEP +#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) +#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ +#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- +#else +#define ioc_release_depth(q) 0 +#define ioc_release_depth_inc(q) do { } while (0) +#define ioc_release_depth_dec(q) do { } while (0) +#endif + +/* + * Slow path for ioc release in put_io_context(). Performs double-lock + * dancing to unlink all cic's and then frees ioc. + */ +static void ioc_release_fn(struct work_struct *work) { - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + struct io_context *ioc = container_of(work, struct io_context, + release_work); + struct request_queue *last_q = NULL; + + spin_lock_irq(&ioc->lock); + + while (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first, + struct cfq_io_context, + cic_list); + struct request_queue *this_q = cic->q; + + if (this_q != last_q) { + /* + * Need to switch to @this_q. Once we release + * @ioc->lock, it can go away along with @cic. + * Hold on to it. + */ + __blk_get_queue(this_q); + + /* + * blk_put_queue() might sleep thanks to kobject + * idiocy. Always release both locks, put and + * restart. + */ + if (last_q) { + spin_unlock(last_q->queue_lock); + spin_unlock_irq(&ioc->lock); + blk_put_queue(last_q); + } else { + spin_unlock_irq(&ioc->lock); + } + + last_q = this_q; + spin_lock_irq(this_q->queue_lock); + spin_lock(&ioc->lock); + continue; + } + ioc_release_depth_inc(this_q); + cic->exit(cic); + cic->release(cic); + ioc_release_depth_dec(this_q); + } - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->dtor(ioc); + if (last_q) { + spin_unlock(last_q->queue_lock); + spin_unlock_irq(&ioc->lock); + blk_put_queue(last_q); + } else { + spin_unlock_irq(&ioc->lock); } + + kmem_cache_free(iocontext_cachep, ioc); } /** * put_io_context - put a reference of io_context * @ioc: io_context to put + * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. + * zero. If the caller is holding queue_lock of a queue, it can indicate + * that with @locked_q. 
This is an optimization hint and the caller is + * allowed to pass in %NULL even when it's holding a queue_lock. */ -void put_io_context(struct io_context *ioc) +void put_io_context(struct io_context *ioc, struct request_queue *locked_q) { + struct request_queue *last_q = locked_q; + unsigned long flags; + if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + if (locked_q) + lockdep_assert_held(locked_q->queue_lock); if (!atomic_long_dec_and_test(&ioc->refcount)) return; - rcu_read_lock(); - cfq_dtor(ioc); - rcu_read_unlock(); - - kmem_cache_free(iocontext_cachep, ioc); -} -EXPORT_SYMBOL(put_io_context); + /* + * Destroy @ioc. This is a bit messy because cic's are chained + * from both ioc and queue, and ioc->lock nests inside queue_lock. + * The inner ioc->lock should be held to walk our cic_list and then + * for each cic the outer matching queue_lock should be grabbed. + * ie. We need to do reverse-order double lock dancing. + * + * Another twist is that we are often called with one of the + * matching queue_locks held as indicated by @locked_q, which + * prevents performing double-lock dance for other queues. + * + * So, we do it in two stages. The fast path uses the queue_lock + * the caller is holding and, if other queues need to be accessed, + * uses trylock to avoid introducing locking dependency. This can + * handle most cases, especially if @ioc was performing IO on only + * single device. + * + * If trylock doesn't cut it, we defer to @ioc->release_work which + * can do all the double-locking dancing. + */ + spin_lock_irqsave_nested(&ioc->lock, flags, + ioc_release_depth(locked_q)); + + while (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first, + struct cfq_io_context, + cic_list); + struct request_queue *this_q = cic->q; + + if (this_q != last_q) { + if (last_q && last_q != locked_q) + spin_unlock(last_q->queue_lock); + last_q = NULL; + + if (!spin_trylock(this_q->queue_lock)) + break; + last_q = this_q; + continue; + } + ioc_release_depth_inc(this_q); + cic->exit(cic); + cic->release(cic); + ioc_release_depth_dec(this_q); + } -static void cfq_exit(struct io_context *ioc) -{ - rcu_read_lock(); + if (last_q && last_q != locked_q) + spin_unlock(last_q->queue_lock); - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + spin_unlock_irqrestore(&ioc->lock, flags); - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->exit(ioc); - } - rcu_read_unlock(); + /* if no cic's left, we're done; otherwise, kick release_work */ + if (hlist_empty(&ioc->cic_list)) + kmem_cache_free(iocontext_cachep, ioc); + else + schedule_work(&ioc->release_work); } +EXPORT_SYMBOL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) @@ -92,10 +201,8 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) - cfq_exit(ioc); - - put_io_context(ioc); + atomic_dec(&ioc->nr_tasks); + put_io_context(ioc, NULL); } static struct io_context *create_task_io_context(struct task_struct *task, @@ -115,6 +222,7 @@ static struct io_context *create_task_io_context(struct task_struct *task, spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->cic_list); + INIT_WORK(&ioc->release_work, ioc_release_fn); /* try to install, somebody might already have beaten us to it */ task_lock(task); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 
e617b088c59..6cc60656040 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1778,7 +1778,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->ioc); + put_io_context(cfqd->active_cic->ioc, cfqd->queue); cfqd->active_cic = NULL; } } @@ -2812,38 +2812,6 @@ static void cfq_exit_cic(struct cfq_io_context *cic) } } -static void cfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct cfq_data *cfqd = cic_to_cfqd(cic); - - if (cfqd) { - struct request_queue *q = cfqd->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * Ensure we get a fresh copy of the ->key to prevent - * race between exiting task and queue - */ - smp_read_barrier_depends(); - if (cic->key == cfqd) - cfq_exit_cic(cic); - - spin_unlock_irqrestore(q->queue_lock, flags); - } -} - -/* - * The process that ioc belongs to has exited, we need to clean up - * and put the internal structures we have that belongs to that process. - */ -static void cfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, cfq_exit_single_io_context); -} - static struct cfq_io_context * cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { @@ -2855,8 +2823,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) cic->ttime.last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; + cic->exit = cfq_exit_cic; + cic->release = cfq_release_cic; elv_ioc_count_inc(cfq_ioc_count); } @@ -3726,7 +3694,7 @@ static void cfq_put_request(struct request *rq) BUG_ON(!cfqq->allocated[rw]); cfqq->allocated[rw]--; - put_io_context(RQ_CIC(rq)->ioc); + put_io_context(RQ_CIC(rq)->ioc, cfqq->cfqd->queue); rq->elevator_private[0] = NULL; rq->elevator_private[1] = NULL; @@ -3937,8 +3905,12 @@ static void cfq_exit_queue(struct elevator_queue *e) struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, struct cfq_io_context, queue_list); + struct io_context *ioc = cic->ioc; + spin_lock(&ioc->lock); cfq_exit_cic(cic); + cfq_release_cic(cic); + spin_unlock(&ioc->lock); } cfq_put_async_queues(cfqd); diff --git a/fs/ioprio.c b/fs/ioprio.c index 0f1b9515213..f84b380d65e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc); + put_io_context(ioc, NULL); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d1b6f4ed1f9..65c2f8c7008 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -393,6 +393,9 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif +#ifdef CONFIG_LOCKDEP + int ioc_release_depth; +#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 2c2b6da96b3..01e86312878 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,6 +3,7 @@ #include #include +#include struct cfq_queue; struct cfq_ttime { @@ -33,8 +34,8 @@ struct cfq_io_context { unsigned long changed; - void (*dtor)(struct io_context *); /* destructor */ - void (*exit)(struct io_context *); /* called on task exit */ + void (*exit)(struct cfq_io_context *); + void (*release)(struct cfq_io_context *); struct rcu_head rcu_head; }; @@ -61,6 +62,8 
@@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; void __rcu *ioc_data; + + struct work_struct release_work; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) @@ -79,7 +82,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc); +void put_io_context(struct io_context *ioc, struct request_queue *locked_q); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -87,7 +90,8 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc) { } +static inline void put_io_context(struct io_context *ioc, + struct request_queue *locked_q) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 5bcfc739bb7..2753449f203 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -887,7 +887,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); + put_io_context(new_ioc, NULL); } #endif return 0; -- cgit v1.2.3-70-g09d2 From 54848d73f9f254631303d6eab9b976855988b266 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 5 Apr 2011 13:21:19 -0600 Subject: writeback: charge leaked page dirties to active tasks It is a years-long problem that a large number of short-lived dirtiers (eg. gcc instances in a fast kernel build) may starve long-run dirtiers (eg. dd) as well as push the dirty pages to the global hard limit. The solution is to charge the pages dirtied by the exited gcc to the other random dirtying tasks. This is not perfect, but it should behave well enough in practice: since throttled tasks aren't actually running, the tasks that are running are more likely to pick up the leaked count and get throttled, promoting an equal spread. Randy: fix compile error: 'dirty_throttle_leaks' undeclared in exit.c Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Randy Dunlap Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 2 ++ kernel/exit.c | 3 +++ mm/page-writeback.c | 27 +++++++++++++++++++++++++++ 3 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a378c295851..05eaf5e3aad 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,8 @@ #include #include +DECLARE_PER_CPU(int, dirty_throttle_leaks); + /* * The 1/4 region under the global dirty thresh is for smooth dirty throttling: * diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f87..d4aac24cc46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1037,6 +1038,8 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch().
*/ tsk->state = TASK_DEAD; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 50f08241f98..619c445fc03 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1214,6 +1214,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) static DEFINE_PER_CPU(int, bdp_ratelimits); +/* + * Normal tasks are throttled by + * loop { + * dirty tsk->nr_dirtied_pause pages; + * take a snap in balance_dirty_pages(); + * } + * However there is a worst case. If every task exit immediately when dirtied + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be + * called to throttle the page dirties. The solution is to save the not yet + * throttled page dirties in dirty_throttle_leaks on task exit and charge them + * randomly into the running tasks. This works well for the above worst case, + * as the new task will pick up and accumulate the old task's leaked dirty + * count and eventually get throttled. + */ +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -1261,6 +1277,17 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, ratelimit = 0; } } + /* + * Pick up the dirtied pages by the exited tasks. This avoids lots of + * short-lived tasks (eg. gcc invocations in a kernel build) escaping + * the dirty throttling and livelock other long-run dirtiers. + */ + p = &__get_cpu_var(dirty_throttle_leaks); + if (*p > 0 && current->nr_dirtied < ratelimit) { + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); + *p -= nr_pages_dirtied; + current->nr_dirtied += nr_pages_dirtied; + } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) -- cgit v1.2.3-70-g09d2 From 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 19:25:42 -0600 Subject: writeback: dirty ratelimit - think time compensation Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be honored accurately. think time := time spent outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (resulting in a negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hit.
Pseudo code:

	period = pages_dirtied / task_ratelimit;
	think = jiffies - dirty_paused_when;
	pause = period - think;

1) normal case: period > think

	pause = period - think
	dirty_paused_when = jiffies + pause
	nr_dirtied = 0

	                    period time
	      |===============================>|
	          think time      pause time
	      |===============>|==============>|
	------|----------------|---------------|------------------------
	dirty_paused_when   jiffies

2) no pause case: period <= think

	don't pause; reduce future pause time by:
	dirty_paused_when += period
	nr_dirtied = 0

	                   period time
	      |===============================>|
	                          think time
	      |===================================================>|
	------|--------------------------------+-------------------|----
	dirty_paused_when                      jiffies

Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/sched.h | 1 + include/trace/events/writeback.h | 14 +++++++++++--- kernel/fork.c | 1 + mm/page-writeback.c | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc..984c3b29597 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,7 @@ struct task_struct { */ int nr_dirtied; int nr_dirtied_pause; + unsigned long dirty_paused_when; /* start of a write-and-pause period */ #ifdef CONFIG_LATENCYTOP int latency_record_count; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 99d1d0decf8..8588a891802 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, + unsigned long period, long pause, unsigned long start_time), TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, - dirtied, pause, start_time), + dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) @@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) + __field(unsigned long, period) + __field( long, think) ), TP_fast_assign( @@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->dirty_paused_when == 0 ? 0 : + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; + __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; ), @@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld", + "paused=%lu pause=%ld period=%lu think=%ld", __entry->bdi, __entry->limit, __entry->setpoint, @@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ - __entry->pause /* ms */ + __entry->pause, /* ms */ + __entry->period, /* ms */ + __entry->think /* ms */ ) ); diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..f8668cf6a32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96b3e7aa705..49193215582 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + long period; long pause = 0; long uninitialized_var(max_pause); bool dirty_exceeded = false; @@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long start_time = jiffies; for (;;) { + unsigned long now = jiffies; + /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); - if (nr_dirty <= freerun) + if (nr_dirty <= freerun) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; break; + } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); @@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping, task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; if (unlikely(task_ratelimit == 0)) { + period = max_pause; pause = max_pause; goto pause; } - pause = HZ * pages_dirtied / task_ratelimit; + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. + */ if (unlikely(pause <= 0)) { trace_balance_dirty_pages(bdi, dirty_thresh, @@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping, dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } pause = 1; /* avoid resetting nr_dirtied_pause below */ break; } @@ -1135,11 +1160,15 @@ pause: dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + /* * This is typically equal to (nr_dirty < dirty_thresh) and can * also keep "1000+ dd on a slow USB stick" under control. @@ -1167,11 +1196,10 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - current->nr_dirtied = 0; if (pause == 0) { /* in freerun area */ current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (pause <= max_pause / 4 && + } else if (period <= max_pause / 4 && pages_dirtied >= current->nr_dirtied_pause) { current->nr_dirtied_pause = clamp_val( dirty_ratelimit * (max_pause / 2) / HZ, -- cgit v1.2.3-70-g09d2 From 30fb6aa74011dcf595f306ca2727254d708b786e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 5 Dec 2011 18:22:48 +0100 Subject: ftrace: Fix unregister ftrace_ops accounting Multiple users of the function tracer can register their functions with the ftrace_ops structure. The accounting within ftrace will update the counter on each function record that is being traced. 
When the ftrace_ops filtering adds or removes functions, the function records will be updated accordingly if the ftrace_ops is still registered. When an ftrace_ops is removed, the counters of the function records that the ftrace_ops traces are decremented. When they reach zero, the functions that they represent are modified to stop calling the mcount code. When changes are made, the code is updated via stop_machine() with a command passed to the function to tell it what to do. There is an ENABLE and a DISABLE command that tells the called function to enable or disable the functions. But ENABLE is really a misnomer as it should just update the records, because records that have been enabled and now have a count of zero should be disabled. The DISABLE command is used to disable all functions regardless of their counter values. This is the big off switch and is not the complement of the ENABLE command. To make matters worse, when an ftrace_ops is unregistered while another ftrace_ops is still registered, neither the DISABLE nor the ENABLE command is set when calling into the stop_machine() function, and the records will not be updated to match their counters. A command is passed to that function that will update the mcount code to call the registered callback directly if it is the only one left. This means that the ftrace_ops that is still registered will have its callback called by all functions that have been set for it as well as by those set for the ftrace_ops that was just unregistered. Here's a way to trigger this bug. Compile the kernel with CONFIG_FUNCTION_PROFILER set and with CONFIG_FUNCTION_GRAPH not set:

CONFIG_FUNCTION_PROFILER=y
# CONFIG_FUNCTION_GRAPH is not set

This will force the function profiler to use the function tracer instead of the function graph tracer.

# cd /sys/kernel/debug/tracing
# echo schedule > set_ftrace_filter
# echo function > current_tracer
# cat set_ftrace_filter
schedule
# cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 692/68108025   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
     kworker/0:2-909   [000] ....   531.235574: schedule <-worker_thread
          <idle>-0     [001] .N..   531.235575: schedule <-cpu_idle
     kworker/0:2-909   [000] ....   531.235597: schedule <-worker_thread
            sshd-2563  [001] ....   531.235647: schedule <-schedule_hrtimeout_range_clock

# echo 1 > function_profile_enabled
# echo 0 > function_profile_enabled
# cat set_ftrace_filter
schedule
# cat trace
# tracer: function
#
# entries-in-buffer/entries-written: 159701/118821262   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
          <idle>-0     [002] ...1   604.870655: local_touch_nmi <-cpu_idle
          <idle>-0     [002] d..1   604.870655: enter_idle <-cpu_idle
          <idle>-0     [002] d..1   604.870656: atomic_notifier_call_chain <-enter_idle
          <idle>-0     [002] d..1   604.870656: __atomic_notifier_call_chain <-atomic_notifier_call_chain

The same problem could have happened with the trace_probe_ops, but they are modified via the set_ftrace_filter file, which does the update when the file is closed. The simple solution is to change ENABLE to UPDATE and call it every time an ftrace_ops is unregistered.
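To make the UPDATE semantics concrete, here is a small, self-contained C sketch. It is a simplification under assumed names (struct rec and update_calls are hypothetical); the real logic lives in ftrace_check_record()/__ftrace_replace_code() in the diff below. The key property is that an update pass derives each record's desired state purely from its reference count, so a record whose count dropped to zero during an unregister is turned back into a NOP even though nothing was "disabled" globally.

	#include <stdio.h>

	/* Simplified stand-in for struct dyn_ftrace and its flag bits. */
	struct rec {
		unsigned long refcnt;	/* how many registered ops trace this site */
		int enabled;		/* 1 = mcount call patched in, 0 = NOP */
	};

	/* UPDATE: bring every record in line with its current ref count. */
	static void update_calls(struct rec *recs, int n)
	{
		for (int i = 0; i < n; i++) {
			int want = recs[i].refcnt > 0;

			if (recs[i].enabled == want)
				continue;	/* cf. FTRACE_UPDATE_IGNORE */
			recs[i].enabled = want;	/* cf. ftrace_make_call()/ftrace_make_nop() */
			printf("rec %d -> %s\n", i, want ? "call" : "nop");
		}
	}

	int main(void)
	{
		/* rec 0 is still referenced, rec 1 just lost its last ops. */
		struct rec recs[2] = { { 1, 0 }, { 0, 1 } };

		update_calls(recs, 2);	/* prints: rec 0 -> call, rec 1 -> nop */
		return 0;
	}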
Link: http://lkml.kernel.org/r/1323105776-26961-3-git-send-email-jolsa@redhat.com Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1e8943fed1..25b4f4da0fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -948,7 +948,7 @@ struct ftrace_func_probe { }; enum { - FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_START_FUNC_RET = (1 << 3), @@ -1519,7 +1519,7 @@ int ftrace_text_reserved(void *start, void *end) static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int update) { unsigned long ftrace_addr; unsigned long flag = 0UL; @@ -1527,17 +1527,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If we are enabling tracing: + * If we are updating calls: * * If the record has a ref count, then we need to enable it * because someone is using it. * * Otherwise we make sure its disabled. * - * If we are disabling tracing, then disable all records that + * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (update && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ @@ -1553,7 +1553,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_nop(NULL, rec, ftrace_addr); } -static void ftrace_replace_code(int enable) +static void ftrace_replace_code(int update) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1567,7 +1567,7 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FREE) continue; - failed = __ftrace_replace_code(rec, enable); + failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1623,7 +1623,7 @@ static int __ftrace_modify_code(void *data) */ function_trace_stop++; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); @@ -1691,7 +1691,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return -ENODEV; ftrace_start_up++; - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; /* ops marked global share the filter hashes */ if (ops->flags & FTRACE_OPS_FL_GLOBAL) { @@ -1743,8 +1743,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - if (!ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -1766,7 +1765,7 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); } static void ftrace_shutdown_sysctl(void) @@ -2919,7 +2918,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - 
ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); @@ -3107,7 +3106,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash, iter->hash); if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-70-g09d2 From c88fd8634ea68e74c7d19fd2621b4078fd22864c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Aug 2011 09:53:39 -0400 Subject: ftrace: Allow archs to modify code without stop machine The stop machine method to modify all functions in the kernel (some 20,000 of them) is the safest way to do so across all archs. But some archs may not need this big hammer approach to modify code on SMP machines, and can simply update just the code they need to. Adding a weak function arch_ftrace_update_code(), which does the stop_machine() by default, will let any arch override this method. If the arch needs to check the system and then decide if it can avoid stop machine, it can still call ftrace_run_stop_machine() to use the old method. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 31 ++++++ kernel/trace/ftrace.c | 253 +++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 246 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 26eafcef75b..4f0b6fec379 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -133,6 +133,8 @@ struct ftrace_func_command { int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); +void ftrace_bug(int err, unsigned long ip); + struct seq_file; struct ftrace_probe_ops { @@ -190,6 +192,35 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); +enum { + FTRACE_UPDATE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), +}; + +enum { + FTRACE_UPDATE_IGNORE, + FTRACE_UPDATE_MAKE_CALL, + FTRACE_UPDATE_MAKE_NOP, +}; + +void arch_ftrace_update_code(int command); + +struct ftrace_rec_iter; + +struct ftrace_rec_iter *ftrace_rec_iter_start(void); +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); + +int ftrace_update_record(struct dyn_ftrace *rec, int enable); +int ftrace_test_record(struct dyn_ftrace *rec, int enable); +void ftrace_run_stop_machine(int command); +int ftrace_location(unsigned long ip); + +extern ftrace_func_t ftrace_trace_function; + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25b4f4da0fe..655b432fb89 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -947,13 +947,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; -enum { - FTRACE_UPDATE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), -}; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -1307,6 +1300,28 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } +/** + * ftrace_location - return
true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns 1 if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_location(unsigned long ip) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip == ip) + return 1; + } while_for_each_ftrace_rec(); + + return 0; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1475,7 +1490,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -1517,15 +1544,10 @@ int ftrace_text_reserved(void *start, void *end) return 0; } - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { - unsigned long ftrace_addr; unsigned long flag = 0UL; - ftrace_addr = (unsigned long)FTRACE_ADDR; - /* * If we are updating calls: * @@ -1537,20 +1559,74 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (update && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return 0; + return FTRACE_UPDATE_IGNORE; if (flag) { - rec->flags |= FTRACE_FL_ENABLED; + if (update) + rec->flags |= FTRACE_FL_ENABLED; + return FTRACE_UPDATE_MAKE_CALL; + } + + if (update) + rec->flags &= ~FTRACE_FL_ENABLED; + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. 
+ */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); } - rec->flags &= ~FTRACE_FL_ENABLED; - return ftrace_make_nop(NULL, rec, ftrace_addr); + return -1; /* unknow ftrace bug */ } static void ftrace_replace_code(int update) @@ -1576,6 +1652,78 @@ static void ftrace_replace_code(int update) } while_for_each_ftrace_rec(); } +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { @@ -1617,12 +1765,6 @@ static int __ftrace_modify_code(void *data) { int *command = data; - /* - * Do not call function tracer while we update the code. - * We are in stop machine, no worrying about races. - */ - function_trace_stop++; - if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1636,21 +1778,33 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. 
- */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - return 0; } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + static void ftrace_run_update_code(int command) { int ret; @@ -1659,8 +1813,31 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; + /* + * Do not call function tracer while we update the code. + * We are in stop machine. + */ + function_trace_stop++; - stop_machine(__ftrace_modify_code, &command, NULL); + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); -- cgit v1.2.3-70-g09d2 From 3208230983a0ee3d95be22d463257e530c684956 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 14:42:37 -0500 Subject: ftrace: Remove usage of "freed" records Records that are added to the function trace table are permanently there, except for modules. By separating out the modules to their own pages that can be freed in one shot we can remove the "freed" flag and simplify some of the record management. Another benefit of doing this is that we can also move the records around; sort them. 
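To illustrate why grouping records into per-module pages removes the need for a "freed" flag, here is a self-contained C sketch under assumed names (struct rec_page, add_page, and release_mod are hypothetical, and the owner field is an illustration; in the kernel a module's pages are contiguous in the list rather than filtered by owner). The real unlinking is done in ftrace_release_mod() in the diff below.

	#include <stdio.h>
	#include <stdlib.h>

	/* Simplified stand-in for struct ftrace_page: a linked list of record
	 * pages, where each module's records start on a fresh page. */
	struct rec_page {
		int owner;		/* 0 = core kernel, otherwise a module id */
		struct rec_page *next;
	};

	static struct rec_page *pages_head;

	static void add_page(int owner)
	{
		struct rec_page *pg = calloc(1, sizeof(*pg));

		pg->owner = owner;
		pg->next = pages_head;
		pages_head = pg;
	}

	/* Free a module's records in one shot by unlinking its pages; no
	 * per-record "freed" marker is needed and no holes are left behind. */
	static void release_mod(int owner)
	{
		struct rec_page **last = &pages_head;
		struct rec_page *pg;

		while ((pg = *last)) {
			if (pg->owner == owner) {
				*last = pg->next;
				free(pg);
			} else {
				last = &pg->next;
			}
		}
	}

	int main(void)
	{
		add_page(0);		/* core kernel records */
		add_page(1);		/* records of module 1 */
		release_mod(1);		/* module gone, core pages untouched */
		printf("remaining head owner: %d\n", pages_head->owner);
		return 0;
	}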
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 100 ++++++++++++++++++++++++------------------------- 2 files changed, 49 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4f0b6fec379..3f79bc458bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -163,7 +163,6 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_ENABLED = (1 << 30), - FTRACE_FL_FREE = (1 << 31), }; #define FTRACE_FL_MASK (0x3UL << 30) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 655b432fb89..be6888f40d2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -996,8 +996,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; - static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1421,32 +1419,8 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - rec->freelist = ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - struct dyn_ftrace *rec; - - /* First check for freed records */ - if (ftrace_free_records) { - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - FTRACE_WARN_ON_ONCE(1); - ftrace_free_records = NULL; - return NULL; - } - - ftrace_free_records = rec->freelist; - memset(rec, 0, sizeof(*rec)); - return rec; - } - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) { /* allocate another page */ @@ -1639,10 +1613,6 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - /* Skip over free records */ - if (rec->flags & FTRACE_FL_FREE) - continue; - failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); @@ -2007,11 +1977,8 @@ static int ftrace_update_code(struct module *mod) * Do the initial record conversion from mcount jump * to the NOP instructions. 
*/ - if (!ftrace_code_disable(mod, p)) { - ftrace_free_rec(p); - /* Game over */ + if (!ftrace_code_disable(mod, p)) break; - } ftrace_update_cnt++; @@ -2026,10 +1993,8 @@ static int ftrace_update_code(struct module *mod) */ if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); - if (failed) { + if (failed) ftrace_bug(failed, p->ip); - ftrace_free_rec(p); - } } } @@ -2223,9 +2188,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FREE) || - - ((iter->flags & FTRACE_ITER_FILTER) && + if (((iter->flags & FTRACE_ITER_FILTER) && !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && @@ -2602,7 +2565,6 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { ret = enter_record(hash, rec, not); if (ret < 0) { @@ -3446,9 +3408,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FREE) - continue; - if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; @@ -3566,6 +3525,27 @@ static int ftrace_process_locs(struct module *mod, unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (mod) { + if (!ftrace_pages) + return -ENOMEM; + + /* + * If the last page was full, it will be + * allocated anyway. + */ + if (ftrace_pages->index != ENTRIES_PER_PAGE) { + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return -ENOMEM; + ftrace_pages = ftrace_pages->next; + } + } + p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -3599,9 +3579,13 @@ static int ftrace_process_locs(struct module *mod, } #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; + struct ftrace_page **last_pg; struct ftrace_page *pg; mutex_lock(&ftrace_lock); @@ -3609,16 +3593,30 @@ void ftrace_release_mod(struct module *mod) if (ftrace_disabled) goto out_unlock; - do_for_each_ftrace_rec(pg, rec) { + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; if (within_module_core(rec->ip, mod)) { /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. + * As core pages are first, the first + * page should never be a module page. 
*/ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + free_page((unsigned long)pg); + } else + last_pg = &pg->next; + } out_unlock: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-70-g09d2 From a79008755497daff157f5294c02e3b940641cc11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:23:44 -0500 Subject: ftrace: Allocate the mcount record pages as groups Allocate the mcount record pages as a group of pages as big as can be allocated and waste no more than a single page. Grouping the mcount pages as much as possible helps with cache locality, as we do not need to redirect with descriptors as we cross from page to page. It also allows us to do more with the records later on (sort them with bigger benefits). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 179 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index be6888f40d2..2e7218869fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -983,12 +983,13 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; + struct dyn_ftrace *records; int index; - struct dyn_ftrace records[]; + int size; }; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) /* estimate from running different kernels */ #define NR_TO_INIT 10000 @@ -1421,14 +1422,10 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) { - /* allocate another page */ - ftrace_pages->next = - (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return NULL; - } + if (ftrace_pages->index == ftrace_pages->size) { + /* We should have allocated enough */ + if (WARN_ON(!ftrace_pages->next)) + return NULL; ftrace_pages = ftrace_pages->next; } @@ -2005,47 +2002,106 @@ static int ftrace_update_code(struct module *mod) return 0; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count) { - struct ftrace_page *pg; + int order; int cnt; - int i; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. + * We want to fill as much as possible. No more than a page + * may be empty. 
*/ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; - pg = ftrace_pages = ftrace_pages_start; + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; - /* If we fail, we'll try later anyway */ - if (!pg->next) + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) break; + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + pg = pg->next; } + return start_pg; + + free_pages: + while (start_pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; +} + +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + int cnt; + + if (!num_to_init) { + pr_info("ftrace: No functions to be traced?\n"); + return -1; + } + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + return 0; } @@ -3520,30 +3576,45 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg; + unsigned long count; unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + pg = ftrace_allocate_pages(count); + if (!pg) + return -ENOMEM; mutex_lock(&ftrace_lock); + /* * Core and each module needs their own pages, as * modules will free them when they are removed. * Force a new page to be allocated for modules. */ - if (mod) { + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = pg; + } else { if (!ftrace_pages) - return -ENOMEM; + goto out; - /* - * If the last page was full, it will be - * allocated anyway. - */ - if (ftrace_pages->index != ENTRIES_PER_PAGE) { - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return -ENOMEM; - ftrace_pages = ftrace_pages->next; + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? 
*/ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; } + + ftrace_pages->next = pg; + ftrace_pages = pg; } p = start; @@ -3557,7 +3628,8 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - ftrace_record_ip(addr); + if (!ftrace_record_ip(addr)) + break; } /* @@ -3573,9 +3645,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_update_code(mod); if (!mod) local_irq_restore(flags); + ret = 0; + out: mutex_unlock(&ftrace_lock); - return 0; + return ret; } #ifdef CONFIG_MODULES @@ -3587,6 +3661,7 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page **last_pg; struct ftrace_page *pg; + int order; mutex_lock(&ftrace_lock); @@ -3613,7 +3688,9 @@ void ftrace_release_mod(struct module *mod) ftrace_pages = next_to_ftrace_page(last_pg); *last_pg = pg->next; - free_page((unsigned long)pg); + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); } else last_pg = &pg->next; } -- cgit v1.2.3-70-g09d2 From 85ae32ae019bc1c2cc22e5f51fe0c9f2812ef68c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:30:31 -0500 Subject: ftrace: Replace record newlist with record page list As new functions come in to be initalized from mcount to nop, they are done by groups of pages. Whether it is the core kernel or a module. There's no need to keep track of these on a per record basis. At startup, and as any module is loaded, the functions to be traced are stored in a group of pages and added to the function list at the end. We just need to keep a pointer to the first page of the list that was added, and use that to know where to start on the list for initializing functions. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 5 +--- kernel/trace/ftrace.c | 68 +++++++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3f79bc458bf..31b9fd7aedc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -173,10 +173,7 @@ struct dyn_ftrace { unsigned long ip; /* address of mcount call-site */ struct dyn_ftrace *freelist; }; - union { - unsigned long flags; - struct dyn_ftrace *newlist; - }; + unsigned long flags; struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e7218869fe..366d7881f18 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -977,8 +977,6 @@ static struct ftrace_ops global_ops = { .filter_hash = EMPTY_HASH, }; -static struct dyn_ftrace *ftrace_new_addrs; - static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { @@ -988,6 +986,8 @@ struct ftrace_page { int size; }; +static struct ftrace_page *ftrace_new_pgs; + #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -1445,8 +1445,6 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->newlist = ftrace_new_addrs; - ftrace_new_addrs = rec; return rec; } @@ -1936,9 +1934,11 @@ static int ops_traces_mod(struct ftrace_ops *ops) static int ftrace_update_code(struct module *mod) { + struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + int i; /* * When adding a module, we need to check if tracers are @@ -1960,41 +1960,44 @@ static int ftrace_update_code(struct module *mod) start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - while (ftrace_new_addrs) { + for (pg = ftrace_new_pgs; pg; 
pg = pg->next) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; + for (i = 0; i < pg->index; i++) { + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - p = ftrace_new_addrs; - ftrace_new_addrs = p->newlist; - p->flags = ref; + p = &pg->records[i]; + p->flags = ref; - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) - break; + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; - ftrace_update_cnt++; + ftrace_update_cnt++; - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p->ip); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p->ip); + } } } + ftrace_new_pgs = NULL; + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; @@ -3632,6 +3635,9 @@ static int ftrace_process_locs(struct module *mod, break; } + /* These new locations need to be initialized */ + ftrace_new_pgs = pg; + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3-70-g09d2 From 68950619f8c82e468d8976130462617abea605a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 17:06:45 -0500 Subject: ftrace: Sort the mcount records on each page Sort records by ip locations of the ftrace mcount calls on each of the set of pages in the function list. This helps in localizing cache usuage when updating the function locations, as well as gives us the ability to quickly find an ip location in the list. 
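The per-page ordering can be exercised in plain user space with libc's
qsort(); this is an illustrative sketch only (the kernel instead uses
its own sort() with a swap callback, as the diff below shows), but the
comparison rule is the same:

	#include <stdio.h>
	#include <stdlib.h>

	struct rec { unsigned long ip; };

	/* Same ordering rule as the kernel's ftrace_cmp_recs comparator. */
	static int cmp_recs(const void *a, const void *b)
	{
		const struct rec *ra = a, *rb = b;

		if (ra->ip > rb->ip)
			return 1;
		if (ra->ip < rb->ip)
			return -1;
		return 0;
	}

	int main(void)
	{
		struct rec recs[] = { { 0x4000 }, { 0x1000 }, { 0x3000 }, { 0x2000 } };
		size_t i, n = sizeof(recs) / sizeof(recs[0]);

		qsort(recs, n, sizeof(recs[0]), cmp_recs);
		for (i = 0; i < n; i++)
			printf("0x%lx\n", recs[i].ip);
		return 0;
	}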
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 366d7881f18..2d6f8bcd188 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -3575,6 +3576,29 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } +static void ftrace_swap_recs(void *a, void *b, int size) +{ + struct dyn_ftrace *reca = a; + struct dyn_ftrace *recb = b; + struct dyn_ftrace t; + + t = *reca; + *reca = *recb; + *recb = t; +} + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -3638,6 +3662,11 @@ static int ftrace_process_locs(struct module *mod, /* These new locations need to be initialized */ ftrace_new_pgs = pg; + /* Make each individual set of pages sorted by ips */ + for (; pg; pg = pg->next) + sort(pg->records, pg->index, sizeof(struct dyn_ftrace), + ftrace_cmp_recs, ftrace_swap_recs); + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3-70-g09d2 From 5855fead9cc358adebd6bdeec202d040c623ae38 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 19:27:42 -0500 Subject: ftrace: Use bsearch to find record ip Now that each set of pages in the function list are sorted by ip, we can use bsearch to find a record within each set of pages. This speeds up the ftrace_location() function by magnitudes. For archs (like x86) that need to add a breakpoint at every function that will be converted from a nop to a callback and vice versa, the breakpoint callback needs to know if the breakpoint was for ftrace or not. It requires finding the breakpoint ip within the records. Doing a linear search is extremely inefficient. It is a must to be able to do a fast binary search to find these locations. 
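The lookup that this enables is the textbook bsearch() pattern; a
stand-alone sketch (illustrative only) of searching one sorted set of
records, which is what ftrace_location() now does per page in the diff
below. Each set of pages is sorted on its own, so the kernel runs one
binary search per page group:

	#include <stdio.h>
	#include <stdlib.h>

	struct rec { unsigned long ip; };

	static int cmp_recs(const void *a, const void *b)
	{
		const struct rec *ra = a, *rb = b;

		if (ra->ip > rb->ip)
			return 1;
		if (ra->ip < rb->ip)
			return -1;
		return 0;
	}

	int main(void)
	{
		/* Must already be sorted by ip for bsearch() to work. */
		struct rec recs[] = { { 0x1000 }, { 0x2000 }, { 0x3000 }, { 0x4000 } };
		struct rec key = { 0x3000 };
		struct rec *hit;

		hit = bsearch(&key, recs, 4, sizeof(recs[0]), cmp_recs);
		printf("0x3000 %s a traced location\n", hit ? "is" : "is not");
		return 0;
	}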
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d6f8bcd188..dcd3a814d39 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1300,6 +1301,19 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + /** * ftrace_location - return true if the ip giving is a traced location * @ip: the instruction pointer to check @@ -1313,11 +1327,17 @@ int ftrace_location(unsigned long ip) { struct ftrace_page *pg; struct dyn_ftrace *rec; + struct dyn_ftrace key; - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip == ip) + key.ip = ip; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) return 1; - } while_for_each_ftrace_rec(); + } return 0; } @@ -3587,18 +3607,6 @@ static void ftrace_swap_recs(void *a, void *b, int size) *recb = t; } -static int ftrace_cmp_recs(const void *a, const void *b) -{ - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; - - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) - return -1; - return 0; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) -- cgit v1.2.3-70-g09d2 From c842e975520f8ab09e293cc92f51a1f396251fd5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 18:44:44 -0500 Subject: ftrace: Fix ftrace hash record update with notrace When disabling the "notrace" records, that means we want to trace them. If the notrace_hash is zero, it means that we want to trace all records. But to disable a zero notrace_hash means nothing. The check for the notrace_hash count was incorrect with: if (hash && !hash->count) return With the correct comment above it that states that we do nothing if the notrace_hash has zero count. But !hash also means that the notrace hash has zero count. I think this was done to protect against dereferencing NULL. But if !hash is true, then we go through the following loop without doing a single thing. Fix it to: if (!hash || !hash->count) return; Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcd3a814d39..a383d6c67bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1381,7 +1381,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (hash && !hash->count) + if (!hash || !hash->count) return; } -- cgit v1.2.3-70-g09d2 From 06a51d9307380c78bb5c92e68fc80ad2c7d7f890 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 19:07:36 -0500 Subject: ftrace: Create ftrace_hash_empty() helper routine There are two types of hashes in the ftrace_ops; one type is the filter_hash and the other is the notrace_hash. Either one may be null, meaning it has no elements. But when elements are added, the hash is allocated. 
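In miniature, the helper this change converges on looks like the
following user-space sketch (names invented for the example); the NULL
test must come first so an absent hash is never dereferenced:

	#include <stdbool.h>
	#include <stdio.h>

	struct hash { int count; };

	static bool hash_empty(const struct hash *h)
	{
		return !h || !h->count;	/* !h first: never dereference NULL */
	}

	int main(void)
	{
		struct hash zero = { 0 };

		printf("%d %d\n", hash_empty(NULL), hash_empty(&zero)); /* 1 1 */
		return 0;
	}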
Throughout the code, a check needs to be made to see if a hash exists or the hash has elements, but the check if the hash exists is usually missing causing the possible "NULL pointer dereference bug". Add a helper routine called "ftrace_hash_empty()" that returns true if the hash doesn't exist or its count is zero. As they mean the same thing. Last-bug-reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a383d6c67bf..e1ee07f81ca 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -999,6 +999,11 @@ static struct ftrace_page *ftrace_new_pgs; static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; +static bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1007,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) struct hlist_head *hhd; struct hlist_node *n; - if (!hash->count) + if (ftrace_hash_empty(hash)) return NULL; if (hash->size_bits > 0) @@ -1151,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; /* Empty hash? */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return new_hash; size = 1 << hash->size_bits; @@ -1276,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); - if ((!filter_hash || !filter_hash->count || + if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && - (!notrace_hash || !notrace_hash->count || + (ftrace_hash_empty(notrace_hash) || !ftrace_lookup_ip(notrace_hash, ip))) ret = 1; else @@ -1371,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; @@ -1381,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return; } @@ -1398,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1407,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash && in_hash && !in_other_hash) match = 1; else if (!filter_hash && in_hash && - (in_other_hash || !other_hash->count)) + (in_other_hash || ftrace_hash_empty(other_hash))) match = 1; } if (!match) @@ -1950,7 +1955,7 @@ static int ops_traces_mod(struct ftrace_ops *ops) struct ftrace_hash *hash; hash = ops->filter_hash; - return !!(!hash || !hash->count); + return ftrace_hash_empty(hash); } static int ftrace_update_code(struct module *mod) @@ -2320,7 +2325,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. 
*/ - if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; -- cgit v1.2.3-70-g09d2 From fc13cb0ce45296f331263a6034aa1814203e1ac3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:41:25 -0500 Subject: ftrace: Allow other users of function tracing to use the output listing The function tracer is set up to allow any other subsystem (like perf) to use it. Ftrace already has a way to list what functions are enabled by the global_ops. It would be very helpful to let other users of the function tracer to be able to use the same code. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 31b9fd7aedc..aa7559f0a22 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -202,6 +202,14 @@ enum { FTRACE_UPDATE_MAKE_NOP, }; +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_HASH = (1 << 3), + FTRACE_ITER_ENABLED = (1 << 4), +}; + void arch_ftrace_update_code(int command); struct ftrace_rec_iter; @@ -217,6 +225,15 @@ int ftrace_location(unsigned long ip); extern ftrace_func_t ftrace_trace_function; +int ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file); +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); +int ftrace_regex_release(struct inode *inode, struct file *file); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -311,6 +328,24 @@ static inline int ftrace_text_reserved(void *start, void *end) { return 0; } + +/* + * Again users of functions that have ftrace_ops may not + * have them defined when ftrace is not enabled, but these + * functions may still be called. Use a macro instead of inline. 
+ */ +#define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) + +static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) { return -ENODEV; } +static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) { return -ENODEV; } +static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + return -ENODEV; +} +static inline int +ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e1ee07f81ca..5b105c5ddc0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2134,14 +2134,6 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) return 0; } -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), -}; - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2249,7 +2241,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -2305,7 +2297,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -2414,6 +2406,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2442,6 +2435,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) iter->pg = ftrace_pages_start; iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2462,7 +2456,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) mutex_unlock(&ftrace_lock); } -static int +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * ftrace_regex_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). 
+ */ +int ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { @@ -2542,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static loff_t +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -3095,14 +3105,14 @@ out_unlock: return ret; } -static ssize_t +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return ftrace_regex_write(file, ubuf, cnt, ppos, 1); } -static ssize_t +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3292,8 +3302,7 @@ static void __init set_ftrace_early_filters(void) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } -static int -ftrace_regex_release(struct inode *inode, struct file *file) +int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; -- cgit v1.2.3-70-g09d2 From 69a3083c4a7df0322d97bb2b43a33cb12af8131a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 15:21:16 -0500 Subject: ftrace: Decouple hash items from showing filtered functions The set_ftrace_filter shows "hashed" functions, which are functions that are added with operations to them (like traceon and traceoff). As other subsystems may be able to show what functions they are using for function tracing, the hash items should no longer be shown just because the FILTER flag is set. As they have nothing to do with other subsystems filters. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 5 +++-- kernel/trace/ftrace.c | 16 ++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index aa7559f0a22..d1ff0de1897 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -206,8 +206,9 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), + FTRACE_ITER_DO_HASH = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_ENABLED = (1 << 5), }; void arch_ftrace_update_code(int command); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b105c5ddc0..5728d9aa632 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2198,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + if (iter->func_pos > *pos) return NULL; @@ -2343,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p) { - if (iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); - - return NULL; - } + if (!p) + return t_hash_start(m, pos); return iter; } @@ -2541,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, - inode, file); + return ftrace_regex_open(&global_ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); } static int -- cgit v1.2.3-70-g09d2 From d2d45c7a03a2b1a14159cbb665e9dd60991a7d4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:44:09 -0500 Subject: tracing: Have stack_tracer use a separate list of functions The stack_tracer is used to look at every function and check if the current stack is bigger than the last recorded max stack size. 
When a new max is found, then it saves that stack off. Currently the stack tracer is limited by the global_ops of the function tracer. As the stack tracer has nothing to do with the ftrace function tracer, except that it uses it as its internal engine, the stack tracer should have its own list. A new file is added to the tracing debugfs directory called: stack_trace_filter that can be used to select which functions you want to check the stack on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d9..0398b7c7afd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,7 +133,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t @@ -311,6 +310,21 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -358,6 +372,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); + trace_create_file("stack_trace_filter", 0444, d_tracer, + NULL, &stack_trace_filter_fops); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3-70-g09d2 From 2a85a37f168d2b4d74d493b578af4dc9032be92e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 21:57:44 -0500 Subject: ftrace: Allow access to the boot time function enabling Change set_ftrace_early_filter() to ftrace_set_early_filter() and make it a global function. This will allow other subsystems in the kernel to be able to enable function tracing at start up and reuse the ftrace function parsing code. 
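A hedged sketch of what such a subsystem could look like, using only
interfaces shown in this series (the ops, callback, and filter string
here are hypothetical names; compare the stack tracer patch that
follows, which does exactly this):

	#include <linux/ftrace.h>
	#include <linux/init.h>

	static void my_trace_call(unsigned long ip, unsigned long parent_ip)
	{
		/* runs for every function left in by the filter */
	}

	static struct ftrace_ops my_ops __read_mostly = {
		.func = my_trace_call,
	};

	/* e.g. filled in from a kernel command-line parameter */
	static char my_filter_buf[] __initdata = "schedule,vfs_read";

	static __init int my_tracer_init(void)
	{
		if (my_filter_buf[0])
			ftrace_set_early_filter(&my_ops, my_filter_buf, 1);

		return register_ftrace_function(&my_ops);
	}
	device_initcall(my_tracer_init);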
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d1ff0de1897..41df6f50165 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -235,6 +235,9 @@ ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); int ftrace_regex_release(struct inode *inode, struct file *file); +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5728d9aa632..683d559a0ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3279,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init -set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; @@ -3293,9 +3293,9 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); -- cgit v1.2.3-70-g09d2 From 762e1207889b3451c50d365b741af6f9ce958886 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 22:01:00 -0500 Subject: tracing: Have stack tracing set filtered functions at boot Add stacktrace_filter= to the kernel command line that lets the user pick specific functions to check the stack on. Signed-off-by: Steven Rostedt --- Documentation/kernel-parameters.txt | 8 ++++++++ kernel/trace/trace_stack.c | 11 +++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fd5c913c33c..fde2ae06539 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2435,6 +2435,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. stacktrace [FTRACE] Enabled the stack tracer on boot up. + stacktrace_filter=[function-list] + [FTRACE] Limit the functions that the stack tracer + will trace at boot up. function-list is a comma separated + list of functions. This list can be changed at run + time by the stack_trace_filter file in the debugfs + tracing directory. Note, this enables stack tracing + and the stacktrace above is not needed. 
+ sti= [PARISC,HW] Format: Set the STI (builtin display/keyboard on the HP-PARISC diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0398b7c7afd..d4545f49242 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,9 @@ #include #include #include + +#include + #include "trace.h" #define STACK_TRACE_ENTRIES 500 @@ -352,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + static __init int enable_stacktrace(char *str) { + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; return 1; @@ -375,6 +383,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace_filter", 0444, d_tracer, NULL, &stack_trace_filter_fops); + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3-70-g09d2 From 38b78eb855409a05f9d370228bec1955e6878e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Dec 2011 14:31:35 -0800 Subject: tracing: Factorize filter creation There are four places where new filter for a given filter string is created, which involves several different steps. This patch factors those steps into create_[system_]filter() functions which in turn make use of create_filter_{start|finish}() for common parts. The only functional change is that if replace_filter_string() is requested and fails, creation fails without any side effect instead of being ignored. Note that system filter is now installed after the processing is complete which makes freeing before and then restoring filter string on error unncessary. -v2: Rebased to resolve conflict with 49aa29513e and updated both create_filter() functions to always set *filterp instead of requiring the caller to clear it to %NULL on entry. 
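On the caller side, the -v2 convention (always set *filterp) makes
error handling uniform; a sketch of the expected pattern, mirroring
what ftrace_profile_set_filter() becomes in the diff below
(example_set_filter is a made-up wrapper, not part of the patch):

	static int example_set_filter(struct ftrace_event_call *call, char *str,
				      struct event_filter **dst)
	{
		struct event_filter *filter;
		int err;

		err = create_filter(call, str, false, &filter);
		if (!err)
			*dst = filter;		/* caller now owns the filter */
		else
			__free_filter(filter);	/* may carry error info; free it */

		return err;
	}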
Link: http://lkml.kernel.org/r/1323988305-1469-2-git-send-email-tj@kernel.org Signed-off-by: Tejun Heo Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 283 +++++++++++++++++++------------------ 1 file changed, 142 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f04cc3136bd..24aee712745 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system, return -ENOMEM; } +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, filter_str, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. 
+ */ +static int create_system_filter(struct event_subsystem *system, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(system, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; - struct event_filter *tmp; int err = 0; mutex_lock(&event_mutex); @@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) { - kfree(ps); - goto out_unlock; - } - - replace_filter_string(filter, filter_string); - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) { - append_filter_err(ps, filter); - goto out; - } + err = create_filter(call, filter_string, true, &filter); - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; -out: /* * Always swap the call filter with the new filter * even if there was an error. If there was an error * in the filter, we disable the filter and show the error * string */ - tmp = call->filter; - rcu_assign_pointer(call->filter, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); + if (filter) { + struct event_filter *tmp = call->filter; + + if (!err) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + filter_disable(call); + + rcu_assign_pointer(call->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } } - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); out_unlock: mutex_unlock(&event_mutex); @@ -1811,7 +1902,6 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; int err = 0; @@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) - goto out; - - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - - /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). - */ - __free_filter(system->filter); - system->filter = filter; - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) - goto err_filter; - - err = replace_system_preds(system, ps, filter_string); - if (err) - goto err_filter; - -out: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); + err = create_system_filter(system, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). 
+ */ + __free_filter(system->filter); + system->filter = filter; + } out_unlock: mutex_unlock(&event_mutex); return err; - -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; } #ifdef CONFIG_PERF_EVENTS @@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct filter_parse_state *ps; struct ftrace_event_call *call; mutex_lock(&event_mutex); @@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_filter(); - if (!filter) { - err = PTR_ERR(filter); - goto out_unlock; - } - - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); + err = create_filter(call, filter_str, false, &filter); if (!err) event->filter = filter; - -free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - -free_filter: - if (err) + else __free_filter(filter); out_unlock: @@ -1954,43 +1991,6 @@ out_unlock: #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" -static int test_get_filter(char *filter_str, struct ftrace_event_call *call, - struct event_filter **pfilter) -{ - struct event_filter *filter; - struct filter_parse_state *ps; - int err = -ENOMEM; - - filter = __alloc_filter(); - if (!filter) - goto out; - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) - *pfilter = filter; - - free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - - free_filter: - if (err) - __free_filter(filter); - - out: - return err; -} - #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ @@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = test_get_filter(d->filter, &event_ftrace_test_filter, - &filter); + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); + __free_filter(filter); break; } -- cgit v1.2.3-70-g09d2 From b7e724d303b684655e4ca3dabd5a6840ad19012d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: reverse arguments to security_capable security_capable takes ns, cred, cap. But the LSM capable() hook takes cred, ns, cap. The capability helper functions also take cred, ns, cap. Rather than flip argument order just to flip it back, leave them alone. Heck, this should be a little faster since argument will be in the right place! 
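A small sketch of a check under the new convention, cred first, then
namespace, then capability, mirroring the kernel/capability.c hunk
below (example_priv_check is an invented name for illustration):

	/* Returns true if the current task may perform the privileged action. */
	static bool example_priv_check(void)
	{
		return security_capable(current_cred(), &init_user_ns,
					CAP_SYS_ADMIN) == 0;
	}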
Signed-off-by: Eric Paris --- drivers/pci/pci-sysfs.c | 2 +- include/linux/security.h | 6 +++--- kernel/capability.c | 2 +- security/security.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 7bcf12adced..a4457ab6134 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -431,7 +431,7 @@ pci_read_config(struct file *filp, struct kobject *kobj, u8 *data = (u8*) buf; /* Several chips lock up trying to read undefined config space */ - if (security_capable(&init_user_ns, filp->f_cred, CAP_SYS_ADMIN) == 0) { + if (security_capable(filp->f_cred, &init_user_ns, CAP_SYS_ADMIN) == 0) { size = dev->cfg_size; } else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { size = 128; diff --git a/include/linux/security.h b/include/linux/security.h index 4921163b275..ee969ff40a2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1666,7 +1666,7 @@ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -int security_capable(struct user_namespace *ns, const struct cred *cred, +int security_capable(const struct cred *cred, struct user_namespace *ns, int cap); int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap); @@ -1863,8 +1863,8 @@ static inline int security_capset(struct cred *new, return cap_capset(new, old, effective, inheritable, permitted); } -static inline int security_capable(struct user_namespace *ns, - const struct cred *cred, int cap) +static inline int security_capable(const struct cred *cred, + struct user_namespace *ns, int cap) { return cap_capable(cred, ns, cap, SECURITY_CAP_AUDIT); } diff --git a/kernel/capability.c b/kernel/capability.c index 283c529f8b1..d98392719ad 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -374,7 +374,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(ns, current_cred(), cap) == 0) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } diff --git a/security/security.c b/security/security.c index 9ae68c64455..b9e57f4fc44 100644 --- a/security/security.c +++ b/security/security.c @@ -154,7 +154,7 @@ int security_capset(struct cred *new, const struct cred *old, effective, inheritable, permitted); } -int security_capable(struct user_namespace *ns, const struct cred *cred, +int security_capable(const struct cred *cred, struct user_namespace *ns, int cap) { return security_ops->capable(cred, ns, cap, SECURITY_CAP_AUDIT); -- cgit v1.2.3-70-g09d2 From 2920a8409de5a51575d03deca07e5bb2be6fc98d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: remove all _real_ interfaces The name security_real_capable and security_real_capable_noaudit just don't make much sense to me. Convert them to use security_capable and security_capable_noaudit. Signed-off-by: Eric Paris Acked-by: Serge E. 
Hallyn --- include/linux/security.h | 25 ------------------------- kernel/capability.c | 18 +++++++++++++++--- security/security.c | 24 ------------------------ 3 files changed, 15 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index caff54eee68..e345a9313a6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1670,10 +1670,6 @@ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap); int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, int cap); -int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, - int cap); -int security_real_capable_noaudit(struct task_struct *tsk, - struct user_namespace *ns, int cap); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); @@ -1876,27 +1872,6 @@ static inline int security_capable_noaudit(const struct cred *cred, return cap_capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); } -static inline int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap) -{ - int ret; - - rcu_read_lock(); - ret = cap_capable(__task_cred(tsk), ns, cap, SECURITY_CAP_AUDIT); - rcu_read_unlock(); - return ret; -} - -static inline -int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap) -{ - int ret; - - rcu_read_lock(); - ret = cap_capable(__task_cred(tsk), ns, cap, SECURITY_CAP_NOAUDIT); - rcu_read_unlock(); - return ret; -} - static inline int security_quotactl(int cmds, int type, int id, struct super_block *sb) { diff --git a/kernel/capability.c b/kernel/capability.c index d98392719ad..ff50ab62cfc 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -298,7 +298,11 @@ error: */ bool has_capability(struct task_struct *t, int cap) { - int ret = security_real_capable(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), &init_user_ns, cap); + rcu_read_unlock(); return (ret == 0); } @@ -317,7 +321,11 @@ bool has_capability(struct task_struct *t, int cap) bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { - int ret = security_real_capable(t, ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), ns, cap); + rcu_read_unlock(); return (ret == 0); } @@ -335,7 +343,11 @@ bool has_ns_capability(struct task_struct *t, */ bool has_capability_noaudit(struct task_struct *t, int cap) { - int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap); + rcu_read_unlock(); return (ret == 0); } diff --git a/security/security.c b/security/security.c index b7edaae77d1..8900c5c4db5 100644 --- a/security/security.c +++ b/security/security.c @@ -166,30 +166,6 @@ int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, return security_ops->capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); } -int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, - int cap) -{ - const struct cred *cred; - int ret; - - cred = get_task_cred(tsk); - ret = security_ops->capable(cred, ns, cap, SECURITY_CAP_AUDIT); - put_cred(cred); - return ret; -} - -int security_real_capable_noaudit(struct task_struct *tsk, - struct user_namespace *ns, int cap) -{ - const struct cred *cred; - int ret; - - cred = get_task_cred(tsk); - ret = 
security_ops->capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); - put_cred(cred); - return ret; -} - int security_quotactl(int cmds, int type, int id, struct super_block *sb) { return security_ops->quotactl(cmds, type, id, sb); -- cgit v1.2.3-70-g09d2 From 25e75703410a84b80623da3653db6b70282e5c6a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: call has_ns_capability from has_capability Declare the more specific has_ns_capability first in the code and then call it from has_capability. The declaration reversal isn't strictly necessary since they are both declared in header files, but it just makes sense to put more specific functions first in the code. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index ff50ab62cfc..fb815d1b9ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -287,47 +287,41 @@ error: } /** - * has_capability - Does a task have a capability in init_user_ns + * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. + * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability(struct task_struct *t, int cap) +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), &init_user_ns, cap); + ret = security_capable(__task_cred(t), ns, cap); rcu_read_unlock(); return (ret == 0); } /** - * has_capability - Does a task have a capability in a specific user ns + * has_capability - Does a task have a capability in init_user_ns * @t: The task in question - * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the specified user namespace, false if not. + * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_ns_capability(struct task_struct *t, - struct user_namespace *ns, int cap) +bool has_capability(struct task_struct *t, int cap) { - int ret; - - rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); - rcu_read_unlock(); - - return (ret == 0); + return has_ns_capability(t, &init_user_ns, cap); } /** -- cgit v1.2.3-70-g09d2 From 7b61d648499e74dbec3d4ce645675e0ae040ae78 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: introduce new has_ns_capability_noaudit For consistency in interfaces, introduce a new interface called has_ns_capability_noaudit. It checks if the given task has the given capability in the given namespace. Use this new function from has_capability_noaudit. Signed-off-by: Eric Paris Acked-by: Serge E.
Hallyn --- include/linux/capability.h | 2 ++ kernel/capability.c | 30 +++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index c4211235000..63f59fa8769 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -543,6 +543,8 @@ extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); +extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); diff --git a/kernel/capability.c b/kernel/capability.c index fb815d1b9ea..d8398e96247 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -325,27 +325,47 @@ bool has_capability(struct task_struct *t, int cap) } /** - * has_capability_noaudit - Does a task have a capability (unaudited) + * has_ns_capability_noaudit - Does a task have a capability (unaudited) + * in a specific user ns. * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to init_user_ns, false if not. Don't write an - * audit message for the check. + * currently in effect to the specified user namespace, false if not. + * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability_noaudit(struct task_struct *t, int cap) +bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap); + ret = security_capable_noaudit(__task_cred(t), ns, cap); rcu_read_unlock(); return (ret == 0); } +/** + * has_capability_noaudit - Does a task have a capability (unaudited) in the + * initial user ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + return has_ns_capability_noaudit(t, &init_user_ns, cap); +} + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3-70-g09d2 From 105ddf49cd301b7929a92f269440e8e562ef19db Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: style only - move capable below ns_capable Although the current code is fine for consistency this moves the capable code below the function it calls in the c file. It doesn't actually change code. Signed-off-by: Eric Paris Acked-by: Serge E. 
Hallyn --- kernel/capability.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index d8398e96247..5f99e5d68e1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -366,22 +366,6 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -408,6 +392,22 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + /** * task_ns_capable - Determine whether current task has a superior * capability targeted at a specific task's user namespace. -- cgit v1.2.3-70-g09d2 From d2a7009f0bb03fa22ad08dd25472efa0568126b9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: ns_capable can use the cap helpers rather than lsm call Just to reduce the number of places to change if we ever change the LSM hook, use the capability helpers internally when possible. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 5f99e5d68e1..47626446c39 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(current_cred(), ns, cap) == 0) { + if (has_ns_capability(current, ns, cap)) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3-70-g09d2 From f1c84dae0ecc51aa35c81f19a0ebcd6c0921ddcb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: remove task_ns_* functions task_ in the front of a function, in the security subsystem anyway, means to me at least, that we are operating with that task as the subject of the security decision. In this case what it means is that we are using current as the subject but we use the task to get the right namespace. Who in the world would ever realize that's what task_ns_capability means just by the name? This patch eliminates the task_ns functions entirely and uses the ns_capable function instead. This means we explicitly open code the ns in question in the caller. I think it makes the caller a LOT clearer about what is going on. Signed-off-by: Eric Paris Acked-by: Serge E.
Hallyn --- include/linux/capability.h | 1 - include/linux/cred.h | 6 ++++-- kernel/capability.c | 14 -------------- kernel/ptrace.c | 4 ++-- kernel/sched.c | 2 +- 5 files changed, 7 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 63f59fa8769..e3e8d9cb9b0 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -547,7 +547,6 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); -extern bool task_ns_capable(struct task_struct *t, int cap); extern bool nsown_capable(int cap); /* audit system wants to get cap info from files as well */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 40308969ed0..adadf71a732 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -358,10 +358,12 @@ static inline void put_cred(const struct cred *_cred) #define current_security() (current_cred_xxx(security)) #ifdef CONFIG_USER_NS -#define current_user_ns() (current_cred_xxx(user_ns)) +#define current_user_ns() (current_cred_xxx(user_ns)) +#define task_user_ns(task) (task_cred_xxx((task), user_ns)) #else extern struct user_namespace init_user_ns; -#define current_user_ns() (&init_user_ns) +#define current_user_ns() (&init_user_ns) +#define task_user_ns(task) (&init_user_ns) #endif diff --git a/kernel/capability.c b/kernel/capability.c index 47626446c39..74fb3b60304 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -408,20 +408,6 @@ bool capable(int cap) } EXPORT_SYMBOL(capable); -/** - * task_ns_capable - Determine whether current task has a superior - * capability targeted at a specific task's user namespace. - * @t: The task whose user namespace is targeted. - * @cap: The capability in question. - * - * Return true if it does, false otherwise. 
- */ -bool task_ns_capable(struct task_struct *t, int cap) -{ - return ns_capable(task_cred_xxx(t, user)->user_ns, cap); -} -EXPORT_SYMBOL(task_ns_capable); - /** * nsown_capable - Check superior capability to one's own user_ns * @cap: The capability in question diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a70d2a5d8c7..210bbf045ee 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -196,7 +196,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) + if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -266,7 +266,7 @@ static int ptrace_attach(struct task_struct *task, long request, task->ptrace = PT_PTRACED; if (seize) task->ptrace |= PT_SEIZED; - if (task_ns_capable(task, CAP_SYS_PTRACE)) + if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/kernel/sched.c b/kernel/sched.c index b50b0f0c9aa..5670028a9c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5409,7 +5409,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) + if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); -- cgit v1.2.3-70-g09d2 From 69f594a38967f4540ce7a29b3fd214e68a8330bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: ptrace: do not audit capability check when outputting /proc/pid/stat Reading /proc/pid/stat of another process checks if one has ptrace permissions on that process. If one does have permissions, it outputs some data about the process which might have security and attack implications. If the current task does not have ptrace permissions, the read still works, but those fields are filled with innocuous (0) values. Since this check and a subsequent denial is not a violation of the security policy, we should not audit such denials. This can be quite useful when removing ptrace broadly across a system without flooding the logs when ps is run or something which harmlessly walks proc. Signed-off-by: Eric Paris Acked-by: Serge E.
Hallyn --- fs/proc/array.c | 2 +- include/linux/ptrace.h | 5 +++-- kernel/ptrace.c | 12 ++++++++++-- security/selinux/hooks.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..ddffd7a88b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 800f113bea6..a27e56ca41a 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -127,8 +127,9 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -#define PTRACE_MODE_READ 1 -#define PTRACE_MODE_ATTACH 2 +#define PTRACE_MODE_READ 0x01 +#define PTRACE_MODE_ATTACH 0x02 +#define PTRACE_MODE_NOAUDIT 0x04 /* Returns 0 on success, -errno on denial. */ extern int __ptrace_may_access(struct task_struct *task, unsigned int mode); /* Returns true on success, false on denial. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 210bbf045ee..c890ac9a796 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -161,6 +161,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } +static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +{ + if (mode & PTRACE_MODE_NOAUDIT) + return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + else + return has_ns_capability(current, ns, CAP_SYS_PTRACE); +} + int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; @@ -187,7 +195,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) cred->gid == tcred->sgid && cred->gid == tcred->gid)) goto ok; - if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + if (ptrace_has_cap(tcred->user->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -196,7 +204,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c9605c4a2e0..14f94cd29c8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1809,7 +1809,7 @@ static int selinux_ptrace_access_check(struct task_struct *child, if (rc) return rc; - if (mode == PTRACE_MODE_READ) { + if (mode & PTRACE_MODE_READ) { u32 sid = current_sid(); u32 csid = task_sid(child); return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL); -- cgit v1.2.3-70-g09d2 From fd778461524849afd035679030ae8e8873c72b81 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:16 -0500 Subject: security: remove the security_netlink_recv hook as it is equivalent to capable() Once upon a time netlink was not sync and we had to get the effective capabilities from the skb that was being received. Today we instead get the capabilities from the current task. This has rendered the entire purpose of the hook moot as it is now functionally equivalent to the capable() call. 
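Every converted receive path follows the same shape; a minimal sketch, assuming a hypothetical handler name (the real call sites are in the diff below): the per-skb credential check becomes a plain check on current's credentials via capable().

	/*
	 * Hypothetical netlink receive handler, for illustration only.
	 */
	static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
	{
		/* before: if (security_netlink_recv(skb, CAP_NET_ADMIN)) */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;

		/* ... process the privileged netlink request ... */
		return 0;
	}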
Signed-off-by: Eric Paris --- drivers/scsi/scsi_netlink.c | 2 +- include/linux/security.h | 14 -------------- kernel/audit.c | 4 ++-- net/core/rtnetlink.c | 2 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/ipv4/netfilter/ip_queue.c | 2 +- net/ipv6/netfilter/ip6_queue.c | 2 +- net/netfilter/nfnetlink.c | 2 +- net/netlink/genetlink.c | 2 +- net/xfrm/xfrm_user.c | 2 +- security/capability.c | 1 - security/commoncap.c | 8 -------- security/security.c | 6 ------ security/selinux/hooks.c | 19 ------------------- 14 files changed, 10 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 26a8a45584e..feee1cc39ea 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -111,7 +111,7 @@ scsi_nl_rcv_msg(struct sk_buff *skb) goto next_msg; } - if (security_netlink_recv(skb, CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN)) { err = -EPERM; goto next_msg; } diff --git a/include/linux/security.h b/include/linux/security.h index e345a9313a6..ba2d531c123 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -95,7 +95,6 @@ struct xfrm_user_sec_ctx; struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); -extern int cap_netlink_recv(struct sk_buff *skb, int cap); void reset_security_ops(void); @@ -792,12 +791,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @skb contains the sk_buff structure for the netlink message. * Return 0 if the information was successfully saved and message * is allowed to be transmitted. - * @netlink_recv: - * Check permission before processing the received netlink message in - * @skb. - * @skb contains the sk_buff structure for the netlink message. - * @cap indicates the capability required - * Return 0 if permission is granted. * * Security hooks for Unix domain networking. * @@ -1556,7 +1549,6 @@ struct security_operations { struct sembuf *sops, unsigned nsops, int alter); int (*netlink_send) (struct sock *sk, struct sk_buff *skb); - int (*netlink_recv) (struct sk_buff *skb, int cap); void (*d_instantiate) (struct dentry *dentry, struct inode *inode); @@ -1803,7 +1795,6 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode); int security_getprocattr(struct task_struct *p, char *name, char **value); int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); -int security_netlink_recv(struct sk_buff *skb, int cap); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); @@ -2478,11 +2469,6 @@ static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb) return cap_netlink_send(sk, skb); } -static inline int security_netlink_recv(struct sk_buff *skb, int cap) -{ - return cap_netlink_recv(skb, cap); -} - static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return -EOPNOTSUPP; diff --git a/kernel/audit.c b/kernel/audit.c index 0a1355ca3d7..f3ba55fa0b7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: - if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... 
AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) + if (!capable(CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 99d9e953fe3..d3a62819671 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1931,7 +1931,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + if (kind != 2 && !capable(CAP_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 69975e0bcde..1531135130d 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -108,7 +108,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index e59aabd0eae..ffabb267471 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -430,7 +430,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); spin_lock_bh(&queue_lock); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index e63c3972a73..5e5ce778be7 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -431,7 +431,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); spin_lock_bh(&queue_lock); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 1905976b513..e6c2b8f3218 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -130,7 +130,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) const struct nfnetlink_subsystem *ss; int type, err; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) return -EPERM; /* All the messages must at least contain nfgenmsg */ diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 482fa571b4e..05fedbf489a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -516,7 +516,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - security_netlink_recv(skb, CAP_NET_ADMIN)) + !capable(CAP_NET_ADMIN)) return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0256b8a0a7c..71de86698ef 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2290,7 +2290,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || diff --git a/security/capability.c b/security/capability.c index 2984ea4f776..a2c064d1044 100644 --- a/security/capability.c +++ b/security/capability.c @@ -999,7 +999,6 @@ void __init security_fixup_ops(struct security_operations *ops) 
set_to_cap_if_null(ops, sem_semctl); set_to_cap_if_null(ops, sem_semop); set_to_cap_if_null(ops, netlink_send); - set_to_cap_if_null(ops, netlink_recv); set_to_cap_if_null(ops, d_instantiate); set_to_cap_if_null(ops, getprocattr); set_to_cap_if_null(ops, setprocattr); diff --git a/security/commoncap.c b/security/commoncap.c index 89f02ff66af..7817a763444 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -56,14 +56,6 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) return 0; } -int cap_netlink_recv(struct sk_buff *skb, int cap) -{ - if (!cap_raised(current_cap(), cap)) - return -EPERM; - return 0; -} -EXPORT_SYMBOL(cap_netlink_recv); - /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use diff --git a/security/security.c b/security/security.c index 8900c5c4db5..85481a9c563 100644 --- a/security/security.c +++ b/security/security.c @@ -922,12 +922,6 @@ int security_netlink_send(struct sock *sk, struct sk_buff *skb) return security_ops->netlink_send(sk, skb); } -int security_netlink_recv(struct sk_buff *skb, int cap) -{ - return security_ops->netlink_recv(skb, cap); -} -EXPORT_SYMBOL(security_netlink_recv); - int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return security_ops->secid_to_secctx(secid, secdata, seclen); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 14f94cd29c8..3e37d25a9bb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4713,24 +4713,6 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) return selinux_nlmsg_perm(sk, skb); } -static int selinux_netlink_recv(struct sk_buff *skb, int capability) -{ - int err; - struct common_audit_data ad; - u32 sid; - - err = cap_netlink_recv(skb, capability); - if (err) - return err; - - COMMON_AUDIT_DATA_INIT(&ad, CAP); - ad.u.cap = capability; - - security_task_getsecid(current, &sid); - return avc_has_perm(sid, sid, SECCLASS_CAPABILITY, - CAP_TO_MASK(capability), &ad); -} - static int ipc_alloc_security(struct task_struct *task, struct kern_ipc_perm *perm, u16 sclass) @@ -5459,7 +5441,6 @@ static struct security_operations selinux_ops = { .vm_enough_memory = selinux_vm_enough_memory, .netlink_send = selinux_netlink_send, - .netlink_recv = selinux_netlink_recv, .bprm_set_creds = selinux_bprm_set_creds, .bprm_committing_creds = selinux_bprm_committing_creds, -- cgit v1.2.3-70-g09d2 From 9b9fb610f6800e0db46cccd8618dd7e609c9bb5a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 10 Jan 2012 09:24:05 +0900 Subject: sched: Remove empty #ifdefs Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F0B8525.8070901@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbfd04a214..457c881873c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7136,10 +7136,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_RT_GROUP_SCHED -#else /* !CONFIG_RT_GROUP_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); @@ -7248,9 +7244,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -#endif - #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long 
to_ratio(u64 period, u64 runtime) { -- cgit v1.2.3-70-g09d2 From c6968e73b90c2a2fb9a32d4bad249f8f70f70125 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:31 -0800 Subject: PM/Hibernate: do not count debug pages as savable When debugging with CONFIG_DEBUG_PAGEALLOC and debug_guardpage_minorder > 0, we have a lot of free pages that are not marked as such. The snapshot code accounts them as savable, which causes hibernate memory preallocation to fail. It is pretty hard to make hibernate allocation succeed with debug_guardpage_minorder=1. This change at least makes it possible when the system has a relatively large amount of RAM. Signed-off-by: Stanislaw Gruszka Acked-by: Rafael J. Wysocki Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c144139..1cf88900ec4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } -- cgit v1.2.3-70-g09d2 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from the OOM killer. One problem is that it's inherited at fork(). When a daemon sets oom_score_adj and makes children, it's hard to know where the value was set. This patch adds 3 tracepoints useful for debugging: - creating a new task - renaming a task (exec) - setting oom_score_adj To debug, users need to enable the relevant trace points. Filtering may be useful, as in: # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable The output will look like this.
# grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +++ fs/proc/base.c | 3 +++ include/trace/events/oom.h | 33 ++++++++++++++++++++++++ include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 +++++ mm/oom_kill.c | 6 +++++ 6 files changed, 113 insertions(+) create mode 100644 include/trace/events/oom.h create mode 100644 include/trace/events/task.h (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7..aeb135c7ff5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f..1aab5fe05a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 00000000000..dd4ba3b9200 --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 00000000000..b53add02e92 --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM task + +#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c1..5e1391b5ade 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce..7c122faa05c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); 
spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; -- cgit v1.2.3-70-g09d2 From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal.c | 6 +----- include/linux/signal.h | 1 + kernel/signal.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb221c..46a01bdc27e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/include/linux/signal.h b/include/linux/signal.h index a822300a253..7987ce74874 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern void block_sigmask(struct k_sigaction *ka, int signr); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep; diff --git a/kernel/signal.c b/kernel/signal.c index bb0efa5705e..d532f1709fb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2318,6 +2318,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. 
Other threads should be notified now to take -- cgit v1.2.3-70-g09d2 From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:11:35 -0800 Subject: workqueue: make alloc_workqueue() take printf fmt and args for name alloc_workqueue() currently expects the passed in @name pointer to remain accessible. This is inconvenient and a bit silly given that the whole wq is being dynamically allocated. This patch updates alloc_workqueue() and friends to take printf format string instead of opaque string and matching varargs at the end. The name is allocated together with the wq and formatted. alloc_ordered_workqueue() is converted to a macro to unify varargs handling with alloc_workqueue(), and, while at it, add comment to alloc_workqueue(). None of the current in-kernel users pass in string with '%' as constant name and this change shouldn't cause any problem. [akpm@linux-foundation.org: use __printf] Signed-off-by: Tejun Heo Suggested-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 47 +++++++++++++++++++++++++++++++---------------- kernel/workqueue.c | 32 ++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0d556deb497..eb8b9f15f2e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct * -__alloc_workqueue_key(const char *name, unsigned int flags, int max_active, - struct lock_class_key *key, const char *lock_name); +__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, + struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); +/** + * alloc_workqueue - allocate a workqueue + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @args: args for @fmt + * + * Allocate a workqueue with the specified parameters. For detailed + * information on WQ_* flags, please refer to Documentation/workqueue.txt. + * + * The __lock_name macro dance is to guarantee that single lock_class_key + * doesn't end up with different namesm, which isn't allowed by lockdep. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ #ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ +#define alloc_workqueue(fmt, flags, max_active, args...) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ + if (__builtin_constant_p(fmt)) \ + __lock_name = (fmt); \ else \ - __lock_name = #name; \ + __lock_name = #fmt; \ \ - __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + &__key, __lock_name, ##args); \ }) #else -#define alloc_workqueue(name, flags, max_active) \ - __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL) +#define alloc_workqueue(fmt, flags, max_active, args...) 
\ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + NULL, NULL, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue - * @name: name of the workqueue + * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are @@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -static inline struct workqueue_struct * -alloc_ordered_workqueue(const char *name, unsigned int flags) -{ - return alloc_workqueue(name, WQ_UNBOUND | flags, 1); -} +#define alloc_ordered_workqueue(fmt, flags, args...) \ + alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a81..bec7b5b53e0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; -- cgit v1.2.3-70-g09d2 From 6b550f9495947fc279d12c38feaf98500e8d0646 Mon Sep 17 00:00:00 2001 From: "Serge E. 
Hallyn" Date: Tue, 10 Jan 2012 15:11:37 -0800 Subject: user namespace: make signal.c respect user namespaces ipc/mqueue.c: for __SI_MESQ, convert the uid being sent to recipient's user namespace. (new, thanks Oleg) __send_signal: convert current's uid to the recipient's user namespace for any siginfo which is not SI_FROMKERNEL (patch from Oleg, thanks again :) do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's user namespace ptrace_signal maps parent's uid into current's user namespace before including in signal to current. IIUC Oleg has argued that this shouldn't matter as the debugger will play with it, but it seems like not converting the value currently being set is misleading. Changelog: Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to simplify callers and help make clear what we are translating (which uid into which namespace). Passing the target task would make callers even easier to read, but we pass in user_ns because current_user_ns() != task_cred_xxx(current, user_ns). Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock in ptrace_signal(). Sep 23: In send_signal(), detect when (user) signal is coming from an ancestor or unrelated user namespace. Pass that on to __send_signal, which sets si_uid to 0 or overflowuid if needed. Oct 12: Base on Oleg's fixup_uid() patch. On top of that, handle all SI_FROMKERNEL cases at callers, because we can't assume sender is current in those cases. Nov 10: (mhelsley) rename fixup_uid to more meaningful usern_fixup_signal_uid Nov 10: (akpm) make the !CONFIG_USER_NS case clearer Signed-off-by: Serge Hallyn Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" From: Serge Hallyn Subject: __send_signal: pass q->info, not info, to userns_fixup_signal_uid (v2) Eric Biederman pointed out that passing info is a bug and could lead to a NULL pointer deref to boot. A collection of signal, securebits, filecaps, cap_bounds, and a few other ltp tests passed with this kernel. Changelog: Nov 18: previous patch missed a leading '&' Signed-off-by: Serge Hallyn Cc: "Eric W. Biederman" From: Dan Carpenter Subject: ipc/mqueue: lock() => unlock() typo There was a double lock typo introduced in b085f4bd6b21 "user namespace: make signal.c respect user namespaces" Signed-off-by: Dan Carpenter Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. 
Biederman" Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 7 ++++++- kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9a142a29074..9b7c8ab7d75 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -542,9 +543,13 @@ static void __do_notify(struct mqueue_inode_info *info) sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; + /* map current pid/uid into info->owner's namespaces */ + rcu_read_lock(); sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); - sig_i.si_uid = current_uid(); + sig_i.si_uid = user_ns_map_uid(info->user->user_ns, + current_cred(), current_uid()); + rcu_read_unlock(); kill_pid_info(info->notify.sigev_signo, &sig_i, info->notify_owner); diff --git a/kernel/signal.c b/kernel/signal.c index d532f1709fb..c73c4284160 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. 
*/ -- cgit v1.2.3-70-g09d2 From bced76aeaca03b45e3b4bdb868cada328e497847 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2012 13:11:12 +0100 Subject: sched: Fix lockup by limiting load-balance retries on lock-break Eric and David reported dead machines and traced it to commit a195f004 ("sched: Fix load-balance lock-breaking"), it turns out there's still a scenario where we can end up re-trying forever. Since there is no strict forward progress guarantee in the load-balance iteration we can get stuck re-retrying the same task-set over and over. Creating a forward progress guarantee with the existing structure is somewhat non-trivial, for now simply terminate the retry loop after a few tries. Reported-by: Eric Dumazet Tested-by: Eric Dumazet Reported-by: David Ahern [ logic cleanup as suggested by Eric ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Frederic Weisbecker Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1326297936.2442.157.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e42de9105f..84adb2d66cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) } #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -4508,7 +4510,9 @@ redo: goto out_balanced; if (lb_flags & LBF_NEED_BREAK) { - lb_flags &= ~LBF_NEED_BREAK; + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; goto redo; } -- cgit v1.2.3-70-g09d2 From 70b1e9161e903a9e1682aca3a832ed29ef876a4d Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sat, 12 Nov 2011 19:08:55 -0800 Subject: module: Add comments describing how the "strmap" logic works Signed-off-by: Kevin Cernekee Signed-off-by: Rusty Russell --- kernel/module.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 178333c48d1..cf9f1b6b326 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2193,6 +2193,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); + + /* + * info->strmap has a '1' bit for each byte of .strtab we want to + * keep resident in mod->core_strtab. Everything else in .strtab + * is unreferenced by the symbols in mod->core_symtab, and will be + * discarded when add_kallsyms() compacts the string table. + */ for (ndst = i = 1; i < nsrc; ++i, ++src) if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; @@ -2215,6 +2222,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Append room for core symbols' strings at end of core part. */ info->stroffs = mod->core_size; + + /* First strtab byte (and first symtab entry) are zeroes. 
*/ __set_bit(0, info->strmap); mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -- cgit v1.2.3-70-g09d2 From 48fd11880b5ef04270be8a87d9a9a9ee2fdae338 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Fri, 13 Jan 2012 09:32:14 +1030 Subject: module: Fix performance regression on modules with large symbol tables Looking at /proc/kallsyms, one starts to ponder whether all of the extra strtab-related complexity in module.c is worth the memory savings. Instead of making the add_kallsyms() loop even more complex, I tried the other route of deleting the strmap logic and naively copying each string into core_strtab with no consideration for consolidating duplicates. Performance on an "already exists" insmod of nvidia.ko (runs add_kallsyms() but does not actually initialize the module): Original scheme: 1.230s With naive copying: 0.058s Extra space used: 35k (of a 408k module). Signed-off-by: Kevin Cernekee Signed-off-by: Rusty Russell LKML-Reference: <73defb5e4bca04a6431392cc341112b1@localhost> --- kernel/module.c | 65 +++++++++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cf9f1b6b326..4928cffc3dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -138,7 +138,6 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long *strmap; unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; @@ -2178,12 +2177,19 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ static void layout_symtab(struct module *mod, struct load_info *info) { Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst; + unsigned int i, nsrc, ndst, strtab_size; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2194,38 +2200,23 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* - * info->strmap has a '1' bit for each byte of .strtab we want to - * keep resident in mod->core_strtab. Everything else in .strtab - * is unreferenced by the symbols in mod->core_symtab, and will be - * discarded when add_kallsyms() compacts the string table. - */ - for (ndst = i = 1; i < nsrc; ++i, ++src) + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - unsigned int j = src->st_name; - - while (!__test_and_set_bit(j, info->strmap) - && info->strtab[j]) - ++j; - ++ndst; + strtab_size += strlen(&info->strtab[src->st_name]) + 1; + ndst++; } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_size += strtab_size; /* Put string table section at end of init part of module. 
*/ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); - - /* Append room for core symbols' strings at end of core part. */ - info->stroffs = mod->core_size; - - /* First strtab byte (and first symtab entry) are zeroes. */ - __set_bit(0, info->strmap); - mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2246,22 +2237,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); mod->core_symtab = dst = mod->module_core + info->symoffs; + mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; *dst = *src; + *s++ = 0; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; + dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(info->strmap, - dst[ndst].st_name); - ++ndst; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; } mod->core_num_syms = ndst; - - mod->core_strtab = s = mod->module_core + info->stroffs; - for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) - if (test_bit(i, info->strmap)) - *++s = mod->strtab[i]; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2751,27 +2739,18 @@ static struct module *layout_and_allocate(struct load_info *info) this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, info); - - info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) * sizeof(long), GFP_KERNEL); - if (!info->strmap) { - err = -ENOMEM; - goto free_percpu; - } layout_symtab(mod, info); /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_strmap; + goto free_percpu; /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; -free_strmap: - kfree(info->strmap); free_percpu: percpu_modfree(mod); out: @@ -2781,7 +2760,6 @@ out: /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { - kfree(info->strmap); percpu_modfree(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); @@ -2911,8 +2889,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto unlink; - /* Get rid of temporary copy and strmap. */ - kfree(info.strmap); + /* Get rid of temporary copy. */ free_copy(&info); /* Done! */ -- cgit v1.2.3-70-g09d2 From bd77c04772da38fca510c81f78e51f727123b919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Jan 2012 09:32:14 +1030 Subject: module: struct module_ref should contain long fields module_ref contains two "unsigned int" fields. That's now too small, since some machines can open more than 2^32 files. Check commit 518de9b39e8 (fs: allow for more than 2^31 files) for reference. We can add an aligned(2 * sizeof(unsigned long)) attribute to force alloc_percpu() to allocate module_ref areas in single cache lines. 
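As a rough standalone illustration of what the attribute buys (plain userspace C, not part of the patch; the struct and program names here are made up): on an LP64 machine the struct below reports size 16 and alignment 16, so any allocator that honours the requested alignment keeps both counters inside a single 64-byte cache line.

#include <stdio.h>
#include <stdalign.h>

/* Illustrative mirror of the patched struct: two per-CPU counters,
 * aligned to their combined size so they cannot straddle a
 * cache-line boundary when placed at their natural alignment. */
struct module_ref_demo {
	unsigned long incs;	/* module_get() count */
	unsigned long decs;	/* module_put() count */
} __attribute__((aligned(2 * sizeof(unsigned long))));

int main(void)
{
	printf("size=%zu align=%zu\n",
	       sizeof(struct module_ref_demo),
	       alignof(struct module_ref_demo));
	return 0;
}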
Signed-off-by: Eric Dumazet CC: Rusty Russell CC: Tejun Heo CC: Robin Holt CC: David Miller Signed-off-by: Rusty Russell --- include/linux/module.h | 21 ++++++++++++++++----- kernel/debug/kdb/kdb_main.c | 2 +- kernel/module.c | 8 ++++---- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 3cb7839a60b..4598bf03e98 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -205,6 +205,20 @@ enum module_state MODULE_STATE_GOING, }; +/** + * struct module_ref - per cpu module reference counts + * @incs: number of module get on this cpu + * @decs: number of module put on this cpu + * + * We force an alignment on 8 or 16 bytes, so that alloc_percpu() + * put @incs/@decs in same cache line, with no extra memory cost, + * since alloc_percpu() is fine grained. + */ +struct module_ref { + unsigned long incs; + unsigned long decs; +} __attribute((aligned(2 * sizeof(unsigned long)))); + struct module { enum module_state state; @@ -347,10 +361,7 @@ struct module /* Destruction function. */ void (*exit)(void); - struct module_ref { - unsigned int incs; - unsigned int decs; - } __percpu *refptr; + struct module_ref __percpu *refptr; #endif #ifdef CONFIG_CONSTRUCTORS @@ -434,7 +445,7 @@ extern void __module_put_and_exit(struct module *mod, long code) #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code); #ifdef CONFIG_MODULE_UNLOAD -unsigned int module_refcount(struct module *mod); +unsigned long module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x) void symbol_put_addr(void *addr); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3c..e2ae7349437 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); + kdb_printf("%4ld ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/module.c b/kernel/module.c index 4928cffc3dc..14b8e82e05d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -725,9 +725,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) } } -unsigned int module_refcount(struct module *mod) +unsigned long module_refcount(struct module *mod) { - unsigned int incs = 0, decs = 0; + unsigned long incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) @@ -853,7 +853,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %u ", module_refcount(mod)); + seq_printf(m, " %lu ", module_refcount(mod)); /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ @@ -903,7 +903,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mk->mod)); + return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } static struct module_attribute refcnt = { -- cgit v1.2.3-70-g09d2 From 5e12416927975aa3c58394cea15db6c3e488a033 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 6 Dec 2011 12:11:31 -0700 Subject: module: replace DEBUGP with pr_debug Use more flexible pr_debug. 
This allows: echo "module module +p" > /dbg/dynamic_debug/control to turn on debug messages when needed. Signed-off-by: Jim Cromie Signed-off-by: Rusty Russell --- kernel/module.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 14b8e82e05d..b02d6335f8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -62,12 +62,6 @@ #define CREATE_TRACE_POINTS #include -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt , a...) -#endif - #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 #endif @@ -409,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, return fsa.sym; } - DEBUGP("Failed to find symbol %s\n", name); + pr_debug("Failed to find symbol %s\n", name); return NULL; } EXPORT_SYMBOL_GPL(find_symbol); @@ -599,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) list_for_each_entry(use, &b->source_list, source_list) { if (use->source == a) { - DEBUGP("%s uses %s!\n", a->name, b->name); + pr_debug("%s uses %s!\n", a->name, b->name); return 1; } } - DEBUGP("%s does not use %s!\n", a->name, b->name); + pr_debug("%s does not use %s!\n", a->name, b->name); return 0; } @@ -618,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - DEBUGP("Allocating new usage for %s.\n", a->name); + pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { printk(KERN_WARNING "%s: out of memory loading\n", a->name); @@ -662,7 +656,7 @@ static void module_unload_free(struct module *mod) mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; - DEBUGP("%s unusing %s\n", mod->name, i->name); + pr_debug("%s unusing %s\n", mod->name, i->name); module_put(i); list_del(&use->source_list); list_del(&use->target_list); @@ -760,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { - DEBUGP("Looking at refcount...\n"); + pr_debug("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); if (module_refcount(mod) == 0) break; @@ -803,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, if (mod->state != MODULE_STATE_LIVE) { /* FIXME: if (force), slam module count and wake up waiter --RR */ - DEBUGP("%s already dying\n", mod->name); + pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; } @@ -1056,7 +1050,7 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; - DEBUGP("Found checksum %lX vs module %lX\n", + pr_debug("Found checksum %lX vs module %lX\n", maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -1833,7 +1827,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. 
*/ - DEBUGP("Common symbol: %s\n", name); + pr_debug("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1841,7 +1835,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_ABS: /* Don't need to do anything */ - DEBUGP("Absolute symbol: 0x%08lx\n", + pr_debug("Absolute symbol: 0x%08lx\n", (long)sym[i].st_value); break; @@ -1965,7 +1959,7 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) info->sechdrs[i].sh_entsize = ~0UL; - DEBUGP("Core section allocation order:\n"); + pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -1977,7 +1971,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", name); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -1994,7 +1988,7 @@ static void layout_sections(struct module *mod, struct load_info *info) } } - DEBUGP("Init section allocation order:\n"); + pr_debug("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -2007,7 +2001,7 @@ static void layout_sections(struct module *mod, struct load_info *info) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", sname); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -2195,7 +2189,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, info->index.sym) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); @@ -2216,7 +2210,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2618,7 +2612,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); + pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; @@ -2636,8 +2630,8 @@ static int move_module(struct module *mod, struct load_info *info) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. 
*/ shdr->sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", - shdr->sh_addr, info->secstrings + shdr->sh_name); + pr_debug("\t0x%lx %s\n", + (long)shdr->sh_addr, info->secstrings + shdr->sh_name); } return 0; @@ -2798,7 +2792,7 @@ static struct module *load_module(void __user *umod, struct module *mod; long err; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); /* Copy in the blobs from userspace, check they are vaguely sane. */ -- cgit v1.2.3-70-g09d2 From 8487bfd954928660a52e91384a9b1f1049217e35 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 6 Dec 2011 12:11:31 -0700 Subject: kernel/params: replace DEBUGP with pr_debug Use more flexible pr_debug. This allows: echo "module params +p" > /dbg/dynamic_debug/control to turn on debug messages when needed. Signed-off-by: Jim Cromie Signed-off-by: Rusty Russell --- kernel/params.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 65aae11eb93..9240664af11 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -25,12 +25,6 @@ #include #include -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt, a...) -#endif - /* Protects all parameters, and incidentally kmalloced_param list. */ static DEFINE_MUTEX(param_lock); @@ -105,7 +99,7 @@ static int parse_one(char *param, /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; - DEBUGP("They are equal! Calling %p\n", + pr_debug("They are equal! Calling %p\n", params[i].ops->set); mutex_lock(&param_lock); err = params[i].ops->set(val, &params[i]); @@ -115,11 +109,11 @@ static int parse_one(char *param, } if (handle_unknown) { - DEBUGP("Unknown argument: calling %p\n", handle_unknown); + pr_debug("Unknown argument: calling %p\n", handle_unknown); return handle_unknown(param, val); } - DEBUGP("Unknown argument `%s'\n", param); + pr_debug("Unknown argument `%s'\n", param); return -ENOENT; } @@ -184,7 +178,7 @@ int parse_args(const char *name, { char *param, *val; - DEBUGP("Parsing ARGS: %s\n", args); + pr_debug("Parsing ARGS: %s\n", args); /* Chew leading spaces */ args = skip_spaces(args); -- cgit v1.2.3-70-g09d2 From cca3e707301862ca9b9327e6a732463982f8cd1b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 13 Jan 2012 09:32:15 +1030 Subject: modules: sysfs - export: taint, coresize, initsize Recent tools do not want to use /proc to retrieve module information. A few values are currently missing from sysfs to replace the information available in /proc/modules. This adds /sys/module/*/{coresize,initsize,taint} attributes. TAINT_PROPRIETARY_MODULE (P) and TAINT_OOT_MODULE (O) flags are both always shown now, and no longer exclude each other, also in /proc/modules. Replace the open-coded sysfs attribute initializers with the __ATTR() macro. Add the new attributes to Documentation/ABI. 
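For reference, the __ATTR() conversion is purely mechanical; taking the refcnt attribute from the diff below, the removed open-coded initializer and the added macro form declare an equivalent object:

/* Open-coded initializer, as removed by this patch: */
static struct module_attribute refcnt = {
	.attr = { .name = "refcnt", .mode = 0444 },
	.show = show_refcnt,
};

/* Equivalent declaration using the __ATTR() macro: */
static struct module_attribute modinfo_refcnt =
	__ATTR(refcnt, 0444, show_refcnt, NULL);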
Cc: Lucas De Marchi Signed-off-by: Kay Sievers Signed-off-by: Rusty Russell --- Documentation/ABI/testing/sysfs-module | 16 ++++++ kernel/module.c | 93 +++++++++++++++++++++++----------- 2 files changed, 80 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module index 9489ea8e294..47064c2b1f7 100644 --- a/Documentation/ABI/testing/sysfs-module +++ b/Documentation/ABI/testing/sysfs-module @@ -33,3 +33,19 @@ Description: Maximum time allowed for periodic transfers per microframe (μs) Beware, non-standard modes are usually not thoroughly tested by hardware designers, and the hardware can malfunction when this setting differ from default 100. + +What: /sys/module/*/{coresize,initsize} +Date: Jan 2012 +KernelVersion: 3.3 +Contact: Kay Sievers +Description: Module size in bytes. + +What: /sys/module/*/taint +Date: Jan 2012 +KernelVersion: 3.3 +Contact: Kay Sievers +Description: Module taint flags: + P - proprietary module + O - out-of-tree module + F - force-loaded module + C - staging driver module diff --git a/kernel/module.c b/kernel/module.c index b02d6335f8a..acf6ed3ebe8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -842,6 +842,26 @@ out: return ret; } +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; @@ -900,10 +920,8 @@ static ssize_t show_refcnt(struct module_attribute *mattr, return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } -static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444 }, - .show = show_refcnt, -}; +static struct module_attribute modinfo_refcnt = + __ATTR(refcnt, 0444, show_refcnt, NULL); void module_put(struct module *module) { @@ -963,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, return sprintf(buffer, "%s\n", state); } -static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444 }, - .show = show_initstate, -}; +static struct module_attribute modinfo_initstate = + __ATTR(initstate, 0444, show_initstate, NULL); static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, @@ -979,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, return count; } -struct module_attribute module_uevent = { - .attr = { .name = "uevent", .mode = 0200 }, - .store = store_uevent, -}; +struct module_attribute module_uevent = + __ATTR(uevent, 0200, NULL, store_uevent); + +static ssize_t show_coresize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->core_size); +} + +static struct module_attribute modinfo_coresize = + __ATTR(coresize, 0444, show_coresize, NULL); + +static ssize_t show_initsize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->init_size); +} + +static struct module_attribute modinfo_initsize = + __ATTR(initsize, 0444, show_initsize, NULL); + +static ssize_t 
show_taint(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + size_t l; + + l = module_flags_taint(mk->mod, buffer); + buffer[l++] = '\n'; + return l; +} + +static struct module_attribute modinfo_taint = + __ATTR(taint, 0444, show_taint, NULL); static struct module_attribute *modinfo_attrs[] = { + &module_uevent, &modinfo_version, &modinfo_srcversion, - &initstate, - &module_uevent, + &modinfo_initstate, + &modinfo_coresize, + &modinfo_initsize, + &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD - &refcnt, + &modinfo_refcnt, #endif NULL, }; @@ -3236,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[bx++] = 'P'; - else if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[bx++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[bx++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[bx++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - + bx += module_flags_taint(mod, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; -- cgit v1.2.3-70-g09d2 From 69116f279a9eaf4c540934269342d9149538fc79 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: module_param: avoid bool abuse, add bint for special cases. For historical reasons, we allow module_param(bool) to take an int (or an unsigned int). That's going away. A few drivers really want an int: they set it to -1 and a parameter will set it to 0 or 1. This sucks: reading them from sysfs will give 'Y' for both -1 and 1, but if we change it to an int, then the users might be broken (if they did "param" instead of "param=1"). Use a new 'bint' parser for them. (ntfs has a different problem: it needs an int for debug_msgs because it's also exposed via sysctl.) Cc: Steve Glendinning Cc: Jean Delvare Cc: Guenter Roeck Cc: Hoang-Nam Nguyen Cc: Christoph Raisch Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Cc: linux390@de.ibm.com Cc: Anton Altaparmakov Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: lm-sensors@lm-sensors.org Cc: linux-rdma@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: alsa-devel@alsa-project.org Acked-by: Takashi Iwai (For the sound part) Acked-by: Guenter Roeck (For the hwmon driver) Signed-off-by: Rusty Russell --- drivers/hwmon/emc2103.c | 2 +- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- drivers/s390/cio/cmf.c | 2 +- fs/ntfs/super.c | 2 +- include/linux/moduleparam.h | 6 ++++++ kernel/params.c | 24 ++++++++++++++++++++++++ sound/pci/intel8x0.c | 4 ++-- 7 files changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/hwmon/emc2103.c b/drivers/hwmon/emc2103.c index 848a2b0bc83..865063914d7 100644 --- a/drivers/hwmon/emc2103.c +++ b/drivers/hwmon/emc2103.c @@ -55,7 +55,7 @@ static const u8 REG_TEMP_MAX[4] = { 0x34, 0x30, 0x31, 0x32 }; * it. Default is to leave the device in the state it's already in (-1). 
* This parameter allows APD mode to be optionally forced on or off */ static int apd = -1; -module_param(apd, bool, 0); +module_param(apd, bint, 0); MODULE_PARM_DESC(init, "Set to zero to disable anti-parallel diode mode"); struct temperature { diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index c240e9972cb..8af8d4f7bdb 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -82,7 +82,7 @@ module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); -module_param_named(lock_hcalls, ehca_lock_hcalls, bool, S_IRUGO); +module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO); module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 2985eb43948..204ca728e7f 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -98,7 +98,7 @@ enum cmb_format { * enum cmb_format. */ static int format = CMF_AUTODETECT; -module_param(format, bool, 0444); +module_param(format, bint, 0444); /** * struct cmb_operations - functions to use depending on cmb_format diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 608be451609..5a4a8af5c40 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG -module_param(debug_msgs, bool, 0); +module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); #endif diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 794d4b0f121..6bdde0c3bcc 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -367,6 +367,12 @@ extern int param_set_invbool(const char *val, const struct kernel_param *kp); extern int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) +/* An int, which can only be set like a bool (though it shows as an int). */ +extern struct kernel_param_ops param_ops_bint; +extern int param_set_bint(const char *val, const struct kernel_param *kp); +#define param_get_bint param_get_int +#define param_check_bint param_check_int + /** * module_param_array - a parameter which is an array of some type * @name: the name of the array variable diff --git a/kernel/params.c b/kernel/params.c index 9240664af11..32ee0430828 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -363,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = { }; EXPORT_SYMBOL(param_ops_invbool); +int param_set_bint(const char *val, const struct kernel_param *kp) +{ + struct kernel_param boolkp; + bool v; + int ret; + + /* Match bool exactly, by re-using it. */ + boolkp = *kp; + boolkp.arg = &v; + boolkp.flags |= KPARAM_ISBOOL; + + ret = param_set_bool(val, &boolkp); + if (ret == 0) + *(int *)kp->arg = v; + return ret; +} +EXPORT_SYMBOL(param_set_bint); + +struct kernel_param_ops param_ops_bint = { + .set = param_set_bint, + .get = param_get_int, +}; +EXPORT_SYMBOL(param_ops_bint); + /* We break the rule and mangle the string. 
*/ static int param_array(const char *name, const char *val, diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c index 40b181bab93..9f3b01bb72c 100644 --- a/sound/pci/intel8x0.c +++ b/sound/pci/intel8x0.c @@ -95,13 +95,13 @@ module_param(ac97_quirk, charp, 0444); MODULE_PARM_DESC(ac97_quirk, "AC'97 workaround for strange hardware."); module_param(buggy_semaphore, bool, 0444); MODULE_PARM_DESC(buggy_semaphore, "Enable workaround for hardwares with problematic codec semaphores."); -module_param(buggy_irq, bool, 0444); +module_param(buggy_irq, bint, 0444); MODULE_PARM_DESC(buggy_irq, "Enable workaround for buggy interrupts on some motherboards."); module_param(xbox, bool, 0444); MODULE_PARM_DESC(xbox, "Set to 1 for Xbox, if you have problems with the AC'97 codec detection."); module_param(spdif_aclink, int, 0444); MODULE_PARM_DESC(spdif_aclink, "S/PDIF over AC-link."); -module_param(inside_vm, bool, 0444); +module_param(inside_vm, bint, 0444); MODULE_PARM_DESC(inside_vm, "KVM/Parallels optimization."); /* just for backward compatibility */ -- cgit v1.2.3-70-g09d2 From 29d4d6df107b9d86982dc759f5b1ddfe2c6b29c0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: printk: fix unnecessary module_param_name. You don't need module_param_name if the name is the same! Cc: Yanmin Zhang Cc: Andrew Morton Signed-off-by: Rusty Russell --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 989e4a52da7..63b3bc31fe3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str) } early_param("ignore_loglevel", ignore_loglevel_setup); -module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); -- cgit v1.2.3-70-g09d2 From 6d6a55ec0877393f467067d44b9a2a8c2e4a82d2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: kernel/async: remove redundant declaration. It's in linux/init.h, and I'm about to change it to a bool. Cc: Arjan van de Ven Signed-off-by: Rusty Russell --- kernel/async.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 80b74b88fef..bd0c168a3bb 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; -extern int initcall_debug; - /* * MUST be called with the lock held! -- cgit v1.2.3-70-g09d2 From 2329abfa344a9a824bc4c71f2415528777265510 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: module_param: make bool parameters really bool (core code) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. 
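A minimal sketch of the form that stays valid after this change (an illustrative out-of-tree module, not taken from this patch; the parameter name is made up):

#include <linux/module.h>
#include <linux/moduleparam.h>

/* The variable backing a 'bool' parameter must now really be bool;
 * declaring it as int is exactly what triggers the new warning. */
static bool enable_debug;
module_param(enable_debug, bool, 0644);
MODULE_PARM_DESC(enable_debug, "Example flag, set with enable_debug=Y/N");

static int __init demo_init(void)
{
	pr_info("demo: enable_debug=%d\n", enable_debug);
	return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");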
Signed-off-by: Rusty Russell --- include/linux/init.h | 3 ++- init/main.c | 2 +- kernel/irq/internals.h | 2 +- kernel/irq/spurious.c | 2 +- kernel/printk.c | 8 ++++---- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/init.h b/include/linux/init.h index 9146f39cddd..6b951095a42 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -2,6 +2,7 @@ #define _LINUX_INIT_H #include +#include /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) @@ -156,7 +157,7 @@ void prepare_namespace(void); extern void (*late_time_init)(void); -extern int initcall_debug; +extern bool initcall_debug; #endif diff --git a/init/main.c b/init/main.c index 415548e808d..ff49a6dacfb 100644 --- a/init/main.c +++ b/init/main.c @@ -648,7 +648,7 @@ static void __init do_ctors(void) #endif } -int initcall_debug; +bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); static char msgbuf[64]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a73dd6c7372..b7952316016 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,7 +15,7 @@ #define istate core_internal_state__do_not_mess_with_it -extern int noirqdebug; +extern bool noirqdebug; /* * Bits used by threaded handlers: diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dc813a948be..611cd6003c4 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->irqs_unhandled = 0; } -int noirqdebug __read_mostly; +bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { diff --git a/kernel/printk.c b/kernel/printk.c index 63b3bc31fe3..13c0a1143f4 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end) } } -static int __read_mostly ignore_loglevel; +static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { @@ -696,9 +696,9 @@ static void zap_locks(void) } #if defined(CONFIG_PRINTK_TIME) -static int printk_time = 1; +static bool printk_time = 1; #else -static int printk_time = 0; +static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); @@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -int console_suspend_enabled = 1; +bool console_suspend_enabled = 1; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) -- cgit v1.2.3-70-g09d2 From efeb156e7275c5b6c6e0f96aceb3c6abf98fc392 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 12 Jan 2012 17:17:11 -0800 Subject: kprobes: silence DEBUG_STRICT_USER_COPY_CHECKS=y warning Enabling DEBUG_STRICT_USER_COPY_CHECKS causes the following warning: In file included from arch/x86/include/asm/uaccess.h:573, from kernel/kprobes.c:55: In function 'copy_from_user', inlined from 'write_enabled_file_bool' at kernel/kprobes.c:2191: arch/x86/include/asm/uaccess_64.h:65: warning: call to 'copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct presumably due to buf_size being signed causing GCC to fail to see that buf_size can't become negative. Signed-off-by: Stephen Boyd Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Acked-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823..95dd7212e61 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) -- cgit v1.2.3-70-g09d2 From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:17 -0800 Subject: treewide: remove useless NORET_TYPE macro and uses It's a very old and now unused prototype marking so just delete it. Neaten panic pointer argument style to keep checkpatch quiet. Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/include/asm/system.h | 2 +- arch/avr32/kernel/traps.c | 2 +- arch/ia64/kernel/machine_kexec.c | 2 +- arch/m68k/amiga/config.c | 2 +- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/traps.c | 2 +- arch/powerpc/kernel/machine_kexec_32.c | 2 +- arch/powerpc/kernel/machine_kexec_64.c | 6 +++--- arch/s390/kernel/nmi.c | 2 +- arch/tile/kernel/machine_kexec.c | 4 ++-- include/linux/kernel.h | 6 +++--- include/linux/linkage.h | 1 - include/linux/sched.h | 2 +- kernel/exit.c | 6 +++--- kernel/panic.c | 2 +- 15 files changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h index 9702c2213e1..62d9ded0163 100644 --- a/arch/avr32/include/asm/system.h +++ b/arch/avr32/include/asm/system.h @@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) struct pt_regs; -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err); +void die(const char *str, struct pt_regs *regs, long err); void _exception(long signr, struct pt_regs *regs, int code, unsigned long addr); diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 7aa25756412..3d760c06f02 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -24,7 +24,7 @@ static DEFINE_SPINLOCK(die_lock); -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { static int die_counter; diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 3d3aeef4694..581a16d5e85 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -27,7 +27,7 @@ #include #include -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long start_address, struct ia64_boot_param *boot_param, diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 82a4bb51d5d..a3b0558328b 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -511,7 +511,7 @@ static unsigned long amiga_gettimeoffset(void) return ticks + offset; } -static NORET_TYPE void 
amiga_reset(void) +static void amiga_reset(void) ATTRIB_NORET; static void amiga_reset(void) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f343e..3d913259e50 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child, extern asmlinkage void syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); -extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET; +extern void die(const char *, struct pt_regs *) ATTRIB_NORET; static inline void die_if_kernel(const char *str, struct pt_regs *regs) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 5c8a49d5505..725e9a5ca96 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs) /* * NMI exception handler. */ -NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) +void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) { bust_spinlocks(1); printk("NMI taken!!!!\n"); diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c index e63f2e7d2ef..026e7f15394 100644 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -16,7 +16,7 @@ #include #include -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long reboot_code_buffer, unsigned long start_address) ATTRIB_NORET; diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 26ccbf77dd4..5fbbf814923 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data = struct paca_struct kexec_paca; /* Our assembly helper, in kexec_stub.S */ -extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, - void *image, void *control, - void (*clear_all)(void)) ATTRIB_NORET; +extern void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) ATTRIB_NORET; /* too late to fail here */ void default_machine_kexec(struct kimage *image) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fab88431a06..0fd2e863e11 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -30,7 +30,7 @@ struct mcck_struct { static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static NORET_TYPE void s390_handle_damage(char *msg) +static void s390_handle_damage(char *msg) { smp_send_stop(); disabled_wait((unsigned long) __builtin_return_address(0)); diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c index e00d7179989..b0c90705906 100644 --- a/arch/tile/kernel/machine_kexec.c +++ b/arch/tile/kernel/machine_kexec.c @@ -248,10 +248,10 @@ static void setup_quasi_va_is_pa(void) } -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { void *reboot_code_buffer; - NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long) + void (*rnk)(unsigned long, void *, unsigned long) ATTRIB_NORET; /* Mask all interrupts before starting to reboot. 
*/ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 60934395e36..aaf1753dd2b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,16 +185,16 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE __printf(1, 2) +__printf(1, 2) void panic(const char *fmt, ...) ATTRIB_NORET __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -NORET_TYPE void do_exit(long error_code) +void do_exit(long error_code) ATTRIB_NORET; -NORET_TYPE void complete_and_exit(struct completion *, long) +void complete_and_exit(struct completion *, long) ATTRIB_NORET; /* Internal, do not use. */ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index c75074cb8ad..6a8f252e49e 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,7 +88,6 @@ #endif -#define NORET_TYPE /**/ #define ATTRIB_NORET __attribute__((noreturn)) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 21cd0303af5..4032ec1cf83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); extern void flush_itimer_signals(void); -extern NORET_TYPE void do_group_exit(int); +extern void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb5..c44738267be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -887,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c..6fd09ed6fd9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { static char buf[1024]; va_list args; -- cgit v1.2.3-70-g09d2 From a3dd3323058d281abd584b15ad4c5b65064d7a61 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 12 Jan 2012 17:20:11 -0800 Subject: kexec: remove KMSG_DUMP_KEXEC KMSG_DUMP_KEXEC is useless because we already save kernel messages inside /proc/vmcore, and it is unsafe to allow modules to do other stuffs in a crash dump scenario. [akpm@linux-foundation.org: fix powerpc build] Signed-off-by: WANG Cong Reported-by: Vivek Goyal Acked-by: Vivek Goyal Acked-by: Jarod Wilson Cc: "Eric W. 
Biederman" Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/nvram.c | 1 - drivers/char/ramoops.c | 3 +-- drivers/mtd/mtdoops.c | 3 +-- include/linux/kmsg_dump.h | 1 - kernel/kexec.c | 3 --- 5 files changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 330a57b7c17..36f957f3184 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: - case KMSG_DUMP_KEXEC: break; case KMSG_DUMP_PANIC: panicking = true; diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index 7c7f42a1f88..feda90cdac9 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, struct timeval timestamp; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index db8e8272d69..3ce99e00a49 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, char *dst; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index ee0c952188d..fee66317e07 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,7 +18,6 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, - KMSG_DUMP_KEXEC, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10d960..20ed47ae252 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); -- cgit v1.2.3-70-g09d2 From 6480e5a0923756b500634d9777ec4189492fbbfe Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:14 -0800 Subject: kdump: add missing RAM resource in crash_shrink_memory() When shrinking crashkernel memory using /sys/kernel/kexec_crash_size for the newly added memory no RAM resource is created at the moment. Example: $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : Crash kernel d0000000-ffffffff : System RAM $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss <<-- here is System RAM missing d0000000-ffffffff : System RAM One result of this bug is that the memory chunk can never be set offline using memory hotplug. With this patch I insert a new "System RAM" resource for the released memory. 
Then the upper example looks like the following: $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : System RAM <<-- new resource d0000000-ffffffff : System RAM And now I can set chunk c0000000-cfffffff offline. Signed-off-by: Michael Holzheu Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 20ed47ae252..60bf181b3ea 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1146,6 +1147,12 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); @@ -1154,7 +1161,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: -- cgit v1.2.3-70-g09d2 From bec013c40bc89671d8d457944fdf7d2b8e79d651 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:15 -0800 Subject: kdump: crashk_res init check for /sys/kernel/kexec_crash_size Currently it is possible to set the crash_size via the sysfs /sys/kernel/kexec_crash_size even if no crash kernel memory has been defined with the "crashkernel" parameter. In this case "crashk_res" is not initialized and crashk_res.start = crashk_res.end = 0. Unfortunately resource_size(&crashk_res) returns 1 in this case. This breaks the s390 implementation of crash_(un)map_reserved_pages(). To fix the problem the correct "old_size" is now calculated in crash_shrink_memory(). "old_size" is set to "0" if crashk_res is not initialized. With this change crash_shrink_memory() will do nothing when "crashk_res" is not initialized. It will return "0" for "echo 0 > /sys/kernel/kexec_crash_size" and -EINVAL for "echo [not zero] > /sys/kernel/kexec_crash_size". In addition to that this patch also simplifies the "ret = -EINVAL" vs. "ret = 0" logic as suggested by Simon Horman. Signed-off-by: Michael Holzheu Reviewed-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Cc: Vivek Goyal Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bf181b3ea..7b088678670 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1139,11 +1140,9 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; - - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } -- cgit v1.2.3-70-g09d2 From 93e13a360ba331915220f82f6e9543df961ffa1f Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:18 -0800 Subject: kdump: fix crash_kexec()/smp_send_stop() race in panic() When two CPUs call panic at the same time there is a possible race condition that can stop kdump. The first CPU calls crash_kexec() and the second CPU calls smp_send_stop() in panic() before crash_kexec() finished on the first CPU. So the second CPU stops the first CPU and therefore kdump fails: 1st CPU: panic()->crash_kexec()->mutex_trylock(&kexec_mutex)-> do kdump 2nd CPU: panic()->crash_kexec()->kexec_mutex already held by 1st CPU ->smp_send_stop()-> stop 1st CPU (stop kdump) This patch fixes the problem by introducing a spinlock in panic that allows only one CPU to process crash_kexec() and the subsequent panic code. All other CPUs call the weak function panic_smp_self_stop() that stops the CPU itself. This function can be overloaded by architecture code. For example "tile" can use their lower-power "nap" instruction for that. Signed-off-by: Michael Holzheu Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6fd09ed6fd9..5dce5404eee 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -59,6 +68,7 @@ EXPORT_SYMBOL(panic_blink); */ void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ void panic(const char *fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). 
*/ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); -- cgit v1.2.3-70-g09d2 From b8f566b04d3cddd192cfd2418ae6d54ac6353792 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 12 Jan 2012 17:20:27 -0800 Subject: sysctl: add the kernel.ns_last_pid control The sysctl works on the current task's pid namespace, getting and setting its last_pid field. Writing is allowed for CAP_SYS_ADMIN-capable tasks thus making it possible to create a task with desired pid value. This ability is required badly for the checkpoint/restore in userspace. This approach suits all the parties for now. Signed-off-by: Pavel Emelyanov Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 8 ++++++++ kernel/pid.c | 4 +++- kernel/pid_namespace.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d8cd8b2c30..8c20fbd8b42 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated. ============================================================== +ns_last_pid: + +The last pid allocated in the current (the one task using this sysctl +lives in) pid namespace. When selecting a pid for a next task on fork +kernel tries to allocate a number starting from this one. + +============================================================== + powersave-nap: (PPC only) If set, Linux-PPC will use the 'nap' mode of powersaving, diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5..ce8e00deacc 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca..a8968396046 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. 
+ */ + tmp.data = &current->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } -- cgit v1.2.3-70-g09d2 From 6e6f0a1f0fa6bba1493c296eb30d1e176e1f8530 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:30 -0800 Subject: panic: don't print redundant backtraces on oops When an oops causes a panic and panic prints another backtrace it's pretty common to have the original oops data be scrolled away on an 80x50 screen. The second backtrace is quite redundant and not needed anyways. So don't print the panic backtrace when oops_in_progress is true. [akpm@linux-foundation.org: add comment] Signed-off-by: Andi Kleen Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 5dce5404eee..80aed44e345 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -94,7 +94,11 @@ void panic(const char *fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* -- cgit v1.2.3-70-g09d2 From 028ee4be34a09a6d48bdf30ab991ae933a7bc036 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:55 -0800 Subject: c/r: prctl: add PR_SET_MM codes to set up mm_struct entries When we restore a task we need to set up text, data and data heap sizes from userspace to the values a task had at checkpoint time. This patch adds auxiliary prctl codes for that. While most of them have a statistical nature (their values are involved in the calculation of /proc/<pid>/statm output) the start_brk and brk values are used to compute an allowed size of program data segment expansion. Which means arbitrary changes of these values might be a dangerous operation. So to restrict access the following requirements are applied to prctl calls: - The process has to have CAP_SYS_ADMIN capability granted. - For all opcodes except start_brk/brk members an appropriate VMA area must exist and should fit certain VMA flags, such as: - code segment must be executable but not writable; - data segment must not be executable. start_brk/brk values must not intersect with the data segment and must not exceed the RLIMIT_DATA resource limit. Still the main guard is the CAP_SYS_ADMIN capability check. Note the kernel should be compiled with CONFIG_CHECKPOINT_RESTORE support, otherwise these prctl calls will return -EINVAL. 
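From userspace the new codes would be exercised roughly as follows (a hedged sketch: the address is made up, and the PR_* values are spelled out for pre-3.3 headers, matching the prctl.h hunk below):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35	/* values from the prctl.h hunk below */
#define PR_SET_MM_START_BRK	6
#endif

int main(void)
{
	/* Illustrative address only; a real restorer passes the brk
	 * start recorded at checkpoint time. Needs CAP_SYS_ADMIN and
	 * CONFIG_CHECKPOINT_RESTORE, and arg4/arg5 must be zero. */
	unsigned long start_brk = 0x02000000UL;

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, start_brk, 0, 0) != 0)
		perror("prctl(PR_SET_MM, PR_SET_MM_START_BRK)");
	return 0;
}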
[akpm@linux-foundation.org: cache current->mm in a local, saving 200 bytes text] Signed-off-by: Cyrill Gorcunov Reviewed-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 12 +++++ kernel/sys.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a3baeb2c216..7ddc7f1b480 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -102,4 +102,16 @@ #define PR_MCE_KILL_GET 34 +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f..40701538fbd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; 
+} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3-70-g09d2 From dae5cbc2440b1d21a15715d0f1fb20f632dd38ee Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 14 Jan 2012 00:33:03 +0100 Subject: PM: Make sysrq-o be available for CONFIG_PM unset After commit 1eb208aea3179dd2fc0cdeea45ef869d75b4fe70, "PM: Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME)", the files under kernel/power are not built unless CONFIG_PM_SLEEP or CONFIG_PM_RUNTIME is set. In particular, this causes kernel/power/poweroff.c to be omitted, even though it should be compiled, because CONFIG_MAGIC_SYSRQ is set. Fix the problem by causing kernel/power/Makefile to be processed for CONFIG_PM unset too. Reported-and-tested-by: Phil Oester Signed-off-by: Rafael J. Wysocki --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f70396e5a24..2d9de86b7e7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o @@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o -obj-$(CONFIG_PM) += power/ -obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o -- cgit v1.2.3-70-g09d2 From ee34a37049114303011e154478c63b977bcff24c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Jan 2012 12:56:23 +0800 Subject: PM / Hibernate: Drop the check of swap space size for compressed image For a compressed image, the space required is not known until we finish compressing and writing all pages. This patch drops the check; if swap space eventually turns out to be insufficient, the system can still return to normal after the swap writes fail for a compressed image. Signed-off-by: Barry Song Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3739ecced08..8742fd013a9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -773,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) pr_debug("PM: Free swap pages: %u\n", free_swap); - required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
- nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + required = PAGES_FOR_IO + nr_pages; return free_swap > required; } @@ -802,10 +801,12 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out_finish; + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); -- cgit v1.2.3-70-g09d2 From 53999bf34d55981328f8ba9def558d3e104d6e36 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Sun, 15 Jan 2012 19:32:55 -0400 Subject: error: implicit declaration of function 'module_flags_taint' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent changes to kernel/module.c caused the following compile error: kernel/module.c: In function ‘show_taint’: kernel/module.c:1024:2: error: implicit declaration of function ‘module_flags_taint’ [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Correct this error by moving the definition of module_flags_taint outside of the #ifdef CONFIG_MODULE_UNLOAD section. Signed-off-by: Kevin Winchester Signed-off-by: Linus Torvalds --- kernel/module.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index acf6ed3ebe8..2c932760fd3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -842,26 +842,6 @@ out: return ret; } -static size_t module_flags_taint(struct module *mod, char *buf) -{ - size_t l = 0; - - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - return l; -} - static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; @@ -962,6 +942,26 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static ssize_t show_initstate(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { -- cgit v1.2.3-70-g09d2 From c10076c4304083af15a41f6bc5e657e781c1f9a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Jan 2012 21:40:59 -0500 Subject: tracepoints/module: Fix disabling tracepoints with taint CRAP or OOT Tracepoints are disabled for tainted modules, which is usually because the module is either proprietary or was forced, and we don't want either of them using kernel tracepoints. But, a module can also be tainted by being in the staging directory or compiled out of tree. 
Either is fine for use with tracepoints, no need to punish them. I found this out when I noticed that my sample trace event module, when done out of tree, stopped working. Cc: stable@vger.kernel.org # 3.2 Cc: Mathieu Desnoyers Cc: Ben Hutchings Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index db110b8ae03..f1539decd99 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod) int ret = 0; /* - * We skip modules that tain the kernel, especially those with different - * module header (for forced load), to make sure we don't cause a crash. + * We skip modules that taint the kernel, especially those with different + * module headers (for forced load), to make sure we don't cause a crash. + * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints) + if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3-70-g09d2 From d8e8ed95cda1dfd6813588333d36552935eba4a1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 15 Dec 2011 13:43:24 +1030 Subject: rcu: Make rcutorture bool parameters really bool (core code) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. This commit makes this change to rcutorture. Signed-off-by: Rusty Russell Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88f17b8a3b1..e29edc374ce 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool verbose; /* Print more debug info. */ +static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ -- cgit v1.2.3-70-g09d2 From 4410030646be072b82ec1892ad5cc7d91af384d8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 27 Dec 2011 15:04:26 +0100 Subject: rcu: Add missing __cpuinit annotation in rcutorture code "rcu: Add rcutorture CPU-hotplug capability" adds cpu hotplug operations to the rcutorture code but produces a false positive warning about section mismatches: WARNING: vmlinux.o(.text+0x1e420c): Section mismatch in reference from the function rcu_torture_onoff() to the function .cpuinit.text:cpu_up() The function rcu_torture_onoff() references the function __cpuinit cpu_up(). This is often because rcu_torture_onoff lacks a __cpuinit annotation or the annotation of cpu_up is wrong. 
This commit therefore adds a __cpuinit annotation so the warning goes away. Signed-off-by: Heiko Carstens Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e29edc374ce..a58ac285fc6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1399,7 +1399,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int +static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; @@ -1447,7 +1447,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int +static int __cpuinit rcu_torture_onoff_init(void) { if (onoff_interval <= 0) -- cgit v1.2.3-70-g09d2 From 951880e634a79884236a575b896abf55c39ae0bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Jan 2012 10:19:41 -0800 Subject: Revert "capabitlies: ns_capable can use the cap helpers rather than lsm call" This reverts commit d2a7009f0bb03fa22ad08dd25472efa0568126b9. J. R. Okajima explains: "After this commit, I am afraid access(2) on NFS may not work correctly. The scenario based upon my guess. - access(2) overrides the credentials. - calls inode_permission() -- ... -- generic_permission() -- ns_capable(). - while the old ns_capable() calls security_capable(current_cred()), the new ns_capable() calls has_ns_capability(current) -- security_capable(__task_cred(t)). current_cred() returns current->cred which is effective (overridden) credentials, but __task_cred(current) returns current->real_cred (the NFSD's credential). And the overridden credentials by access(2) lost." Requested-by: J. R. Okajima Acked-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 0fcf1c14a29..3f1adb6c647 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (has_ns_capability(current, ns, cap)) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3-70-g09d2 From 5ef30ee53b187786e64bdc1f8109e39d17f2ce58 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: make filetype matching consistent with other filters Every other filter that matches part of the inodes list collected by audit will match against any of the inodes on that list. The filetype matching, however, had a strange way of doing things. It allowed userspace to indicate whether it should match on the first or the second name collected by the kernel. Name collection ordering seems like a kernel internal and making userspace rules get that right just seems like a bad idea. As it turns out the userspace audit writers had no idea it was doing this and thus never overloaded the value field. The kernel always checked the first name collected, which for the tested rules was always correct. This patch just makes the filetype matching like the major, minor, inode, and LSM rules in that it will match against any of the names collected. It also changes the rule validation to reject the old unused rule types. No one knew it was there. No one used it. Why keep around the extra code?
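To make the new validation concrete, here is a small sketch of what a filetype rule's value may now carry (illustrative assignments only; the S_IF* constants come from the kernel's stat definitions):

	/* accepted: f->val is a pure S_IFMT file type */
	f->val = S_IFREG;	/* 0100000: matches any collected name that is a regular file */

	/* rejected: the old "which name" overload no longer validates */
	f->val = 1 | S_IFLNK;	/* low bits set, so f->val & ~S_IFMT is non-zero -> -EINVAL */

In auditctl terms this corresponds to a plain rule such as -F filetype=file (assuming the usual userspace field name); the kernel now walks every collected name rather than a single indexed entry.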
Signed-off-by: Eric Paris --- kernel/auditfilter.c | 4 ++-- kernel/auditsc.c | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d67..d94dde82c3c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -385,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -536,7 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; default: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b..a09c5031705 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -305,21 +305,20 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int which) +static int audit_match_filetype(struct audit_context *ctx, int val) { - unsigned index = which & ~S_IFMT; - umode_t mode = which & S_IFMT; + int index; + umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; + for (index = 0; index < ctx->name_count; index++) { + if ((ctx->names[index].ino != -1) && + ((ctx->names[index].mode & S_IFMT) == mode)) + return 1; + } + return 0; } /* -- cgit v1.2.3-70-g09d2 From 5195d8e217a78697152d64fc09a16e063a022465 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: dynamically allocate audit_names when not enough space is in the names array This patch does 2 things. First it reduces the number of audit_names allocated in every audit context from 20 to 5. 5 should be enough for all 'normal' syscalls (rename being the worst). Some syscalls can still touch more than 5 inodes, such as mount. When the rpc filesystem is mounted it will create inodes, and those can exceed 5. To handle that problem this patch will dynamically allocate audit_names if it needs more than 5. This should decrease the typical memory usage while still supporting all the possible kernel operations. Signed-off-by: Eric Paris --- kernel/auditsc.c | 403 +++++++++++++++++++++++++++++-------------------- 1 file changed, 215 insertions(+), 188 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a09c5031705..1a92d61ddd2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -71,8 +71,9 @@ #include "audit.h" /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -101,9 +102,8 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device.
*/ struct audit_names { + struct list_head list; /* audit_context->names_list */ const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -113,6 +113,14 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + bool name_put; /* call __putname() for this name */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit + */ + bool should_free; }; struct audit_aux_data { @@ -174,8 +182,17 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - int name_count; - struct audit_names names[AUDIT_NAMES]; + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -307,17 +324,18 @@ static int audit_match_perm(struct audit_context *ctx, int mask) static int audit_match_filetype(struct audit_context *ctx, int val) { - int index; + struct audit_names *n; umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - for (index = 0; index < ctx->name_count; index++) { - if ((ctx->names[index].ino != -1) && - ((ctx->names[index].mode & S_IFMT) == mode)) + list_for_each_entry(n, &ctx->names_list, list) { + if ((n->ino != -1) && + ((n->mode & S_IFMT) == mode)) return 1; } + return 0; } @@ -456,13 +474,14 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, j, need_sid = 1; + int i, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + struct audit_names *n; int result = 0; switch (f->type) { @@ -525,8 +544,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(MAJOR(name->dev), f->op, f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val)) { ++result; break; } @@ -538,8 +557,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(MINOR(name->dev), f->op, f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MINOR(n->dev), f->op, f->val)) { ++result; break; } @@ -550,8 +569,8 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + list_for_each_entry(n, 
&ctx->names_list, list) { + if (audit_comparator(n->ino, f->op, f->val)) { ++result; break; } @@ -606,11 +625,10 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (security_audit_rule_match(n->osid, f->type, + f->op, f->lsm_rule, + ctx)) { ++result; break; } @@ -721,40 +739,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* At syscall exit time, this filter is called if any audit_names[] have been +/* + * Given an audit_name check the inode hash table to see if they match. + * Called holding the rcu read lock to protect the use of audit_inode_hash + */ +static int audit_filter_inode_name(struct task_struct *tsk, + struct audit_names *n, + struct audit_context *ctx) { + int word, bit; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + struct audit_entry *e; + enum audit_state state; + + word = AUDIT_WORD(ctx->major); + bit = AUDIT_BIT(ctx->major); + + if (list_empty(list)) + return 0; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { + ctx->current_state = state; + return 1; + } + } + + return 0; +} + +/* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. + * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
*/ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - int i; - struct audit_entry *e; - enum audit_state state; + struct audit_names *n; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - if (list_empty(list)) - continue; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_filter_inode_name(tsk, n, ctx)) + break; } rcu_read_unlock(); } @@ -798,7 +829,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - int i; + struct audit_names *n, *next; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -809,10 +840,9 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) { + list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); + n->name, n->name ?: "(null)"); } dump_stack(); return; @@ -823,9 +853,12 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); + list_for_each_entry_safe(n, next, &context->names_list, list) { + list_del(&n->list); + if (n->name && n->name_put) + __putname(n->name); + if (n->should_free) + kfree(n); } context->name_count = 0; path_put(&context->pwd); @@ -863,6 +896,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); + INIT_LIST_HEAD(&context->names_list); return context; } @@ -1323,6 +1357,68 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static void audit_log_name(struct audit_context *context, struct audit_names *n, + int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", record_num); + + if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, "name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + 
MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1330,6 +1426,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; + struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1469,66 +1566,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); - } + i = 0; + list_for_each_entry(n, &context->names_list, list) + audit_log_name(context, n, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1820,6 +1861,30 @@ retry: #endif } +static struct audit_names *audit_alloc_name(struct audit_context *context) +{ + struct audit_names *aname; + + if (context->name_count < AUDIT_NAMES) { + aname = &context->preallocated_names[context->name_count]; + memset(aname, 0, sizeof(*aname)); + } else { + aname = kzalloc(sizeof(*aname), GFP_NOFS); + if (!aname) + return NULL; + aname->should_free = true; + } + + aname->ino = (unsigned long)-1; + list_add_tail(&aname->list, &context->names_list); + + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return aname; +} + /** * audit_getname - add a name to the list * @name: name to add @@ -1830,6 +1895,7 @@ retry: void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; + struct audit_names *n; if (IS_ERR(name) || !name) return; @@ -1842,13 +1908,15 @@ void __audit_getname(const char *name) #endif return; } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - 
context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; + + n = audit_alloc_name(context); + if (!n) + return; + + n->name = name; + n->name_len = AUDIT_NAME_FULL; + n->name_put = true; + if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -1870,12 +1938,13 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { + struct audit_names *n; int i; - for (i = 0; i < context->name_count; i++) + + list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } + n->name, n->name ?: "(null)"); + } #endif __putname(name); } @@ -1896,39 +1965,11 @@ void audit_putname(const char *name) #endif } -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - - static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; - memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); - memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); - name->fcap.fE = 0; - name->fcap_ver = 0; - if (!dentry) return 0; @@ -1968,30 +2009,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { - int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; + struct audit_names *n; if (!context->in_syscall) return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? 
*/ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; + + list_for_each_entry_reverse(n, &context->names_list, list) { + if (n->name && (n->name == name)) + goto out; } + + /* unable to find the name from a previous getname() */ + n = audit_alloc_name(context); + if (!n) + return; +out: handle_path(dentry); - audit_copy_inode(&context->names[idx], dentry, inode); + audit_copy_inode(n, dentry, inode); } /** @@ -2010,11 +2046,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { - int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; + struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2024,9 +2060,7 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2039,9 +2073,7 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2059,34 +2091,29 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - if (audit_inc_name_count(context, parent)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], NULL, parent); + audit_copy_inode(n, NULL, parent); } if (!found_child) { - if (audit_inc_name_count(context, inode)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; + n->name = found_parent; + n->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; + n->name_put = false; } if (inode) - audit_copy_inode(&context->names[idx], NULL, inode); - else - context->names[idx].ino = (unsigned long)-1; + audit_copy_inode(n, NULL, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); -- cgit v1.2.3-70-g09d2 From 3035c51e8ac0512686ceb9f2bd1d13bdc6e4fb29 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: drop the meaningless and format breaking word 'user' userspace audit messages look like so: type=USER msg=audit(1271170549.415:24710): user pid=14722 uid=0 auid=500 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 msg='' That third field just says 'user'. That's useless and doesn't follow the key=value pair we are trying to enforce. We already know it came from the user based on the record type. Kill that word. Die. 
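With the patch applied the same record would read (illustrative; simply the sample above minus the dropped word):

type=USER msg=audit(1271170549.415:24710): pid=14722 uid=0 auid=500 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 msg=''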
Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2c1d6ab7106..00efe4758c8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); -- cgit v1.2.3-70-g09d2 From 16c174bd95cb07c9d0ad3fcd8c70f9cea7214c9d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: check current inode and containing object when filtering on major and minor The audit system has the ability to filter on the major and minor number of the device containing the inode being operated upon. Let's say that /dev/sda1 has major,minor 8,1 and that we mount /dev/sda1 on /boot. Now let's say we add a watch with a filter on 8,1. If we proceed to open an inode inside /boot, such as /boot/vmlinuz, we will match the major,minor filter. Let's instead assume that one were to use a tool like debugfs and were to open /dev/sda1 directly and to modify its contents. We might hope that this would also be logged, but it isn't. The rules will check the major,minor of the device containing /dev/sda1. In other words the rule would match on the major/minor of the tmpfs mounted at /dev. I believe these rules should trigger on either device. The man page is devoid of useful information about the intended semantics. It only seems logical that if you want to know everything that happened on a major,minor that would include things that happened to the device itself...
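The dev/rdev distinction behind this change is visible from userspace with plain stat(2). A minimal sketch, reusing the paths from the example above (hypothetical machine, of course):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(void)
{
	struct stat st;

	if (stat("/boot/vmlinuz", &st) == 0)	/* a file on the watched filesystem */
		printf("file: dev=%u:%u\n", major(st.st_dev), minor(st.st_dev));

	if (stat("/dev/sda1", &st) == 0)	/* the device node itself */
		printf("node: dev=%u:%u rdev=%u:%u\n",
		       major(st.st_dev), minor(st.st_dev),
		       major(st.st_rdev), minor(st.st_rdev));
	return 0;
}

For the file, st_dev is 8:1; for the node, st_dev names the filesystem holding /dev while st_rdev is 8:1. With this patch a rule on major 8, minor 1 (e.g. auditctl -a always,exit -S open -F devmajor=8 -F devminor=1, assuming the usual userspace field names) matches both opens, since the kernel now compares both dev and rdev.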
Signed-off-by: Eric Paris --- kernel/auditsc.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1a92d61ddd2..7c495147c3d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -540,12 +540,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) - result = audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { + if (name) { + if (audit_comparator(MAJOR(name->dev), f->op, f->val) || + audit_comparator(MAJOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MAJOR(n->dev), f->op, f->val)) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val) || + audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -553,12 +555,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { + if (name) { + if (audit_comparator(MINOR(name->dev), f->op, f->val) || + audit_comparator(MINOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MINOR(n->dev), f->op, f->val)) { + if (audit_comparator(MINOR(n->dev), f->op, f->val) || + audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } -- cgit v1.2.3-70-g09d2 From 85e7bac33b8d5edafc4e219c7dfdb3d48e0b4e31 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: seccomp: audit abnormal end to a process due to seccomp The audit system likes to collect information about processes that end abnormally (SIGSEGV) as this may be useful intrusion detection information. This patch adds audit support to collect information when seccomp forces a task to exit because of misbehavior in a similar way.
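A minimal way to exercise the new record from userspace (a sketch; assumes audit is enabled and the kernel carries this patch):

#include <fcntl.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>

int main(void)
{
	/* strict mode permits only read(), write(), exit() and sigreturn() */
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);

	/*
	 * Any other syscall enters __secure_computing(), which with this
	 * patch emits an AUDIT_ANOM_ABEND record before the SIGKILL.
	 */
	open("/dev/null", O_RDONLY);

	return 0;	/* never reached: the open() above is fatal */
}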
Signed-off-by: Eric Paris --- include/linux/audit.h | 8 ++++++++ kernel/auditsc.c | 50 +++++++++++++++++++++++++++++--------------------- kernel/seccomp.c | 2 ++ 3 files changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 426ab9f4dd8..6e1c533f9b4 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -430,6 +430,7 @@ extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); extern void __audit_inode_child(const struct dentry *dentry, const struct inode *parent); +extern void __audit_seccomp(unsigned long syscall); extern void __audit_ptrace(struct task_struct *t); static inline int audit_dummy_context(void) @@ -453,6 +454,12 @@ static inline void audit_inode_child(const struct dentry *dentry, } void audit_core_dumps(long signr); +static inline void audit_seccomp(unsigned long syscall) +{ + if (unlikely(!audit_dummy_context())) + __audit_seccomp(syscall); +} + static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) @@ -558,6 +565,7 @@ extern int audit_signals; #define audit_inode(n,d) do { (void)(d); } while (0) #define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) +#define audit_seccomp(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) #define audit_get_sessionid(t) (-1) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7c495147c3d..e9bcb93800d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2529,6 +2529,25 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + uid_t auid, uid; + gid_t gid; + unsigned int sessionid; + + auid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + current_uid_gid(&uid, &gid); + + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + audit_log_task_context(ab); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " reason="); + audit_log_string(ab, reason); + audit_log_format(ab, " sig=%ld", signr); +} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2539,10 +2558,6 @@ void __audit_mmap_fd(int fd, int flags) void audit_core_dumps(long signr) { struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current), uid; - gid_t gid; - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2551,24 +2566,17 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - current_uid_gid(&uid, &gid); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; + audit_log_abend(ab, "memory violation", signr); + audit_log_end(ab); +} - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); +void __audit_seccomp(unsigned long syscall) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + 
audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_format(ab, " syscall=%ld", syscall); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631..e8d76c5895e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -6,6 +6,7 @@ * This defines a simple but solid secure-computing mode. */ +#include #include #include #include @@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + audit_seccomp(this_syscall); do_exit(SIGKILL); } -- cgit v1.2.3-70-g09d2 From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: Audit: push audit success and retcode into arch ptrace.h The audit system previously expected arches calling to audit_syscall_exit to supply as arguments if the syscall was a success and what the return code was. Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things by converting from negative retcodes to an audit internal magic value stating success or failure. This helper was wrong and could indicate that a valid pointer returned to userspace was a failed syscall. The fix is to fix the layering foolishness. We now pass audit_syscall_exit a struct pt_reg and it in turns calls back into arch code to collect the return value and to determine if the syscall was a success or failure. We also define a generic is_syscall_success() macro which determines success/failure based on if the value is < -MAX_ERRNO. This works for arches like x86 which do not use a separate mechanism to indicate syscall failure. We make both the is_syscall_success() and regs_return_value() static inlines instead of macros. The reason is because the audit function must take a void* for the regs. (uml calls theirs struct uml_pt_regs instead of just struct pt_regs so audit_syscall_exit can't take a struct pt_regs). Since the audit function takes a void* we need to use static inlines to cast it back to the arch correct structure to dereference it. The other major change is that on some arches, like ia64, MIPS and ppc, we change regs_return_value() to give us the negative value on syscall failure. THE only other user of this macro, kretprobe_example.c, won't notice and it makes the value signed consistently for the audit functions across all archs. In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old audit code as the return value. But the ptrace_64.h code defined the macro regs_return_value() as regs[3]. I have no idea which one is correct, but this patch now uses the regs_return_value() function, so it now uses regs[3]. For powerpc we previously used regs->result but now use the regs_return_value() function which uses regs->gprs[3]. regs->gprs[3] is always positive so the regs_return_value(), much like ia64 makes it negative before calling the audit code when appropriate. Signed-off-by: Eric Paris Acked-by: H. Peter Anvin [for x86 portion] Acked-by: Tony Luck [for ia64] Acked-by: Richard Weinberger [for uml] Acked-by: David S. 
Miller [for sparc] Acked-by: Ralf Baechle [for mips] Acked-by: Benjamin Herrenschmidt [for ppc] --- arch/ia64/include/asm/ptrace.h | 13 ++++++++++++- arch/ia64/kernel/ptrace.c | 9 +-------- arch/microblaze/include/asm/ptrace.h | 5 +++++ arch/microblaze/kernel/ptrace.c | 3 +-- arch/mips/include/asm/ptrace.h | 14 +++++++++++++- arch/mips/kernel/ptrace.c | 4 +--- arch/powerpc/include/asm/ptrace.h | 13 ++++++++++++- arch/powerpc/kernel/ptrace.c | 4 +--- arch/s390/include/asm/ptrace.h | 6 +++++- arch/s390/kernel/ptrace.c | 4 +--- arch/sh/include/asm/ptrace_32.h | 5 ++++- arch/sh/include/asm/ptrace_64.h | 5 ++++- arch/sh/kernel/ptrace_32.c | 4 +--- arch/sh/kernel/ptrace_64.c | 4 +--- arch/sparc/include/asm/ptrace.h | 10 +++++++++- arch/sparc/kernel/ptrace_64.c | 11 +---------- arch/um/kernel/ptrace.c | 4 ++-- arch/x86/ia32/ia32entry.S | 10 +++++----- arch/x86/kernel/entry_32.S | 8 ++++---- arch/x86/kernel/entry_64.S | 10 +++++----- arch/x86/kernel/ptrace.c | 3 +-- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/um/shared/sysdep/ptrace.h | 5 +++++ include/linux/audit.h | 22 ++++++++++++++-------- include/linux/ptrace.h | 10 ++++++++++ kernel/auditsc.c | 16 ++++++++++++---- 26 files changed, 132 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index f5cb27614e3..68c98f5b3ca 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->ar_bspstore; } -#define regs_return_value(regs) ((regs)->r8) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return regs->r10 != -1; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->r8; + else + return -regs->r8; +} /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. 
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 8848f43d819..2c154088cce 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1268,14 +1268,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, { int step; - if (unlikely(current->audit_context)) { - int success = AUDITSC_RESULT(regs.r10); - long result = regs.r8; - - if (success != AUDITSC_SUCCESS) - result = -result; - audit_syscall_exit(success, result); - } + audit_syscall_exit(®s); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index 816bee64b19..94e92c80585 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h @@ -61,6 +61,11 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->r3; +} + #else /* __KERNEL__ */ /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index 043cb58f9c4..f564b1bfd38 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -159,8 +159,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); + audit_syscall_exit(regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f343e..7d409505df2 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child, */ #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) -#define regs_return_value(_regs) ((_regs)->regs[2]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !regs->regs[7]; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->regs[2]; + else + return -regs->regs[2]; +} + #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 4e6ea1ffad4..ab0f1963a7b 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -572,9 +572,7 @@ out: */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), - -regs->regs[2]); + audit_syscall_exit(regs); if (!(current->ptrace & PT_PTRACED)) return; diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 48223f9b872..78a205162fd 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -86,7 +86,18 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define kernel_stack_pointer(regs) ((regs)->gpr[1]) -#define regs_return_value(regs) ((regs)->gpr[3]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->ccr & 0x10000000); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->gpr[3]; + else + return -regs->gpr[3]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct 
pt_regs *regs); diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 5de73dbd15c..09d31c12a5e 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1748,9 +1748,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->result); diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 56da355678f..aeb77f01798 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -541,9 +541,13 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 573bc29551e..f5275860098 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -751,9 +751,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), - regs->gprs[2]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h index 6c2239cca1a..2d3e906aa72 100644 --- a/arch/sh/include/asm/ptrace_32.h +++ b/arch/sh/include/asm/ptrace_32.h @@ -76,7 +76,10 @@ struct pt_dspregs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) -#define regs_return_value(_regs) ((_regs)->regs[0]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[0]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h index bf9be7764d6..eb3fcceaf64 100644 --- a/arch/sh/include/asm/ptrace_64.h +++ b/arch/sh/include/asm/ptrace_64.h @@ -13,7 +13,10 @@ struct pt_regs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -#define regs_return_value(_regs) ((_regs)->regs[3]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[3]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 92b3c276339..c0b5c179d27 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -530,9 +530,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), - regs->regs[0]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[0]); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index c8f97649f35..ba720d68643 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -548,9 +548,7 @@ asmlinkage void 
do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), - regs->regs[9]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index a0e1bcf843a..c00c3b5c280 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -207,7 +207,15 @@ do { current_thread_info()->syscall_noerror = 1; \ #define instruction_pointer(regs) ((regs)->tpc) #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) -#define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->u_regs[UREG_I0]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *); #else diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96ee50a8066..c73c8c50f11 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1086,17 +1086,8 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) { - unsigned long tstate = regs->tstate; - int result = AUDITSC_SUCCESS; + audit_syscall_exit(regs); - if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) - result = AUDITSC_FAILURE; - - audit_syscall_exit(result, regs->u_regs[UREG_I0]); - } -#endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->u_regs[UREG_G1]); diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index c9da32b0c70..2ccf25c42fe 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -175,8 +175,8 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) UPT_SYSCALL_ARG2(regs), UPT_SYSCALL_ARG3(regs), UPT_SYSCALL_ARG4(regs)); - else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), - UPT_SYSCALL_RET(regs)); + else + audit_syscall_exit(regs); } /* Fake a debug trap */ diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3e274564f6b..64ced0b8f8f 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -208,12 +209,11 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21b4dd..a22facf06f0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -466,11 +467,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc8..e51393dd93a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -563,17 +564,16 @@ auditsys: jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? 
*/ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb..8b021875877 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0..af17e1c966d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b1621747..5ef9344a8b2 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -3,3 +3,8 @@ #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} diff --git a/include/linux/audit.h b/include/linux/audit.h index 6e1c533f9b4..3d65e4b3ba0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,6 +26,7 @@ #include #include +#include /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system @@ -408,10 +409,6 @@ struct audit_field { void *lsm_rule; }; -#define AUDITSC_INVALID 0 -#define AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 -#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); @@ -424,7 +421,7 @@ extern void audit_free(struct task_struct *task); extern void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); -extern void audit_syscall_exit(int failed, long return_code); +extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); @@ -438,6 +435,15 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_syscall_exit(void *pt_regs) +{ + if (unlikely(current->audit_context)) { + int success = is_syscall_success(pt_regs); + int return_code = regs_return_value(pt_regs); + + __audit_syscall_exit(success, return_code); + } +} static inline void audit_getname(const char *name) { if (unlikely(!audit_dummy_context())) @@ -551,12 +557,12 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int 
audit_signals; -#else +#else /* CONFIG_AUDITSYSCALL */ #define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(f,r) do { ; } while (0) +#define audit_syscall_exit(r) do { ; } while (0) #define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) @@ -587,7 +593,7 @@ extern int audit_signals; #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -#endif +#endif /* CONFIG_AUDITSYSCALL */ #ifdef CONFIG_AUDIT /* These are defined in audit.c */ diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 800f113bea6..dd4cefa6519 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -112,6 +112,7 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ +#include /* for IS_ERR_VALUE */ extern long arch_ptrace(struct task_struct *child, long request, @@ -265,6 +266,15 @@ static inline void ptrace_release_task(struct task_struct *task) #define force_successful_syscall_return() do { } while (0) #endif +#ifndef is_syscall_success +/* + * On most systems we can tell if a syscall is a success based on if the retval + * is an error value. On some systems like ia64 and powerpc they have different + * indicators of success/failure and must define their own. + */ +#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) +#endif + /* * should define the following things inside #ifdef __KERNEL__. * diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e9bcb93800d..3d285380818 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,6 +70,11 @@ #include "audit.h" +/* flags stating the success for a syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child) /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @pt_regs: syscall registers * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child) * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; + context = audit_get_context(tsk, success, return_code); if (likely(!context)) return; -- cgit v1.2.3-70-g09d2 From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: inline audit_syscall_entry to reduce burden on archs Every arch calls: if (unlikely(current->audit_context)) audit_syscall_entry() which requires knowledge about audit (the existence of audit_context) in the arch code.
Just do it all in static inline in audit.h so that arch's can remain blissfully ignorant. Signed-off-by: Eric Paris --- arch/ia64/kernel/ptrace.c | 9 +-------- arch/microblaze/kernel/ptrace.c | 6 ++---- arch/mips/kernel/ptrace.c | 7 +++---- arch/powerpc/kernel/ptrace.c | 26 ++++++++++++-------------- arch/s390/kernel/ptrace.c | 11 +++++------ arch/sh/kernel/ptrace_32.c | 7 +++---- arch/sh/kernel/ptrace_64.c | 7 +++---- arch/sparc/kernel/ptrace_64.c | 17 ++++++++--------- arch/um/kernel/ptrace.c | 20 +++++++++----------- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/ptrace.c | 22 ++++++++++------------ arch/xtensa/kernel/ptrace.c | 3 +-- include/linux/audit.h | 13 ++++++++++--- kernel/auditsc.c | 2 +- 16 files changed, 72 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 2c154088cce..dad91661ddf 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); - if (unlikely(current->audit_context)) { - long syscall; - int arch; - syscall = regs.r15; - arch = AUDIT_ARCH_IA64; - - audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); - } + audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); return 0; } diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index f564b1bfd38..6eb2aa927d8 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; - if (unlikely(current->audit_context)) - audit_syscall_entry(EM_MICROBLAZE, regs->r12, - regs->r5, regs->r6, - regs->r7, regs->r8); + audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, + regs->r7, regs->r8); return ret ?: regs->r12; } diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index ab0f1963a7b..7786b608d93 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) } out: - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[2], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[2], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); } /* diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 09d31c12a5e..5b43325402b 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); - if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(AUDIT_ARCH_PPC64, - regs->gpr[0], - regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, + regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else #endif - audit_syscall_entry(AUDIT_ARCH_PPC, - regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); - } + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + 
regs->gpr[6] & 0xffffffff); return ret ?: regs->gpr[0]; } diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index f5275860098..9d82ed4bcb2 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -740,12 +740,11 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - if (unlikely(current->audit_context)) - audit_syscall_entry(is_compat_task() ? - AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); + audit_syscall_entry(is_compat_task() ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); return ret ?: regs->gprs[2]; } diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index c0b5c179d27..a3e65156376 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[0]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[3], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[3], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); return ret ?: regs->regs[0]; } diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index ba720d68643..3d0080b5c97 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[9]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[1], - regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); + audit_syscall_entry(audit_arch(), regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); return ret ?: regs->regs[9]; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index c73c8c50f11..9388844cd88 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1071,15 +1071,14 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); - if (unlikely(current->audit_context) && !ret) - audit_syscall_entry((test_thread_flag(TIF_32BIT) ? - AUDIT_ARCH_SPARC : - AUDIT_ARCH_SPARC64), - regs->u_regs[UREG_G1], - regs->u_regs[UREG_I0], - regs->u_regs[UREG_I1], - regs->u_regs[UREG_I2], - regs->u_regs[UREG_I3]); + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? 
+ AUDIT_ARCH_SPARC : + AUDIT_ARCH_SPARC64), + regs->u_regs[UREG_G1], + regs->u_regs[UREG_I0], + regs->u_regs[UREG_I1], + regs->u_regs[UREG_I2], + regs->u_regs[UREG_I3]); return ret; } diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 2ccf25c42fe..06b19039050 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; int tracesysgood; - if (unlikely(current->audit_context)) { - if (!entryexit) - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(regs), - UPT_SYSCALL_ARG1(regs), - UPT_SYSCALL_ARG2(regs), - UPT_SYSCALL_ARG3(regs), - UPT_SYSCALL_ARG4(regs)); - else - audit_syscall_exit(regs); - } + if (!entryexit) + audit_syscall_entry(HOST_AUDIT_ARCH, + UPT_SYSCALL_NR(regs), + UPT_SYSCALL_ARG1(regs), + UPT_SYSCALL_ARG2(regs), + UPT_SYSCALL_ARG3(regs), + UPT_SYSCALL_ARG4(regs)); + else + audit_syscall_exit(regs); /* Fake a debug trap */ if (is_singlestep) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 025f0f01d25..cecfd9a8f73 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -192,7 +192,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a22facf06f0..1ccd742eba1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -456,7 +456,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e51393dd93a..1ca66b65012 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -549,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. 
*/ auditsys: @@ -559,7 +559,7 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8b021875877..50267386b76 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a0d042aa296..2dff698ab02 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); #if 0 - if (unlikely(current->audit_context)) - audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); + audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); #endif } diff --git a/include/linux/audit.h b/include/linux/audit.h index 3d65e4b3ba0..f56ce2669b8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -418,9 +418,9 @@ extern int audit_classify_arch(int arch); extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); extern void audit_free(struct task_struct *task); -extern void audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3); +extern void __audit_syscall_entry(int arch, + int major, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); @@ -435,6 +435,13 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_syscall_entry(int arch, int major, unsigned long a0, + unsigned long a1, unsigned long a2, + unsigned long a3) +{ + if (unlikely(!audit_dummy_context())) + __audit_syscall_entry(arch, major, a0, a1, a2, a3); +} static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(current->audit_context)) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3d285380818..b408100dd6e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). 
*/ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { -- cgit v1.2.3-70-g09d2 From 997f5b6444f4608692ec807fb802fd9767c80e76 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: remove AUDIT_SETUP_CONTEXT as it isn't used Audit contexts have 3 states. Disabled, which doesn't collect anything, build, which collects info but might not emit it, and record, which collects and emits. There is a 4th state, setup, which isn't used. Get rid of it. Signed-off-by: Eric Paris --- kernel/audit.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2..81676680337 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -36,12 +36,8 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall + * and fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it -- cgit v1.2.3-70-g09d2 From 56179a6ec65a56e0279a58e35cb450d38f061b94 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: drop some potentially inadvisable likely notations The audit code makes heavy use of likely() and unlikely() macros, but they don't always make sense. Drop any that seem questionable and let the computer do its thing. Signed-off-by: Eric Paris --- kernel/auditsc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b408100dd6e..d7382c2aaa9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -805,7 +805,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (likely(!context)) + if (!context) return NULL; context->return_valid = return_valid; @@ -928,7 +928,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing.
*/ state = audit_filter_task(tsk, &key); - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return 0; if (!(context = audit_alloc_context(state))) { @@ -1599,7 +1599,7 @@ void audit_free(struct task_struct *tsk) struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (likely(!context)) + if (!context) return; /* Check for system calls that do not go through the exit @@ -1640,7 +1640,7 @@ void __audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (unlikely(!context)) + if (!context) return; /* @@ -1697,7 +1697,7 @@ void __audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return; context->serial = 0; @@ -1748,7 +1748,7 @@ void __audit_syscall_exit(int success, long return_code) success = AUDITSC_FAILURE; context = audit_get_context(tsk, success, return_code); - if (likely(!context)) + if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) -- cgit v1.2.3-70-g09d2 From 07c49417877f8658a6aa0ad9b4e21e4fd4df11b6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline checks for not needing to collect aux records A number of audit hooks make function calls before they determine that auxiliary records do not need to be collected. Do those checks as static inlines since the most common case is going to be that records are not needed and we can skip the function call overhead. Signed-off-by: Eric Paris --- include/linux/audit.h | 23 ++++++++++++++++++++--- kernel/auditsc.c | 15 +++------------- 2 files changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index f56ce2669b8..cf16faff6b8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -489,9 +489,9 @@ extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); extern void audit_log_task_context(struct audit_buffer *ab); extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int audit_bprm(struct linux_binprm *bprm); -extern void audit_socketcall(int nargs, unsigned long *args); -extern int audit_sockaddr(int len, void *addr); +extern int __audit_bprm(struct linux_binprm *bprm); +extern void __audit_socketcall(int nargs, unsigned long *args); +extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern int audit_set_macxattr(const char *name); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); @@ -519,6 +519,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } +static inline int audit_bprm(struct linux_binprm *bprm) +{ + if (unlikely(!audit_dummy_context())) + return __audit_bprm(bprm); + return 0; +} +static inline void audit_socketcall(int nargs, unsigned long *args) +{ + if (unlikely(!audit_dummy_context())) + __audit_socketcall(nargs, args); +} +static inline int audit_sockaddr(int len, void *addr) +{ + if (unlikely(!audit_dummy_context())) + return __audit_sockaddr(len, addr); + return 0; +} static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context()))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d7382c2aaa9..e1062f66b01 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2309,14 +2309,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2337,13 +2334,10 @@ int audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void audit_socketcall(int nargs, unsigned long *args) +void __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return; - context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2369,13 +2363,10 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_sockaddr(int len, void *a) +int __audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return 0; - if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) -- cgit v1.2.3-70-g09d2 From a4ff8dba7d8ce5ceb43fb27df66292251cc73bdc Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline audit_free to simplify the look of generic code make the conditional a static inline instead of doing it in generic code. Signed-off-by: Eric Paris --- include/linux/audit.h | 7 ++++++- kernel/auditsc.c | 2 +- kernel/exit.c | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 4f1efe3e861..8eb8bda749b 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -417,7 +417,7 @@ extern int audit_classify_arch(int arch); /* Public API */ extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); -extern void audit_free(struct task_struct *task); +extern void __audit_free(struct task_struct *task); extern void __audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); @@ -435,6 +435,11 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_free(struct task_struct *task) +{ + if (unlikely(task->audit_context)) + __audit_free(task); +} static inline void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e1062f66b01..7aaeb38b262 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1594,7 +1594,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void audit_free(struct task_struct *tsk) +void __audit_free(struct task_struct *tsk) { struct audit_context *context; diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb5..88dcbbc446f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -964,8 +964,7 @@ NORET_TYPE void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if 
(unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); -- cgit v1.2.3-70-g09d2 From 7ff68e53ece8c175d2951bb8a30b3cce8f9c5579 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: reject entry,always rules We deprecated entry,always rules a long time ago. Reject those rules as invalid. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d94dde82c3c..903caa269b5 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: + if (rule->action == AUDIT_ALWAYS) + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { -- cgit v1.2.3-70-g09d2 From 6422e78de6880c66a82af512d9bd0c85eb62e661 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: remove audit_finish_fork as it can't be called Audit entry,always rules are not allowed and are automatically changed into exit,always rules in userspace. The kernel refuses to load such rules. Thus a task in the middle of a syscall (and thus in audit_finish_fork()) can only be in one of two states: AUDIT_BUILD_CONTEXT or AUDIT_DISABLED. Since the current task cannot be in AUDIT_RECORD_CONTEXT we aren't ever going to actually use the code in audit_finish_fork() since it will return without doing anything. Thus drop the code.
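The dead branch is easy to convince yourself of with a stand-alone C sketch of the guard that used to sit at the top of audit_finish_fork() (toy names only, not the kernel code): with entry,always rejected, neither reachable state ever triggers the copy.

#include <stdio.h>

enum toy_audit_state { TOY_DISABLED, TOY_BUILD_CONTEXT, TOY_RECORD_CONTEXT };

/* same shape as the removed guard: the copy only ran in RECORD_CONTEXT */
static int toy_finish_fork_copies(enum toy_audit_state state, int in_syscall)
{
    if (!in_syscall || state != TOY_RECORD_CONTEXT)
        return 0;   /* nothing copied */
    return 1;       /* would have copied the parent's context */
}

int main(void)
{
    /* a task in the middle of a syscall can only be in these two states */
    printf("%d %d\n", toy_finish_fork_copies(TOY_DISABLED, 1),
           toy_finish_fork_copies(TOY_BUILD_CONTEXT, 1)); /* prints: 0 0 */
    return 0;
}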
Signed-off-by: Eric Paris --- include/linux/audit.h | 2 -- kernel/auditsc.c | 20 -------------------- kernel/fork.c | 2 -- 3 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8eb8bda749b..67b66c37a25 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -415,7 +415,6 @@ extern int audit_classify_arch(int arch); #ifdef CONFIG_AUDITSYSCALL /* These are defined in auditsc.c */ /* Public API */ -extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_syscall_entry(int arch, @@ -586,7 +585,6 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ -#define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7aaeb38b262..4d8920f5ab8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1707,26 +1707,6 @@ void __audit_syscall_entry(int arch, int major, context->ppid = 0; } -void audit_finish_fork(struct task_struct *child) -{ - struct audit_context *ctx = current->audit_context; - struct audit_context *p = child->audit_context; - if (!p || !ctx) - return; - if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) - return; - p->arch = ctx->arch; - p->major = ctx->major; - memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); - p->ctime = ctx->ctime; - p->dummy = ctx->dummy; - p->in_syscall = ctx->in_syscall; - p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); - p->ppid = current->pid; - p->prio = ctx->prio; - p->current_state = ctx->current_state; -} - /** * audit_syscall_exit - deallocate audit context after a system call * @pt_regs: syscall registers diff --git a/kernel/fork.c b/kernel/fork.c index 443f5125f11..c1e5c21f48c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1525,8 +1525,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - audit_finish_fork(p); - /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that -- cgit v1.2.3-70-g09d2 From efaffd6e4417860c67576ac760dd6e8bbd15f006 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: allow matching on obj_uid Allow syscall exit filter matching based on the uid of the owner of an inode used in a syscall. 
aka: auditctl -a always,exit -S open -F obj_uid=0 -F perm=wa Signed-off-by: Eric Paris --- include/linux/audit.h | 1 + kernel/auditfilter.c | 1 + kernel/auditsc.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 67b66c37a25..55cb3daaf47 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -223,6 +223,7 @@ #define AUDIT_PERM 106 #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 +#define AUDIT_OBJ_UID 109 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 903caa269b5..13e997423dc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -461,6 +461,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: + case AUDIT_OBJ_UID: break; case AUDIT_ARCH: entry->rule.arch_f = f; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4d8920f5ab8..5cf3ecc0151 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -586,6 +586,18 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_OBJ_UID: + if (name) { + result = audit_comparator(name->uid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->uid, f->op, f->val)) { + ++result; + break; + } + } + } + break; case AUDIT_WATCH: if (name) result = audit_watch_compare(rule->watch, name->ino, name->dev); -- cgit v1.2.3-70-g09d2 From 54d3218b31aee5bc9c859ae60fbde933d922448b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: allow audit matching on inode gid Much like the ability to filter audit on the uid of an inode collected, we should be able to filter on the gid of the inode. Signed-off-by: Eric Paris --- include/linux/audit.h | 1 + kernel/auditfilter.c | 1 + kernel/auditsc.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 55cb3daaf47..e36aa37c88a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -224,6 +224,7 @@ #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 #define AUDIT_OBJ_UID 109 +#define AUDIT_OBJ_GID 110 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 13e997423dc..f10605c787e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -462,6 +462,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG2: case AUDIT_ARG3: case AUDIT_OBJ_UID: + case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5cf3ecc0151..87b375fb12f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -598,6 +598,18 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_OBJ_GID: + if (name) { + result = audit_comparator(name->gid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->gid, f->op, f->val)) { + ++result; + break; + } + } + } + break; case AUDIT_WATCH: if (name) result = audit_watch_compare(rule->watch, name->ino, name->dev); -- cgit v1.2.3-70-g09d2 From 0a300be6d5be8f66cd96609334710c268d0bfdce Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: remove task argument to audit_set_loginuid The function always deals with current. 
Don't expose an option pretending one can use it for something. You can't. Signed-off-by: Eric Paris --- fs/proc/base.c | 2 +- include/linux/audit.h | 2 +- kernel/auditsc.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb..e3cbebbabeb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1228,7 +1228,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(current, loginuid); + length = audit_set_loginuid(loginuid); if (likely(length == 0)) length = count; diff --git a/include/linux/audit.h b/include/linux/audit.h index e36aa37c88a..7cbd6fe4157 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -489,7 +489,7 @@ static inline void audit_ptrace(struct task_struct *t) extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); -extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); +extern int audit_set_loginuid(uid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) #define audit_get_sessionid(t) ((t)->sessionid) extern void audit_log_task_context(struct audit_buffer *ab); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 87b375fb12f..9d6dd7d869c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2163,16 +2163,16 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified + * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) +int audit_set_loginuid(uid_t loginuid) { + struct task_struct *task = current; unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; -- cgit v1.2.3-70-g09d2 From 633b45454503489209b0d9a45f9e3cd1b852c614 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: only allow tasks to set their loginuid if it is -1 At the moment we allow tasks to set their loginuid if they have CAP_AUDIT_CONTROL. In reality we want tasks to set the loginuid when they log in and have it be impossible to ever reset. We had to make it mutable even after it was once set (with the CAP) because on update an admin might have to restart sshd. Now sshd would get his loginuid and the next user which logged in using ssh would not be able to set his loginuid. Systemd has changed how userspace works and allowed us to make the kernel work the way it should. With systemd users (even admins) are not supposed to restart services directly. The system will restart the service for them. Thus since systemd is going to loginuid==-1, sshd would get -1, and sshd would be allowed to set a new loginuid without special permissions. If an admin in this system were to manually start an sshd he is inserting himself into the system chain of trust and thus, logically, it's his loginuid that should be used! Since we have old systems I make this a Kconfig option.
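A minimal userspace model of the policy split the option introduces (illustrative names only; the real code checks task->loginuid against -1 or falls back to capable(CAP_AUDIT_CONTROL)):

#include <stdio.h>

#define TOY_UNSET ((unsigned int)-1)   /* the -1 "never set" marker */

static unsigned int toy_loginuid = TOY_UNSET;

/* immutable flavor: writable exactly once, then locked for good */
static int toy_set_loginuid(unsigned int loginuid)
{
    if (toy_loginuid != TOY_UNSET)
        return -1;                     /* stands in for -EPERM */
    toy_loginuid = loginuid;
    return 0;
}

int main(void)
{
    printf("login: %d\n", toy_set_loginuid(500)); /* 0, first write wins */
    printf("reset: %d\n", toy_set_loginuid(0));   /* -1, refused */
    return 0;
}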
Signed-off-by: Eric Paris --- fs/proc/base.c | 3 --- init/Kconfig | 14 ++++++++++++++ kernel/auditsc.c | 11 ++++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index e3cbebbabeb..482df23036b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1197,9 +1197,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); diff --git a/init/Kconfig b/init/Kconfig index a075765d5fb..5ad8b775f2a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -372,6 +372,20 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY +config AUDIT_LOGINUID_IMMUTABLE + bool "Make audit loginuid immutable" + depends on AUDIT + help + The config option toggles if a task setting its loginuid requires + CAP_AUDIT_CONTROL or if that task should require no special permissions + but should instead only allow setting its loginuid if it was never + previously set. On systems which use systemd or a similar central + process to restart login services this should be set to true. On older + systems in which an admin would typically have to directly stop and + start processes this should be set to false. Setting this to true allows + one to drop potentially dangerous capabilities from the login tasks, + but may not be backwards compatible with older init systems. + source "kernel/irq/Kconfig" menu "RCU Subsystem" diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9d6dd7d869c..bd084a13c71 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2173,9 +2173,18 @@ int audit_set_loginuid(uid_t loginuid) { struct task_struct *task = current; - unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; + unsigned int sessionid; + +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (task->loginuid != -1) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; -- cgit v1.2.3-70-g09d2 From 4043cde8ecf7f7d880eb1133c201a3d392fd68c3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: do not call audit_getname on error Just a code cleanup really. We don't need to make a function call just for it to return on error. This also makes the VFS function even easier to follow and removes a conditional on a hot path.
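The shape of the cleanup shows up well in a small runnable analogue of getname_flags() (toy names, not the VFS code): the audit hook moves onto the success path, so the callee no longer needs an IS_ERR()/NULL check of its own.

#include <stdio.h>
#include <stdlib.h>

static void toy_audit_getname(const char *name)
{
    printf("audit: %s\n", name);       /* may assume a valid name */
}

static char *toy_getname(const char *user)
{
    char *result = malloc(32);

    if (!result)
        return NULL;                   /* -ENOMEM in the real code */
    if (!user || !*user) {             /* models do_getname() failing */
        free(result);
        return NULL;                   /* error path: audit never called */
    }
    snprintf(result, 32, "%s", user);
    toy_audit_getname(result);         /* success path only */
    return result;
}

int main(void)
{
    free(toy_getname("/etc/passwd"));  /* hook fires */
    toy_getname("");                   /* fails before the hook */
    return 0;
}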
Signed-off-by: Eric Paris --- fs/namei.c | 28 +++++++++++++--------------- kernel/auditsc.c | 3 --- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index c283a1ec008..208c6aa4a98 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - int retval = do_getname(filename, tmp); - - result = tmp; - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(tmp); - result = ERR_PTR(retval); - } + char *result = __getname(); + int retval; + + if (!result) + return ERR_PTR(-ENOMEM); + + retval = do_getname(filename, result); + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(retval); } } audit_getname(result); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bd084a13c71..9161e70a437 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1913,9 +1913,6 @@ void __audit_getname(const char *name) struct audit_context *context = current->audit_context; struct audit_names *n; - if (IS_ERR(name) || !name) - return; - if (!context->in_syscall) { #if AUDIT_DEBUG == 2 printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", -- cgit v1.2.3-70-g09d2 From 02d86a568c6d2d335256864451ac8ce781bc5652 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: allow interfield comparison in audit rules We wish to be able to audit when a uid=500 task accesses a file which is uid=0. Or vice versa. This patch introduces a new audit filter type AUDIT_FIELD_COMPARE which takes as an 'enum' which indicates which fields should be compared. At this point we only define the task->uid vs inode->uid, but other comparisons can be added. Signed-off-by: Eric Paris --- include/linux/audit.h | 4 ++++ kernel/auditfilter.c | 5 ++++- kernel/auditsc.c | 30 +++++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 7cbd6fe4157..838e05fc058 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -182,7 +182,10 @@ * AUDIT_UNUSED_BITS is updated if need be. 
*/ #define AUDIT_UNUSED_BITS 0x07FFFC00 +/* AUDIT_FIELD_COMPARE rule list */ +#define AUDIT_COMPARE_UID_TO_OBJ_UID 1 +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_UID_TO_OBJ_UID /* Rule fields */ /* These are useful when checking the * task structure at task creation time @@ -225,6 +228,7 @@ #define AUDIT_FILETYPE 108 #define AUDIT_OBJ_UID 109 #define AUDIT_OBJ_GID 110 +#define AUDIT_FIELD_COMPARE 111 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f10605c787e..a6c3f1abd20 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -526,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: - err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -543,6 +542,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (f->val & ~S_IFMT) goto exit_free; break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) + goto exit_free; + break; default: goto exit_free; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9161e70a437..8fb2c8e6d62 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -463,6 +463,32 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_field_compare(struct task_struct *tsk, + const struct cred *cred, + struct audit_field *f, + struct audit_context *ctx, + struct audit_names *name) +{ + struct audit_names *n; + + switch (f->val) { + case AUDIT_COMPARE_UID_TO_OBJ_UID: + if (name) { + return audit_comparator(cred->uid, f->op, name->uid); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(cred->uid, f->op, n->uid)) + return 1; + } + } + break; + default: + WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); + return 0; + } + return 0; +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
@@ -693,8 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; + case AUDIT_FIELD_COMPARE: + result = audit_field_compare(tsk, cred, f, ctx, name); + break; } - if (!result) return 0; } -- cgit v1.2.3-70-g09d2 From b34b039324bf081554ee8678f9b8c5d937e5206c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: complex interfield comparison helper Rather than code the same loop over and over implement a helper function which uses some pointer magic to make it generic enough to be used numerous places as we implement more audit interfield comparisons Signed-off-by: Eric Paris --- kernel/auditsc.c | 50 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8fb2c8e6d62..b12cc32fe37 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -463,25 +463,53 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_compare_id(uid_t uid1, + struct audit_names *name, + unsigned long name_offset, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + unsigned long addr; + uid_t uid2; + int rc; + + if (name) { + addr = (unsigned long)name; + addr += name_offset; + + uid2 = *(uid_t *)addr; + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + addr = (unsigned long)n; + addr += name_offset; + + uid2 = *(uid_t *)addr; + + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + } + return 0; +} + static int audit_field_compare(struct task_struct *tsk, const struct cred *cred, struct audit_field *f, struct audit_context *ctx, struct audit_names *name) { - struct audit_names *n; - switch (f->val) { case AUDIT_COMPARE_UID_TO_OBJ_UID: - if (name) { - return audit_comparator(cred->uid, f->op, name->uid); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(cred->uid, f->op, n->uid)) - return 1; - } - } - break; + return audit_compare_id(cred->uid, + name, offsetof(struct audit_names, uid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From c9fe685f7a17a0ee8bf3fbe51e40b1c8b8e65896 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: allow interfield comparison between gid and ogid Allow audit rules to compare the gid of the running task to the gid of the inode in question. 
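In the spirit of the obj_uid example above, such a rule might read: auditctl -a always,exit -S open -C gid!=obj_gid (the -C interfield syntax is assumed here and needs a new enough auditctl). On the kernel side the match reduces to the usual comparator contract, sketched stand-alone below (toy names, not the kernel code):

#include <stdio.h>

enum toy_op { TOY_EQ, TOY_NE };

/* same contract as audit_comparator(left, op, right): 1 on match, else 0 */
static int toy_comparator(unsigned int left, enum toy_op op, unsigned int right)
{
    switch (op) {
    case TOY_EQ:
        return left == right;
    case TOY_NE:
        return left != right;
    }
    return 0;
}

int main(void)
{
    unsigned int task_gid = 100, inode_gid = 0;

    /* AUDIT_COMPARE_GID_TO_OBJ_GID with a != op matches here */
    printf("match=%d\n", toy_comparator(task_gid, TOY_NE, inode_gid));
    return 0;
}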
Signed-off-by: Eric Paris --- include/linux/audit.h | 3 ++- kernel/auditsc.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 838e05fc058..fffbc2176ee 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -184,8 +184,9 @@ /* AUDIT_FIELD_COMPARE rule list */ #define AUDIT_COMPARE_UID_TO_OBJ_UID 1 +#define AUDIT_COMPARE_GID_TO_OBJ_GID 2 -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_UID_TO_OBJ_UID +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_GID_TO_OBJ_GID /* Rule fields */ /* These are useful when checking the * task structure at task creation time diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b12cc32fe37..861c7b9c565 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -474,6 +474,8 @@ static int audit_compare_id(uid_t uid1, uid_t uid2; int rc; + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + if (name) { addr = (unsigned long)name; addr += name_offset; @@ -510,6 +512,10 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->uid, name, offsetof(struct audit_names, uid), f, ctx); + case AUDIT_COMPARE_GID_TO_OBJ_GID: + return audit_compare_id(cred->gid, + name, offsetof(struct audit_names, gid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From 4a6633ed08af5ba67790b4d1adcdeb8ceb55677e Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Tue, 13 Dec 2011 16:17:51 -0800 Subject: audit: implement all object interfield comparisons This completes the matrix of interfield comparisons between uid/gid information for the current task and the uid/gid information for inodes. aka I can audit based on differences between the euid of the process and the uid of fs objects. 
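All of these object comparisons funnel through the one offset-taking helper added earlier in the series; a stand-alone fragment (toy struct, not the kernel's audit_names) mimics the pointer arithmetic:

#include <stddef.h>
#include <stdio.h>

/* toy stand-in for the audit_names fields the helper walks */
struct toy_names {
    unsigned int uid;
    unsigned int gid;
};

/* reads a uid-sized field at a given offset, as audit_compare_id() does */
static unsigned int field_at(const struct toy_names *n, size_t off)
{
    return *(const unsigned int *)((const char *)n + off);
}

int main(void)
{
    struct toy_names n = { .uid = 500, .gid = 100 };

    printf("uid=%u gid=%u\n",
           field_at(&n, offsetof(struct toy_names, uid)),
           field_at(&n, offsetof(struct toy_names, gid)));
    return 0;
}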
Signed-off-by: Peter Moody Signed-off-by: Eric Paris --- include/linux/audit.h | 10 +++++++++- kernel/auditsc.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index fffbc2176ee..67113cb4bc1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -185,8 +185,16 @@ /* AUDIT_FIELD_COMPARE rule list */ #define AUDIT_COMPARE_UID_TO_OBJ_UID 1 #define AUDIT_COMPARE_GID_TO_OBJ_GID 2 +#define AUDIT_COMPARE_EUID_TO_OBJ_UID 3 +#define AUDIT_COMPARE_EGID_TO_OBJ_GID 4 +#define AUDIT_COMPARE_AUID_TO_OBJ_UID 5 +#define AUDIT_COMPARE_SUID_TO_OBJ_UID 6 +#define AUDIT_COMPARE_SGID_TO_OBJ_GID 7 +#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 +#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 + +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_FSGID_TO_OBJ_GID -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_GID_TO_OBJ_GID /* Rule fields */ /* These are useful when checking the * task structure at task creation time diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 861c7b9c565..b8cee462b99 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -508,6 +508,7 @@ static int audit_field_compare(struct task_struct *tsk, struct audit_names *name) { switch (f->val) { + /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: return audit_compare_id(cred->uid, name, offsetof(struct audit_names, uid), @@ -516,6 +517,34 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->gid, name, offsetof(struct audit_names, gid), f, ctx); + case AUDIT_COMPARE_EUID_TO_OBJ_UID: + return audit_compare_id(cred->euid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_EGID_TO_OBJ_GID: + return audit_compare_id(cred->egid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_AUID_TO_OBJ_UID: + return audit_compare_id(tsk->loginuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SUID_TO_OBJ_UID: + return audit_compare_id(cred->suid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SGID_TO_OBJ_GID: + return audit_compare_id(cred->sgid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_FSUID_TO_OBJ_UID: + return audit_compare_id(cred->fsuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_FSGID_TO_OBJ_GID: + return audit_compare_id(cred->fsgid, + name, offsetof(struct audit_names, gid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From 10d68360871657204885371cdf2594412675d2f9 Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Wed, 4 Jan 2012 15:24:31 -0500 Subject: audit: comparison on interprocess fields This allows audit to specify rules in which we compare two fields of a process. Such as is the running process uid != to the running process euid? 
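For instance, AUDIT_COMPARE_UID_TO_EUID with a "!=" operator flags the classic setuid situation. A runnable userspace analogue of that one check (illustration only):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
    uid_t uid = getuid(), euid = geteuid();

    /* the rule matches exactly when these differ, e.g. while a
     * setuid binary is running */
    printf("uid=%u euid=%u differ=%d\n",
           (unsigned int)uid, (unsigned int)euid, uid != euid);
    return 0;
}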
Signed-off-by: Peter Moody Signed-off-by: Eric Paris --- include/linux/audit.h | 24 +++++++++++++++++++++++- kernel/auditsc.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 67113cb4bc1..9ff7a2c48b5 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -193,7 +193,29 @@ #define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 #define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_FSGID_TO_OBJ_GID +#define AUDIT_COMPARE_UID_TO_AUID 10 +#define AUDIT_COMPARE_UID_TO_EUID 11 +#define AUDIT_COMPARE_UID_TO_FSUID 12 +#define AUDIT_COMPARE_UID_TO_SUID 13 + +#define AUDIT_COMPARE_AUID_TO_FSUID 14 +#define AUDIT_COMPARE_AUID_TO_SUID 15 +#define AUDIT_COMPARE_AUID_TO_EUID 16 + +#define AUDIT_COMPARE_EUID_TO_SUID 17 +#define AUDIT_COMPARE_EUID_TO_FSUID 18 + +#define AUDIT_COMPARE_SUID_TO_FSUID 19 + +#define AUDIT_COMPARE_GID_TO_EGID 20 +#define AUDIT_COMPARE_GID_TO_FSGID 21 +#define AUDIT_COMPARE_GID_TO_SGID 22 + +#define AUDIT_COMPARE_EGID_TO_FSGID 23 +#define AUDIT_COMPARE_EGID_TO_SGID 24 +#define AUDIT_COMPARE_SGID_TO_FSGID 25 + +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_SGID_TO_FSGID /* Rule fields */ /* These are useful when checking the diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b8cee462b99..593237e3654 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -545,6 +545,45 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->fsgid, name, offsetof(struct audit_names, gid), f, ctx); + /* uid comparisons */ + case AUDIT_COMPARE_UID_TO_AUID: + return audit_comparator(cred->uid, f->op, tsk->loginuid); + case AUDIT_COMPARE_UID_TO_EUID: + return audit_comparator(cred->uid, f->op, cred->euid); + case AUDIT_COMPARE_UID_TO_SUID: + return audit_comparator(cred->uid, f->op, cred->suid); + case AUDIT_COMPARE_UID_TO_FSUID: + return audit_comparator(cred->uid, f->op, cred->fsuid); + /* auid comparisons */ + case AUDIT_COMPARE_AUID_TO_EUID: + return audit_comparator(tsk->loginuid, f->op, cred->euid); + case AUDIT_COMPARE_AUID_TO_SUID: + return audit_comparator(tsk->loginuid, f->op, cred->suid); + case AUDIT_COMPARE_AUID_TO_FSUID: + return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + /* euid comparisons */ + case AUDIT_COMPARE_EUID_TO_SUID: + return audit_comparator(cred->euid, f->op, cred->suid); + case AUDIT_COMPARE_EUID_TO_FSUID: + return audit_comparator(cred->euid, f->op, cred->fsuid); + /* suid comparisons */ + case AUDIT_COMPARE_SUID_TO_FSUID: + return audit_comparator(cred->suid, f->op, cred->fsuid); + /* gid comparisons */ + case AUDIT_COMPARE_GID_TO_EGID: + return audit_comparator(cred->gid, f->op, cred->egid); + case AUDIT_COMPARE_GID_TO_SGID: + return audit_comparator(cred->gid, f->op, cred->sgid); + case AUDIT_COMPARE_GID_TO_FSGID: + return audit_comparator(cred->gid, f->op, cred->fsgid); + /* egid comparisons */ + case AUDIT_COMPARE_EGID_TO_SGID: + return audit_comparator(cred->egid, f->op, cred->sgid); + case AUDIT_COMPARE_EGID_TO_FSGID: + return audit_comparator(cred->egid, f->op, cred->fsgid); + /* sgid comparison */ + case AUDIT_COMPARE_SGID_TO_FSGID: + return audit_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. 
Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From 5afb8a3f96573f7ea018abb768f5b6ebe1a6c1a4 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 20 Dec 2011 18:39:41 -0500 Subject: audit: fix signedness bug in audit_log_execve_info() In the loop, a size_t "len" is used to hold the return value of audit_log_single_execve_arg(), which returns -1 on error. In that case the error handling (len <= 0) will be bypassed since "len" is unsigned, and the loop continues with (p += len) being wrapped. Change the type of "len" to signed int to fix the error handling. size_t len; ... for (...) { len = audit_log_single_execve_arg(...); if (len <= 0) break; p += len; } Signed-off-by: Xi Wang Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 593237e3654..86584ecb103 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1362,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i; - size_t len, len_sent = 0; + int i, len; + size_t len_sent = 0; const char __user *p; char *buf; -- cgit v1.2.3-70-g09d2 From c158a35c8a681cf68d36f22f058f9f5466386c71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jan 2012 14:07:10 -0800 Subject: audit: no leading space in audit_log_d_path prefix audit_log_d_path() injects an additional space before the prefix, which serves no purpose and doesn't mix well with other audit_log*() functions that do not sneak extra characters into the log. Signed-off-by: Kees Cook Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- kernel/auditsc.c | 6 +++--- security/lsm_audit.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 00efe4758c8..705c25a70bf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, " %s", prefix); + audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 86584ecb103..caaea6e944f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1171,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, "exe=", + audit_log_d_path(ab, " exe=", &vma->vm_file->f_path); break; } @@ -1540,7 +1540,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1725,7 +1725,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_d_path(ab, " cwd=", &context->pwd); audit_log_end(ab); } } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index fc41b7cccb5..293b8c45b1d 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -232,7 +232,7 @@ static void dump_common_audit_data(struct audit_buffer 
*ab, case LSM_AUDIT_DATA_PATH: { struct inode *inode; - audit_log_d_path(ab, "path=", &a->u.path); + audit_log_d_path(ab, " path=", &a->u.path); inode = a->u.path.dentry->d_inode; if (inode) { @@ -318,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, .dentry = u->dentry, .mnt = u->mnt }; - audit_log_d_path(ab, "path=", &path); + audit_log_d_path(ab, " path=", &path); break; } if (!u->addr) -- cgit v1.2.3-70-g09d2 From 160cb5a97daef0cb894685d84c9d4700bb7cccb4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 19 Jan 2012 23:23:10 +0100 Subject: PM / Hibernate: Correct additional pages number calculation The struct bm_block is allocated by chain_alloc(), so it should be counted in LINKED_PAGE_DATA_SIZE units rather than PAGE_SIZE. Signed-off-by: Namhyung Kim Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1cf88900ec4..6a768e53700 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); return 2 * res; } -- cgit v1.2.3-70-g09d2 From fd45c15f13e754f3c106427e857310f3e0813951 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2012 10:12:45 +0900 Subject: perf: Don't call release_callchain_buffers() if allocation fails When alloc_callchain_buffers() fails, it frees all of the entries before returning. In addition, calling release_callchain_buffers() afterwards would cause a NULL pointer dereference, since callchain_cpu_entries is not set. Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1327021966-27688-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 057e24b665c..6581a040f39 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -115,8 +115,6 @@ int get_callchain_buffers(void) } err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); exit: mutex_unlock(&callchain_mutex); -- cgit v1.2.3-70-g09d2 From 46cd6a7f680d14f6f80ede9f04aeb70fa83bd266 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2012 10:12:46 +0900 Subject: perf: Call perf_cgroup_event_time() directly perf_event_time() will call perf_cgroup_event_time() if @event is a cgroup event. Just call it directly and avoid the extra check. Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1327021966-27688-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a0..32b48c88971 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event) * here.
*/ if (is_cgroup_event(event)) - run_end = perf_event_time(event); + run_end = perf_cgroup_event_time(event); else if (ctx->is_active) run_end = ctx->time; else -- cgit v1.2.3-70-g09d2 From 0e90b31f4ba77027a7c21cbfc66404df0851ca21 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 20 Jan 2012 04:57:16 +0000 Subject: net: introduce res_counter_charge_nofail() for socket allocations There is a case in __sk_mem_schedule() where an allocation is beyond the maximum, and yet we are allowed to proceed. It happens under the following condition: sk->sk_wmem_queued + size >= sk->sk_sndbuf The network code won't revert the allocation in this case, meaning that at some point later it'll try to do it. Since this is never communicated to the underlying res_counter code, there is an imbalance in the res_counter uncharge operation. I see two ways of fixing this: 1) storing the information about those allocations somewhere in memcg, and then deducting from that first, before we start draining the res_counter, 2) providing a slightly different allocation function for the res_counter, that matches the original behavior of the network code more closely. I decided to go for #2 here, believing it to be more elegant, since #1 would require us to do basically that, but in a more obscure way. Signed-off-by: Glauber Costa Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michal Hocko CC: Tejun Heo CC: Li Zefan CC: Laurent Chavey Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/res_counter.h | 6 ++++++ include/net/sock.h | 10 ++++------ kernel/res_counter.c | 25 +++++++++++++++++++++++++ net/core/sock.c | 4 ++-- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index d06d014afda..da81af086ea 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -109,12 +109,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); * * returns 0 on success and <0 if the counter->usage will exceed the * counter->limit _locked call expects the counter->lock to be taken + * + * charge_nofail works the same, except that it charges the resource + * counter unconditionally, and returns < 0 if the after the current + * charge we are over limit.
*/ int __must_check res_counter_charge_locked(struct res_counter *counter, unsigned long val); int __must_check res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at); +int __must_check res_counter_charge_nofail(struct res_counter *counter, + unsigned long val, struct res_counter **limit_fail_at); /* * uncharge - tell that some portion of the resource is released diff --git a/include/net/sock.h b/include/net/sock.h index 0e7a9b05f92..4c69ac165e6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1008,9 +1008,8 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot, struct res_counter *fail; int ret; - ret = res_counter_charge(prot->memory_allocated, - amt << PAGE_SHIFT, &fail); - + ret = res_counter_charge_nofail(prot->memory_allocated, + amt << PAGE_SHIFT, &fail); if (ret < 0) *parent_status = OVER_LIMIT; } @@ -1054,12 +1053,11 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) } static inline void -sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status) +sk_memory_allocated_sub(struct sock *sk, int amt) { struct proto *prot = sk->sk_prot; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - parent_status != OVER_LIMIT) /* Otherwise was uncharged already */ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) memcg_memory_allocated_sub(sk->sk_cgrp, amt); atomic_long_sub(amt, prot->memory_allocated); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 6d269cce7aa..d508363858b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -66,6 +66,31 @@ done: return ret; } +int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + int ret, r; + unsigned long flags; + struct res_counter *c; + + r = ret = 0; + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + r = res_counter_charge_locked(c, val); + if (r) + c->usage += val; + spin_unlock(&c->lock); + if (r < 0 && ret == 0) { + *limit_fail_at = c; + ret = r; + } + } + local_irq_restore(flags); + + return ret; +} void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f9..3e81fd2e3c7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1827,7 +1827,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt, parent_status); + sk_memory_allocated_sub(sk, amt); return 0; } @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); void __sk_mem_reclaim(struct sock *sk) { sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (sk_under_memory_pressure(sk) && -- cgit v1.2.3-70-g09d2 From d496aab567e7e52b3e974c9192a5de6e77dce32c Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 20 Jan 2012 14:34:04 -0800 Subject: kprobes: initialize before using a hlist Commit ef53d9c5e ("kprobes: improve kretprobe scalability with hashed locking") introduced a bug where we can potentially leak kretprobe_instances since we initialize a hlist head after having used it. Initialize the hlist head before using it. 
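The ordering bug is easy to reproduce outside the kernel. Below is a hedged sketch using a hand-rolled list head (not the kernel's hlist API) showing why moving the initialization above the first use matters: until the head is initialized, it is stack garbage, and anything linked onto it is lost or corrupted.

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int v; };
struct list_head_sketch { struct node *first; };

static void init_head(struct list_head_sketch *h) { h->first = NULL; }

static void recycle(struct list_head_sketch *h, struct node *n)
{
    n->next = h->first;          /* reads h->first: it must be valid here */
    h->first = n;
}

int main(void)
{
    struct list_head_sketch empty_rp;   /* uninitialized, like the old code path */
    struct node a = { .next = NULL, .v = 1 };

    init_head(&empty_rp);        /* the fix: initialize BEFORE the first recycle() */
    recycle(&empty_rp, &a);

    for (struct node *n = empty_rp.first; n; n = n->next)
        printf("recycled instance %d\n", n->v);
    return 0;
}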
Reported by: Jim Keniston Acked-by: Jim Keniston Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Cc: Srinivasa D S Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 95dd7212e61..29f5b65bee2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); @@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); -- cgit v1.2.3-70-g09d2 From 42ae610c1a820ddecb80943d4ccfc936f7772535 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:02:24 -0800 Subject: kernel-doc: fix new warnings in auditsc.c Fix new kernel-doc warnings in auditsc.c: Warning(kernel/auditsc.c:1875): No description found for parameter 'success' Warning(kernel/auditsc.c:1875): No description found for parameter 'return_code' Warning(kernel/auditsc.c:1875): Excess function parameter 'pt_regs' description in '__audit_syscall_exit' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index caaea6e944f..af1de0f34ea 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1863,11 +1863,12 @@ void __audit_syscall_entry(int arch, int major, /** * audit_syscall_exit - deallocate audit context after a system call - * @pt_regs: syscall registers + * @success: success value of the syscall + * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel write an audit + * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). 
*/ -- cgit v1.2.3-70-g09d2 From fa757281a08799fd6c0f7ec6f111d1cd66afc97b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:03:13 -0800 Subject: kernel-doc: fix kernel-doc warnings in sched Fix new kernel-doc notation warnings: Warning(include/linux/sched.h:2094): No description found for parameter 'p' Warning(include/linux/sched.h:2094): Excess function parameter 'tsk' description in 'is_idle_task' Warning(kernel/sched/cpupri.c:139): No description found for parameter 'newpri' Warning(kernel/sched/cpupri.c:139): Excess function parameter 'pri' description in 'cpupri_set' Warning(kernel/sched/cpupri.c:208): Excess function parameter 'bootmem' description in 'cpupri_init' Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/sched/cpupri.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4032ec1cf83..513f5245987 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2088,7 +2088,7 @@ extern int sched_setscheduler_nocheck(struct task_struct *, int, extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? - * @tsk: the task in question. + * @p: the task in question. */ static inline bool is_idle_task(struct task_struct *p) { diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b0d798eaf13..d72586fdf66 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the cpu priority setting * @cp: The cpupri context * @cpu: The target cpu - * @pri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * * @@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context - * @bootmem: true if allocations need to use bootmem * * Returns: -ENOMEM if memory fails. */ -- cgit v1.2.3-70-g09d2 From 4ca9b72b71f10147bd21969c1805f5b2c4ca7b7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Jan 2012 11:50:51 +0100 Subject: sched: Fix rq->nr_uninterruptible update race KOSAKI Motohiro noticed the following race: > CPU0 CPU1 > -------------------------------------------------------- > deactivate_task() > task->state = TASK_UNINTERRUPTIBLE; > activate_task() > rq->nr_uninterruptible--; > > schedule() > deactivate_task() > rq->nr_uninterruptible++; > Kosaki-San's scenario is possible when CPU0 runs __sched_setscheduler() against CPU1's current @task. __sched_setscheduler() does a dequeue/enqueue in order to move the task to its new queue (position) to reflect the newly provided scheduling parameters. However, it should be completely invariant to nr_uninterruptible accounting: sched_setscheduler() doesn't affect readiness to run, merely the policy on when to run. So convert the inappropriate activate/deactivate_task usage to enqueue/dequeue_task, which avoids the nr_uninterruptible accounting. Also convert the two other sites, __migrate_task() and normalize_task(), that still use activate/deactivate_task. These sites aren't really a problem since __migrate_task() will only be called on a non-running task (and is therefore immune to the described problem) and normalize_task() isn't ever used on regular systems.
Also remove the comments from activate/deactivate_task since they're misleading at best. Reported-by: KOSAKI Motohiro Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327486224.2614.45.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263..e067df1fd01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -723,9 +723,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) p->sched_class->dequeue_task(rq, p, flags); } -/* - * activate_task - move a task to the runqueue. - */ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -734,9 +731,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); } -/* - * deactivate_task - remove a task from the runqueue. - */ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -4134,7 +4128,7 @@ recheck: on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4147,7 +4141,7 @@ recheck: if (running) p->sched_class->set_curr_task(rq); if (on_rq) - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4998,9 +4992,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (p->on_rq) { - deactivate_task(rq_src, p, 0); + dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - activate_task(rq_dest, p, 0); + enqueue_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } done: @@ -7032,10 +7026,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p) on_rq = p->on_rq; if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); resched_task(rq->curr); } -- cgit v1.2.3-70-g09d2 From db7e527da41560f597ccdc4417cefa6b7657c0c0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 08:58:16 +0100 Subject: sched/s390: Fix compile error in sched/core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 029632fbb7b7c9d85063cc9eb470de6c54873df3 ("sched: Make separate sched*.c translation units") removed the include of asm/mutex.h from sched.c. 
This breaks the combination of: CONFIG_MUTEX_SPIN_ON_OWNER=yes CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX=yes like s390 without mutex debugging: CC kernel/sched/core.o kernel/sched/core.c: In function ‘mutex_spin_on_owner’: kernel/sched/core.c:3287: error: implicit declaration of function ‘arch_mutex_cpu_relax’ Let's re-add the include to kernel/sched/core.c. Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1326268696-30904-1-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e067df1fd01..5255c9d2e05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif -- cgit v1.2.3-70-g09d2 From 71325960d16cd68ea0e22a8da15b2495b0f363f7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jan 2012 18:28:57 -0800 Subject: sched/nohz: Fix nohz cpu idle load balancing state with cpu hotplug With the recent nohz scheduler changes, rq's nohz flag 'NOHZ_TICK_STOPPED' and its associated state doesn't get cleared immediately after the cpu exits idle. This gets cleared as part of the next tick seen on that cpu. For the cpu offline support, we need to clear this state manually. Fix it by registering a cpu notifier, which clears the nohz idle load balance state for this rq explicitly during the CPU_DYING notification. There won't be any nohz updates for that cpu after the CPU_DYING notification. But let's be extra paranoid and skip updating the nohz state in select_nohz_load_balancer() if the cpu is not in the active state anymore. Reported-by: Srivatsa S. Bhat Reviewed-and-tested-by: Srivatsa S. Bhat Tested-by: Sergey Senozhatsky Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327026538.16150.40.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84adb2d66cb..7c6414fc669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4866,6 +4866,15 @@ static void nohz_balancer_kick(int cpu) return; } +static inline void clear_nohz_tick_stopped(int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; @@ -4904,6 +4913,12 @@ void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); + /* + * If this cpu is going down, then nothing needs to be done. + */ + if (!cpu_active(cpu)) + return; + if (stop_tick) { if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -4914,6 +4929,18 @@ void select_nohz_load_balancer(int stop_tick) } return; } + +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DYING: + clear_nohz_tick_stopped(smp_processor_id()); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} #endif static DEFINE_SPINLOCK(balancing); @@ -5070,11 +5097,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats.
*/ set_cpu_sd_state_busy(); - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + clear_nohz_tick_stopped(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -5590,6 +5613,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ -- cgit v1.2.3-70-g09d2 From b0f4c4b32c8e3aa0d44fc4dd6c40a9a9a8d66b63 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 26 Jan 2012 08:55:34 -0500 Subject: bugs, x86: Fix printk levels for panic, softlockups and stack dumps rsyslog will display KERN_EMERG messages on a connected terminal. However, these messages are useless/undecipherable for a general user. For example, after a softlockup we get: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Stack: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Call Trace: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Code: ff ff a8 08 75 25 31 d2 48 8d 86 38 e0 ff ff 48 89 d1 0f 01 c8 0f ae f0 48 8b 86 38 e0 ff ff a8 08 75 08 b1 01 4c 89 e0 0f 01 c9 ea 69 dd ff 4c 29 e8 48 89 c7 e8 0f bc da ff 49 89 c4 49 89 This happens because the printk levels for these messages are incorrect. Only an informational message should be displayed on a terminal. I modified the printk levels for various messages in the kernel and tested the output by using the drivers/misc/lkdtm.c kernel modules (ie, softlockups, panics, hard lockups, etc.) and confirmed that the console output was still the same and that the output to the terminals was correct. For example, in the case of a softlockup we now see the much more informative: Message from syslogd@intel-s3e37-04 at Jan 25 10:18:06 ... BUG: soft lockup - CPU4 stuck for 60s! instead of the above confusing messages. AFAICT, the messages no longer have to be KERN_EMERG. In the most important case of a panic we set console_verbose(). As for the other less severe cases the correct data is output to the console and /var/log/messages. Successfully tested by me using the drivers/misc/lkdtm.c module. 
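A user-space analogue of the level policy, using syslog(3): most syslog daemons broadcast LOG_EMERG to logged-in terminals, so only the one actionable headline should go out at that level, while the supporting detail is logged at a lower priority. This is a sketch of the convention, not the kernel patch itself; it uses the LOG_USER facility (LOG_KERN is reserved for the kernel), and note that running it as root may actually write to terminals on many systems.

#include <syslog.h>

int main(void)
{
    openlog("lockup-demo", LOG_PID, LOG_USER);

    /* headline: terse, meaningful to an operator at a terminal */
    syslog(LOG_EMERG, "BUG: soft lockup - CPU#4 stuck for 60s! [demo:1234]");

    /* detail: stack/code dumps belong at a non-broadcast priority */
    syslog(LOG_INFO, "Stack: ... Call Trace: ... Code: ...");

    closelog();
    return 0;
}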
Signed-off-by: Prarit Bhargava Cc: dzickus@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1327586134-11926-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 3 ++- arch/x86/kernel/dumpstack_64.c | 6 +++--- arch/x86/mm/fault.c | 4 ++-- kernel/watchdog.c | 2 +- lib/bug.c | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775f..4025fe4f928 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) unsigned short ss; unsigned long sp; #endif - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284b..42b2bca0b72 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + printk(KERN_DEFAULT "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_EMERG); + 0, KERN_DEFAULT); - printk(KERN_EMERG "Code: "); + printk(KERN_DEFAULT "Code: "); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d74824a708..f0b4caf85c1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -673,7 +673,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, stackend = end_of_stack(tsk); if (tsk != &init_task && *stackend != STACK_END_MAGIC) - printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -684,7 +684,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, sig = 0; /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); + printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f5..d117262deba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); diff --git a/lib/bug.c b/lib/bug.c index 19552096d16..a28c1415357 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -169,7 +169,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_EMERG "------------[ cut here ]------------\n"); + printk(KERN_DEFAULT "------------[ cut here ]------------\n"); if (file) printk(KERN_CRIT "kernel BUG at %s:%u!\n", -- cgit v1.2.3-70-g09d2 From b5740f4b2cb3503b436925eb2242bc3d75cd3dfe Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 17 Jan 2012 17:40:31 +0900 Subject: sched: Fix ancient race in do_exit() try_to_wake_up() has a problem which may change status from TASK_DEAD to TASK_RUNNING in race condition with SMI or guest environment of virtual machine. As a result, exited task is scheduled() again and panic occurs. Here is the sequence how it occurs: ----------------------------------+----------------------------- | CPU A | CPU B ----------------------------------+----------------------------- TASK A calls exit().... do_exit() exit_mm() down_read(mm->mmap_sem); rwsem_down_failed_common() set TASK_UNINTERRUPTIBLE set waiter.task <= task A list_add to sem->wait_list : raw_spin_unlock_irq() (I/O interruption occured) __rwsem_do_wake(mmap_sem) list_del(&waiter->list); waiter->task = NULL wake_up_process(task A) try_to_wake_up() (task is still TASK_UNINTERRUPTIBLE) p->on_rq is still 1.) ttwu_do_wakeup() (*A) : (I/O interruption handler finished) if (!waiter.task) schedule() is not called due to waiter.task is NULL. tsk->state = TASK_RUNNING : check_preempt_curr(); : task->state = TASK_DEAD (*B) <--- set TASK_RUNNING (*C) schedule() (exit task is running again) BUG_ON() is called! -------------------------------------------------------- The execution time between (*A) and (*B) is usually very short, because the interruption is disabled, and setting TASK_RUNNING at (*C) must be executed before setting TASK_DEAD. HOWEVER, if SMI is interrupted between (*A) and (*B), (*C) is able to execute AFTER setting TASK_DEAD! Then, exited task is scheduled again, and BUG_ON() is called.... If the system works on guest system of virtual machine, the time between (*A) and (*B) may be also long due to scheduling of hypervisor, and same phenomenon can occur. By this patch, do_exit() waits for releasing task->pi_lock which is used in try_to_wake_up(). It guarantees the task becomes TASK_DEAD after waking up. Signed-off-by: Yasunori Goto Acked-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20120117174031.3118.E1E9C6FF@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170..4b4042f9bc6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1038,6 +1038,22 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. 
+ * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ -- cgit v1.2.3-70-g09d2 From e050e3f0a71bf7dc2c148b35caff0234decc8198 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 26 Jan 2012 17:03:19 +0100 Subject: perf: Fix broken interrupt rate throttling This patch fixes the sampling interrupt throttling mechanism. It was broken in v3.2. Events were not being unthrottled. The unthrottling mechanism required that events be checked at each timer tick. This patch solves this problem and also separates: - unthrottling - multiplexing - frequency-mode period adjustments Not all of them need to be executed at each timer tick. This third version of the patch is based on my original patch + PeterZ proposal (https://lkml.org/lkml/2012/1/7/87). At each timer tick, for each context: - if the current CPU has throttled events, we unthrottle events - if context has frequency-based events, we adjust sampling periods - if we have reached the jiffies interval, we multiplex (rotate) We decoupled rotation (multiplexing) from frequency-mode sampling period adjustments. They should not necessarily happen at the same rate. Multiplexing is subject to jiffies_interval (currently at 1 but could be higher once the tunable is exposed via sysfs). We have grouped frequency-mode adjustment and unthrottling into the same routine to minimize code duplication. When throttled while in frequency mode, we scan the events only once. We have fixed the threshold enforcement code in __perf_event_overflow(). There was a bug whereby it would allow more than the authorized rate because an increment of hwc->interrupts was not executed at the right place. The patch was tested with low sampling limit (2000) and fixed periods, frequency mode, overcommitted PMU. On a 2.1GHz AMD CPU: $ cat /proc/sys/kernel/perf_event_max_sample_rate 2000 We set a rate of 3000 samples/sec (2.1GHz/3000 = 700000): $ perf record -e cycles,cycles -c 700000 noploop 10 $ perf report -D | tail -21 Aggregated stats: TOTAL events: 80086 MMAP events: 88 COMM events: 2 EXIT events: 4 THROTTLE events: 19996 UNTHROTTLE events: 19996 SAMPLE events: 40000 cycles stats: TOTAL events: 40006 MMAP events: 5 COMM events: 1 EXIT events: 4 THROTTLE events: 9998 UNTHROTTLE events: 9998 SAMPLE events: 20000 cycles stats: TOTAL events: 39996 THROTTLE events: 9998 UNTHROTTLE events: 9998 SAMPLE events: 20000 For 10s, the cap is 2x2000x10 = 40000 samples. We get exactly that: 20000 samples/event. 
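The per-cpu sequence trick at the heart of the fix can be modeled in a few lines of plain C. This is a toy single-cpu model under stated assumptions (names and the tick cadence are simplified), not the kernel code: a global sequence number stands in for perf_throttled_seq, and an event's interrupt counter restarts whenever the event first overflows in a new tick period, so the max_samples_per_tick cap is enforced per tick and throttled events are restarted at the next tick.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SAMPLES_PER_TICK 3

static uint64_t throttled_seq = 1;  /* bumped once per timer tick */
static int      throttled_count;    /* events throttled since last tick */

struct event {
    uint64_t interrupts_seq;         /* last tick period this event fired in */
    uint64_t interrupts;
    bool     throttled;
};

static bool event_overflow(struct event *e)
{
    if (e->interrupts_seq != throttled_seq) {
        e->interrupts_seq = throttled_seq;   /* first overflow this tick */
        e->interrupts = 1;
    } else if (++e->interrupts >= MAX_SAMPLES_PER_TICK) {
        throttled_count++;
        e->throttled = true;                 /* pmu->stop() in the kernel */
        return true;
    }
    return false;
}

static void timer_tick(struct event *e)
{
    throttled_seq++;
    if (throttled_count) {                   /* xchg(&perf_throttled_count, 0) */
        throttled_count = 0;
        e->throttled = false;                /* pmu->start() in the kernel */
    }
}

int main(void)
{
    struct event ev = { 0 };

    for (int i = 0; i < 5; i++) {
        if (ev.throttled)
            printf("overflow %d: dropped, event throttled\n", i);
        else
            printf("overflow %d: throttle now = %d\n", i, event_overflow(&ev));
    }
    timer_tick(&ev);
    printf("after tick: throttled = %d\n", ev.throttled);
    return 0;
}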
Signed-off-by: Stephane Eranian Cc: # v3.2+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120126160319.GA5655@quad Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 104 ++++++++++++++++++++++++++++----------------- 2 files changed, 67 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb..abb2776be1b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -587,6 +587,7 @@ struct hw_perf_event { u64 sample_period; u64 last_period; local64_t period_left; + u64 interrupts_seq; u64 interrupts; u64 freq_time_stamp; diff --git a/kernel/events/core.c b/kernel/events/core.c index 32b48c88971..ba36013cfb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2300,6 +2300,9 @@ do { \ return div64_u64(dividend, divisor); } +static DEFINE_PER_CPU(int, perf_throttled_count); +static DEFINE_PER_CPU(u64, perf_throttled_seq); + static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -2325,16 +2328,29 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. + */ +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, + int needs_unthr) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, now; + u64 now, period = TICK_NSEC; s64 delta; - if (!ctx->nr_freq) + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || needs_unthr)) return; + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2344,13 +2360,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) hwc = &event->hw; - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { + if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } @@ -2358,14 +2369,26 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (!event->attr.freq || !event->attr.sample_freq) continue; - event->pmu->read(event); + /* + * stop the event and update event->count + */ + event->pmu->stop(event, PERF_EF_UPDATE); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; + /* + * restart the event + * reload only if value has changed + */ if (delta > 0) perf_adjust_period(event, period, delta); + + event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0); } + + raw_spin_unlock(&ctx->lock); } /* @@ -2388,16 +2411,13 @@ static void rotate_ctx(struct perf_event_context *ctx) */ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1, freq = 0; + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; - if (cpuctx->ctx.nr_freq) - freq = 1; } ctx = cpuctx->task_ctx; @@ -2405,37 +2425,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; - if (ctx->nr_freq) - freq = 1; } - if (!rotate && !freq) + if (!rotate) goto done; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (freq) { - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - } - - if (rotate) { - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); - } + perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - done: if (remove) list_del_init(&cpuctx->rotation_list); @@ -2445,10 +2454,22 @@ void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); struct perf_cpu_context *cpuctx, *tmp; + struct perf_event_context *ctx; + int throttled; WARN_ON(!irqs_disabled()); + __this_cpu_inc(perf_throttled_seq); + throttled = __this_cpu_xchg(perf_throttled_count, 0); + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + ctx = &cpuctx->ctx; + perf_adjust_freq_unthr_context(ctx, throttled); + + ctx = cpuctx->task_ctx; + if (ctx) + perf_adjust_freq_unthr_context(ctx, throttled); + if (cpuctx->jiffies_interval == 1 || !(jiffies % cpuctx->jiffies_interval)) perf_rotate_context(cpuctx); @@ -4509,6 +4530,7 @@ static int __perf_event_overflow(struct perf_event *event, { int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; + u64 seq; int ret = 0; /* @@ -4518,14 +4540,20 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(!is_sampling_event(event))) return 0; - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { + seq = __this_cpu_read(perf_throttled_seq); + if (seq != hwc->interrupts_seq) { + hwc->interrupts_seq = seq; + hwc->interrupts = 1; + } else { + hwc->interrupts++; + if (unlikely(throttle + && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } - } else - hwc->interrupts++; + } if (event->attr.freq) { u64 now = perf_clock(); -- cgit v1.2.3-70-g09d2 From cb297a3e433dbdcf7ad81e0564e7b804c941ff0d Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 5 Jan 2012 20:00:19 +0900 Subject: sched/rt: Fix task stack corruption under __ARCH_WANT_INTERRUPTS_ON_CTXSW This issue happens under the following conditions: 1. preemption is off 2. __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined 3. RT scheduling class 4. SMP system Sequence is as follows: 1.suppose current task is A. 
start schedule() 2. task A is enqueued as a pushable task at the entry of schedule() __schedule prev = rq->curr; ... put_prev_task put_prev_task_rt enqueue_pushable_task 3. pick the task B as the next task. next = pick_next_task(rq); 4. rq->curr is set to task B and context_switch is started. rq->curr = next; 5. At the entry of context_switch, release this cpu's rq->lock. context_switch prepare_task_switch prepare_lock_switch raw_spin_unlock_irq(&rq->lock); 6. Shortly after rq->lock is released, an interrupt occurs and IRQ context starts 7. try_to_wake_up(), which is called by the ISR, acquires rq->lock try_to_wake_up ttwu_remote rq = __task_rq_lock(p) ttwu_do_wakeup(rq, p, wake_flags); task_woken_rt 8. push_rt_task picks the task A which was enqueued before. task_woken_rt push_rt_tasks(rq) next_task = pick_next_pushable_task(rq) 9. At find_lock_lowest_rq(), if double_lock_balance() returns 0, lowest_rq can be the remote rq. (But if preemption is on, double_lock_balance() always returns 1 and this doesn't happen.) push_rt_task find_lock_lowest_rq if (double_lock_balance(rq, lowest_rq)).. 10. find_lock_lowest_rq() returns the available rq. task A is migrated to the remote cpu/rq. push_rt_task ... deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); 11. But task A is still in IRQ context on this cpu. So task A is scheduled by two cpus at the same time until it returns from IRQ. Task A's stack is corrupted. To fix it, don't migrate an RT task if it's still running. Signed-off-by: Chanho Min Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/CAOAMb1BHA=5fm7KTewYyke6u-8DP0iUuJMpgQw54vNeXFsGpoQ@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3640ebbb466..f42ae7fb5ec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,6 +1587,11 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + if (unlikely(task_running(rq, next_task))) + return 0; +#endif + retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); -- cgit v1.2.3-70-g09d2 From 181e9bdef37bfcaa41f3ab6c948a2a0d60a268b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jan 2012 20:35:52 +0100 Subject: PM / Hibernate: Fix s2disk regression related to freezing workqueues Commit 2aede851ddf08666f68ffc17be446420e9d2a056 (PM / Hibernate: Freeze kernel threads after preallocating memory) introduced a mechanism by which kernel threads were frozen after the preallocation of hibernate image memory to avoid problems with frozen kernel threads not responding to memory freeing requests. However, it overlooked the s2disk code path in which the SNAPSHOT_CREATE_IMAGE ioctl was run directly after SNAPSHOT_FREE, which caused freeze_workqueues_begin() to BUG(), because it saw that workqueues had already been frozen. Although in principle this issue might be addressed by removing the relevant BUG_ON() from freeze_workqueues_begin(), that would reintroduce the very problem that commit 2aede851ddf08666f68ffc17be4 attempted to avoid into that particular code path. For this reason, to fix the issue at hand, introduce thaw_kernel_threads() and make the SNAPSHOT_FREE ioctl execute it. Special thanks to Srivatsa S. Bhat for detailed analysis of the problem. Reported-and-tested-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S.
Bhat Cc: stable@kernel.org --- include/linux/freezer.h | 2 ++ kernel/power/process.c | 19 +++++++++++++++++++ kernel/power/user.c | 9 +++++++++ 3 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0ab54e16a91..d09af4b67cf 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -39,6 +39,7 @@ extern bool __refrigerator(bool check_kthr_stop); extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); +extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { @@ -174,6 +175,7 @@ static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} +static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 77274c9ba2f..eeca00311f3 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -188,3 +188,22 @@ void thaw_processes(void) printk("done.\n"); } +void thaw_kernel_threads(void) +{ + struct task_struct *g, *p; + + pm_nosig_freezing = false; + printk("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + __thaw_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + schedule(); + printk("done.\n"); +} diff --git a/kernel/power/user.c b/kernel/power/user.c index 6b1ab7a8852..e5a21a85730 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -274,6 +274,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = 0; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). + */ + thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: -- cgit v1.2.3-70-g09d2 From fe9161db2e6053da21e4649d77bbefaf3030b11d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 1 Feb 2012 22:16:36 +0100 Subject: PM / Hibernate: Thaw kernel threads in SNAPSHOT_CREATE_IMAGE ioctl path In the SNAPSHOT_CREATE_IMAGE ioctl, if the call to hibernation_snapshot() fails, the frozen tasks are not thawed. And in the case of success, if we happen to exit due to a successful freezer test, all tasks (including those of userspace) are thawed, whereas actually we should have thawed only the kernel threads at that point. Fix both these issues. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki Cc: stable@vger.kernel.org --- kernel/power/user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index e5a21a85730..3e100075b13 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; -- cgit v1.2.3-70-g09d2 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 -------------------- include/linux/sched.h | 6 ++++++ kernel/fork.c | 20 ++++++++++++++++++++ mm/process_vm_access.c | 23 +++++++++-------------- 4 files changed, 35 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d9512bd03e6..d4548dd49b0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { return mm_access(task, PTRACE_MODE_READ); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2234985a5e6..7d379a6bfd8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2259,6 +2259,12 @@ static inline void mmdrop(struct mm_struct * mm) extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); +/* + * Grab a reference to a task's mm, if it is not already going away + * and ptrace_may_access with the mode parameter passed to it + * succeeds. 
+ */ +extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c..1b2ef3c23ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -647,6 +647,26 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, mode)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce10..c20ff48994c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto free_proc_pages; } - task_lock(task); - if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { - task_unlock(task); - rc = -EPERM; - goto put_task_struct; - } - mm = task->mm; - - if (!mm || (task->flags & PF_KTHREAD)) { - task_unlock(task); - rc = -EINVAL; + mm = mm_access(task, PTRACE_MODE_ATTACH); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more a + * appropriate error code for process_vw_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; goto put_task_struct; } - atomic_inc(&mm->mm_users); - task_unlock(task); - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, -- cgit v1.2.3-70-g09d2 From 55ca6140e9bb307efc97a9301a4f501de02a6fd6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 3 Feb 2012 15:37:16 -0800 Subject: kprobes: fix a memory leak in function pre_handler_kretprobe() In function pre_handler_kretprobe(), the allocated kretprobe_instance object will get leaked if the entry_handler callback returns non-zero. This may cause all the preallocated kretprobe_instance objects exhausted. This issue can be reproduced by changing samples/kprobes/kretprobe_example.c to probe "mutex_unlock". And the fix is straightforward: just put the allocated kretprobe_instance object back onto the free_instances list. [akpm@linux-foundation.org: use raw_spin_lock/unlock] Signed-off-by: Jiang Liu Acked-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: "David S. 
Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29f5b65bee2..9788c0ec6f4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3-70-g09d2 From 379e0be812ab8a2a351e784b0c987788f5123090 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 3 Feb 2012 22:22:41 +0100 Subject: PM / Freezer: Thaw only kernel threads if freezing of kernel threads fails If freezing of kernel threads fails, we are expected to automatically thaw tasks in the error recovery path. However, at times, we encounter situations in which we would like the automatic error recovery path to thaw only the kernel threads, because we want to be able to do some more cleanup before we thaw userspace. Something like: error = freeze_kernel_threads(); if (error) { /* Do some cleanup */ /* Only then thaw userspace tasks*/ thaw_processes(); } An example of such a situation is where we freeze/thaw filesystems during suspend/hibernation. There, if freezing of kernel threads fails, we would like to thaw the frozen filesystems before thawing the userspace tasks. So, modify freeze_kernel_threads() to thaw only kernel threads in case of freezing failure. And change suspend_freeze_processes() accordingly. (At the same time, let us also get rid of the rather cryptic usage of the conditional operator (:?) in that function.) [rjw: In fact, this patch fixes a regression introduced during the 3.3 merge window, because without it thaw_processes() may be called before swsusp_free() in some situations and that may lead to massive memory allocation failures.] Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Acked-by: Nigel Cunningham Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 24 ++++++++++++++++++++++-- kernel/power/process.c | 7 +++++-- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b..21724eee520 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. 
+ */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index eeca00311f3..7e426459e60 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } -- cgit v1.2.3-70-g09d2
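
[ed: illustrative sketch, not part of the commit. It fleshes out the
error path the changelog describes for a hypothetical caller; the
example_* names stand in for the filesystem freeze/thaw code the
changelog alludes to and are assumptions, as is the exact header that
declares the freezer entry points.]

#include <linux/freezer.h>

static void example_thaw_filesystems(void)
{
	/* placeholder: undo whatever the caller froze beforehand */
}

static int example_freeze_for_hibernation(void)
{
	int error;

	error = freeze_processes();
	if (error)
		return error;	/* everything was thawed automatically */

	error = freeze_kernel_threads();
	if (error) {
		/*
		 * After this patch, only kernel threads have been thawed
		 * here; userspace is still frozen, so caller-side cleanup
		 * can run before user tasks see the failure.
		 */
		example_thaw_filesystems();

		/* thaw userspace last, once cleanup is done */
		thaw_processes();
	}
	return error;
}

[ed: compare suspend_freeze_processes() in the power.h hunk above, which
follows exactly this pattern.]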