From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:36 +0900 Subject: x86: Panic on detection of stack overflow Currently, only a message is printed when a stack overflow is detected, which is not sufficient for systems that require high reliability. In general the overflow may corrupt data, and additional corruption may occur from reading the corrupted data unless the system stops. This patch adds the sysctl parameter kernel.panic_on_stackoverflow and, according to the parameter, causes a panic on detection of an overflow of the kernel, IRQ, or exception stacks (user stack overflows are excluded). It is disabled by default. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae271964385..f487f257e05 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "bootloader_type", .data = &bootloader_type, -- cgit v1.2.3-70-g09d2 From 6e736be7f282fff705db7c34a15313281b372a76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:38 +0100 Subject: block: make ioc get/put interface more conventional and fix race on allocation Ignoring copy_io() during fork, io_context can be allocated from two places - current_io_context() and set_task_ioprio(). The former is always called from the local task while the latter can be called from a different task. The synchronization between them is peculiar and dubious. * current_io_context() doesn't grab task_lock() and assumes that if it saw %NULL ->io_context, it would stay that way until allocation and assignment are complete. It has smp_wmb() between alloc/init and assignment. * set_task_ioprio() grabs task_lock() for assignment and does smp_read_barrier_depends() between "ioc = task->io_context" and "if (ioc)". Unfortunately, this doesn't achieve anything - the latter is not a dependent load of the former. i.e., if ioc itself were being dereferenced ("ioc->xxx") it would mean something (though it is unclear what), but as the code currently stands, the dependent read barrier is a noop. As only one of the two test-assignment sequences is task_lock() protected, task_lock() can't do much about the race between the two. Nothing prevents current_io_context() and set_task_ioprio() from each allocating its own ioc for the same task and overwriting the other's. Also, set_task_ioprio() can race with an exiting task and create a new ioc after exit_io_context() is finished. ioc get/put doesn't have any reason to be complex. The only hot path is accessing the existing ioc of %current, which is simple to achieve given that ->io_context is never destroyed as long as the task is alive. All other paths can happily go through task_lock() like all other task sub-structures without impacting anything. This patch updates ioc get/put so that it becomes more conventional. * alloc_io_context() is replaced with get_task_io_context(). This is the only interface which can acquire access to the ioc of another task.
On return, the caller has an explicit reference to the object which should be put using put_io_context() afterwards. * The functionality of current_io_context() remains the same but when creating a new ioc, it shares the code path with get_task_io_context() and always goes through task_lock(). * get_io_context() now means incrementing ref on an ioc which the caller already has access to (be that an explicit refcnt or implicit %current one). * PF_EXITING inhibits creation of new io_context and once exit_io_context() is finished, it's guaranteed that both ioc acquisition functions return %NULL. * All users are updated. Most are trivial but smp_read_barrier_depends() removal from cfq_get_io_context() needs a bit of explanation. I suppose the original intention was to ensure ioc->ioprio is visible when set_task_ioprio() allocates new io_context and installs it; however, this wouldn't have worked because set_task_ioprio() doesn't have wmb between init and install. There are other problems with this which will be fixed in another patch. * While at it, use NUMA_NO_NODE instead of -1 for wildcard node specification. -v2: Vivek spotted contamination from debug patch. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++-- block/blk-ioc.c | 99 +++++++++++++++++++++++++++++++---------------- block/blk.h | 1 + block/cfq-iosched.c | 18 ++++----- fs/ioprio.c | 21 ++-------- include/linux/iocontext.h | 4 +- kernel/fork.c | 8 ++-- 7 files changed, 91 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8f630cec906..4b001dcd85b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1645,11 +1645,12 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; - task_lock(tsk); - ioc = tsk->io_context; - if (ioc) + /* we don't lose anything even if ioc allocation fails */ + ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { ioc->cgroup_changed = 1; - task_unlock(tsk); + put_io_context(ioc); + } } void blkio_policy_register(struct blkio_policy_type *blkiop) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 8bebf06bac7..b13ed96776c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -16,6 +16,19 @@ */ static struct kmem_cache *iocontext_cachep; +/** + * get_io_context - increment reference count to io_context + * @ioc: io_context to get + * + * Increment reference count to @ioc. 
+ */ +void get_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + atomic_long_inc(&ioc->refcount); +} +EXPORT_SYMBOL(get_io_context); + static void cfq_dtor(struct io_context *ioc) { if (!hlist_empty(&ioc->cic_list)) { @@ -71,6 +84,9 @@ void exit_io_context(struct task_struct *task) { struct io_context *ioc; + /* PF_EXITING prevents new io_context from being attached to @task */ + WARN_ON_ONCE(!(current->flags & PF_EXITING)); + task_lock(task); ioc = task->io_context; task->io_context = NULL; @@ -82,7 +98,9 @@ void exit_io_context(struct task_struct *task) put_io_context(ioc); } -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +static struct io_context *create_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node, + bool take_ref) { struct io_context *ioc; @@ -98,6 +116,20 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->cic_list); + /* try to install, somebody might already have beaten us to it */ + task_lock(task); + + if (!task->io_context && !(task->flags & PF_EXITING)) { + task->io_context = ioc; + } else { + kmem_cache_free(iocontext_cachep, ioc); + ioc = task->io_context; + } + + if (ioc && take_ref) + get_io_context(ioc); + + task_unlock(task); return ioc; } @@ -114,46 +146,47 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) */ struct io_context *current_io_context(gfp_t gfp_flags, int node) { - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } + might_sleep_if(gfp_flags & __GFP_WAIT); - return ret; + if (current->io_context) + return current->io_context; + + return create_task_io_context(current, gfp_flags, node, false); } +EXPORT_SYMBOL(current_io_context); -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. +/** + * get_task_io_context - get io_context of a task + * @task: task of interest + * @gfp_flags: allocation flags, used if allocation is necessary + * @node: allocation node, used if allocation is necessary + * + * Return io_context of @task. If it doesn't exist, it is created with + * @gfp_flags and @node. The returned io_context has its reference count + * incremented. * - * This is always called in the context of the task which submitted the I/O. + * This function always goes through task_lock() and it's better to use + * current_io_context() + get_io_context() for %current. */ -struct io_context *get_io_context(gfp_t gfp_flags, int node) +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { - struct io_context *ioc = NULL; - - /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. 
- */ - do { - ioc = current_io_context(gfp_flags, node); - if (unlikely(!ioc)) - break; - } while (!atomic_long_inc_not_zero(&ioc->refcount)); + struct io_context *ioc; - return ioc; + might_sleep_if(gfp_flags & __GFP_WAIT); + + task_lock(task); + ioc = task->io_context; + if (likely(ioc)) { + get_io_context(ioc); + task_unlock(task); + return ioc; + } + task_unlock(task); + + return create_task_io_context(task, gfp_flags, node, true); } -EXPORT_SYMBOL(get_io_context); +EXPORT_SYMBOL(get_task_io_context); static int __init blk_ioc_init(void) { diff --git a/block/blk.h b/block/blk.h index aae4d88fc52..fc3c41b2fd2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,6 +122,7 @@ static inline int blk_should_fake_timeout(struct request_queue *q) } #endif +void get_io_context(struct io_context *ioc); struct io_context *current_io_context(gfp_t gfp_flags, int node); int ll_back_merge_fn(struct request_queue *q, struct request *req, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ec3f5e8ba56..d42d89ccce1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,6 +14,7 @@ #include #include #include +#include "blk.h" #include "cfq.h" /* @@ -3194,13 +3195,13 @@ static struct cfq_io_context * cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct io_context *ioc = NULL; - struct cfq_io_context *cic; + struct cfq_io_context *cic = NULL; might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_mask, cfqd->queue->node); + ioc = current_io_context(gfp_mask, cfqd->queue->node); if (!ioc) - return NULL; + goto err; cic = cfq_cic_lookup(cfqd, ioc); if (cic) @@ -3211,10 +3212,10 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err; if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) - goto err_free; - + goto err; out: - smp_read_barrier_depends(); + get_io_context(ioc); + if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); @@ -3223,10 +3224,9 @@ out: cfq_ioc_set_cgroup(ioc); #endif return cic; -err_free: - cfq_cic_free(cic); err: - put_io_context(ioc); + if (cic) + cfq_cic_free(cic); return NULL; } diff --git a/fs/ioprio.c b/fs/ioprio.c index f79dab83e17..998ec239d1e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -48,28 +48,13 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - task_lock(task); - do { - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); - if (ioc) - break; - - ioc = alloc_io_context(GFP_ATOMIC, -1); - if (!ioc) { - err = -ENOMEM; - break; - } - task->io_context = ioc; - } while (1); - - if (!err) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { ioc->ioprio = ioprio; ioc->ioprio_changed = 1; + put_io_context(ioc); } - task_unlock(task); return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 8a6ecb66346..28bb621ef5a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -78,8 +78,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -struct io_context *get_io_context(gfp_t gfp_flags, int node); -struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..5bcfc739bb7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -870,6 
+870,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; + struct io_context *new_ioc; if (!ioc) return 0; @@ -881,11 +882,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; } else if (ioprio_valid(ioc->ioprio)) { - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); - if (unlikely(!tsk->io_context)) + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) return -ENOMEM; - tsk->io_context->ioprio = ioc->ioprio; + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3-70-g09d2 From b2efa05265d62bc29f3a64400fad4b44340eedb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:39 +0100 Subject: block, cfq: unlink cfq_io_context's immediately A cic is the association between an io_context and a request_queue. It is linked from both ioc and q and should be destroyed when either one goes away. As ioc and q both have their own locks, locking becomes a bit complex - both orders work for removal from one side but not from the other. Currently, cfq tries to circumvent this locking order issue with RCU. ioc->lock nests inside queue_lock, but the radix tree and cic's are also protected by RCU, allowing either side to walk their lists without grabbing a lock. This rather unconventional use of RCU quickly devolves into extremely fragile convolution, e.g. the following is from cfqd going away too soon after ioc and q exits raced.

general protection fault: 0000 [#1] PREEMPT SMP
CPU 2
Modules linked in:
[   88.503444] Pid: 599, comm: hexdump Not tainted 3.1.0-rc10-work+ #158 Bochs Bochs
RIP: 0010:[]  [] cfq_exit_single_io_context+0x58/0xf0
...
Call Trace:
 [] call_for_each_cic+0x5a/0x90
 [] cfq_exit_io_context+0x15/0x20
 [] exit_io_context+0x100/0x140
 [] do_exit+0x579/0x850
 [] do_group_exit+0x5b/0xd0
 [] sys_exit_group+0x17/0x20
 [] system_call_fastpath+0x16/0x1b

The only real hot path here is cic lookup during request initialization, and avoiding extra locking there requires very confined use of RCU. This patch makes cic removal from both ioc and request_queue perform double-locking and unlink immediately. * From the q side, the change is almost trivial as ioc->lock nests inside queue_lock. It just needs to grab each ioc->lock as it walks cic_list and unlink it. * From the ioc side, it's a bit more difficult because of the inverted lock order. ioc needs its lock to walk its cic_list but can't grab the matching queue_lock, and so needs to perform unlock-relock dancing. Unlinking is now wholly done from put_io_context() and the fast path is optimized by using the queue_lock the caller already holds, which is by far the most common case. If the ioc accessed multiple devices, it tries with trylock. In the unlikely case of fast path failure, it falls back to the full double-locking dance from a workqueue. Double-locking isn't the prettiest thing in the world, but it's *far* simpler and more understandable than the RCU trick, without adding any meaningful overhead. This still leaves a lot of now-unnecessary RCU logic. Future patches will trim it. -v2: Vivek pointed out that cic->q was being dereferenced after cic->release() was called. Updated to use local variable @this_q instead.
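As an editorial aside, the trylock-or-defer pattern above is subtle, so here is a minimal, self-contained userspace sketch of the same idea using pthreads. Every name in it (outer_lock, inner_lock, put_ref_fast_path, release_slow_path, deferred) is a hypothetical stand-in, not kernel API; the authoritative implementation is put_io_context()/ioc_release_fn() in the diff below.

	#include <pthread.h>
	#include <stdio.h>

	/* Lock order is outer -> inner (cf. queue_lock -> ioc->lock). */
	static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER;
	static int deferred;	/* stands in for schedule_work(&ioc->release_work) */

	/* Slow path: free to take both locks in the proper order. */
	static void release_slow_path(void)
	{
		pthread_mutex_lock(&outer_lock);
		pthread_mutex_lock(&inner_lock);
		puts("released via slow path");
		pthread_mutex_unlock(&inner_lock);
		pthread_mutex_unlock(&outer_lock);
	}

	/* Fast path: already holds the inner lock, so the outer lock may
	 * only be trylock'd; on failure the release is punted to a context
	 * that can take the locks in the proper order. */
	static void put_ref_fast_path(void)
	{
		pthread_mutex_lock(&inner_lock);
		if (pthread_mutex_trylock(&outer_lock) == 0) {
			puts("released via fast path");
			pthread_mutex_unlock(&outer_lock);
		} else {
			deferred = 1;
		}
		pthread_mutex_unlock(&inner_lock);
	}

	int main(void)
	{
		put_ref_fast_path();
		if (deferred)
			release_slow_path();
		return 0;
	}

The asymmetry is the whole point: the common single-device case never leaves the fast path, and the deferred slow path only runs when trylock actually fails.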
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-ioc.c | 166 ++++++++++++++++++++++++++++++++++++++-------- block/cfq-iosched.c | 44 +++--------- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 + include/linux/iocontext.h | 12 ++-- kernel/fork.c | 2 +- 7 files changed, 159 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dc00835aab6..27886935804 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1649,7 +1649,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc); + put_io_context(ioc, NULL); } } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 6f59fbad93d..fb23965595d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,55 +29,164 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -static void cfq_dtor(struct io_context *ioc) +/* + * Releasing ioc may nest into another put_io_context() leading to nested + * fast path release. As the ioc's can't be the same, this is okay but + * makes lockdep whine. Keep track of nesting and use it as subclass. + */ +#ifdef CONFIG_LOCKDEP +#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) +#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ +#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- +#else +#define ioc_release_depth(q) 0 +#define ioc_release_depth_inc(q) do { } while (0) +#define ioc_release_depth_dec(q) do { } while (0) +#endif + +/* + * Slow path for ioc release in put_io_context(). Performs double-lock + * dancing to unlink all cic's and then frees ioc. + */ +static void ioc_release_fn(struct work_struct *work) { - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + struct io_context *ioc = container_of(work, struct io_context, + release_work); + struct request_queue *last_q = NULL; + + spin_lock_irq(&ioc->lock); + + while (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first, + struct cfq_io_context, + cic_list); + struct request_queue *this_q = cic->q; + + if (this_q != last_q) { + /* + * Need to switch to @this_q. Once we release + * @ioc->lock, it can go away along with @cic. + * Hold on to it. + */ + __blk_get_queue(this_q); + + /* + * blk_put_queue() might sleep thanks to kobject + * idiocy. Always release both locks, put and + * restart. + */ + if (last_q) { + spin_unlock(last_q->queue_lock); + spin_unlock_irq(&ioc->lock); + blk_put_queue(last_q); + } else { + spin_unlock_irq(&ioc->lock); + } + + last_q = this_q; + spin_lock_irq(this_q->queue_lock); + spin_lock(&ioc->lock); + continue; + } + ioc_release_depth_inc(this_q); + cic->exit(cic); + cic->release(cic); + ioc_release_depth_dec(this_q); + } - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->dtor(ioc); + if (last_q) { + spin_unlock(last_q->queue_lock); + spin_unlock_irq(&ioc->lock); + blk_put_queue(last_q); + } else { + spin_unlock_irq(&ioc->lock); } + + kmem_cache_free(iocontext_cachep, ioc); } /** * put_io_context - put a reference of io_context * @ioc: io_context to put + * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. + * zero. If the caller is holding queue_lock of a queue, it can indicate + * that with @locked_q. 
This is an optimization hint and the caller is + * allowed to pass in %NULL even when it's holding a queue_lock. */ -void put_io_context(struct io_context *ioc) +void put_io_context(struct io_context *ioc, struct request_queue *locked_q) { + struct request_queue *last_q = locked_q; + unsigned long flags; + if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + if (locked_q) + lockdep_assert_held(locked_q->queue_lock); if (!atomic_long_dec_and_test(&ioc->refcount)) return; - rcu_read_lock(); - cfq_dtor(ioc); - rcu_read_unlock(); - - kmem_cache_free(iocontext_cachep, ioc); -} -EXPORT_SYMBOL(put_io_context); + /* + * Destroy @ioc. This is a bit messy because cic's are chained + * from both ioc and queue, and ioc->lock nests inside queue_lock. + * The inner ioc->lock should be held to walk our cic_list and then + * for each cic the outer matching queue_lock should be grabbed. + * ie. We need to do reverse-order double lock dancing. + * + * Another twist is that we are often called with one of the + * matching queue_locks held as indicated by @locked_q, which + * prevents performing double-lock dance for other queues. + * + * So, we do it in two stages. The fast path uses the queue_lock + * the caller is holding and, if other queues need to be accessed, + * uses trylock to avoid introducing locking dependency. This can + * handle most cases, especially if @ioc was performing IO on only + * single device. + * + * If trylock doesn't cut it, we defer to @ioc->release_work which + * can do all the double-locking dancing. + */ + spin_lock_irqsave_nested(&ioc->lock, flags, + ioc_release_depth(locked_q)); + + while (!hlist_empty(&ioc->cic_list)) { + struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first, + struct cfq_io_context, + cic_list); + struct request_queue *this_q = cic->q; + + if (this_q != last_q) { + if (last_q && last_q != locked_q) + spin_unlock(last_q->queue_lock); + last_q = NULL; + + if (!spin_trylock(this_q->queue_lock)) + break; + last_q = this_q; + continue; + } + ioc_release_depth_inc(this_q); + cic->exit(cic); + cic->release(cic); + ioc_release_depth_dec(this_q); + } -static void cfq_exit(struct io_context *ioc) -{ - rcu_read_lock(); + if (last_q && last_q != locked_q) + spin_unlock(last_q->queue_lock); - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + spin_unlock_irqrestore(&ioc->lock, flags); - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->exit(ioc); - } - rcu_read_unlock(); + /* if no cic's left, we're done; otherwise, kick release_work */ + if (hlist_empty(&ioc->cic_list)) + kmem_cache_free(iocontext_cachep, ioc); + else + schedule_work(&ioc->release_work); } +EXPORT_SYMBOL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) @@ -92,10 +201,8 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) - cfq_exit(ioc); - - put_io_context(ioc); + atomic_dec(&ioc->nr_tasks); + put_io_context(ioc, NULL); } static struct io_context *create_task_io_context(struct task_struct *task, @@ -115,6 +222,7 @@ static struct io_context *create_task_io_context(struct task_struct *task, spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->cic_list); + INIT_WORK(&ioc->release_work, ioc_release_fn); /* try to install, somebody might already have beaten us to it */ task_lock(task); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 
e617b088c59..6cc60656040 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1778,7 +1778,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->ioc); + put_io_context(cfqd->active_cic->ioc, cfqd->queue); cfqd->active_cic = NULL; } } @@ -2812,38 +2812,6 @@ static void cfq_exit_cic(struct cfq_io_context *cic) } } -static void cfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct cfq_data *cfqd = cic_to_cfqd(cic); - - if (cfqd) { - struct request_queue *q = cfqd->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * Ensure we get a fresh copy of the ->key to prevent - * race between exiting task and queue - */ - smp_read_barrier_depends(); - if (cic->key == cfqd) - cfq_exit_cic(cic); - - spin_unlock_irqrestore(q->queue_lock, flags); - } -} - -/* - * The process that ioc belongs to has exited, we need to clean up - * and put the internal structures we have that belongs to that process. - */ -static void cfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, cfq_exit_single_io_context); -} - static struct cfq_io_context * cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { @@ -2855,8 +2823,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) cic->ttime.last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; + cic->exit = cfq_exit_cic; + cic->release = cfq_release_cic; elv_ioc_count_inc(cfq_ioc_count); } @@ -3726,7 +3694,7 @@ static void cfq_put_request(struct request *rq) BUG_ON(!cfqq->allocated[rw]); cfqq->allocated[rw]--; - put_io_context(RQ_CIC(rq)->ioc); + put_io_context(RQ_CIC(rq)->ioc, cfqq->cfqd->queue); rq->elevator_private[0] = NULL; rq->elevator_private[1] = NULL; @@ -3937,8 +3905,12 @@ static void cfq_exit_queue(struct elevator_queue *e) struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, struct cfq_io_context, queue_list); + struct io_context *ioc = cic->ioc; + spin_lock(&ioc->lock); cfq_exit_cic(cic); + cfq_release_cic(cic); + spin_unlock(&ioc->lock); } cfq_put_async_queues(cfqd); diff --git a/fs/ioprio.c b/fs/ioprio.c index 0f1b9515213..f84b380d65e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc); + put_io_context(ioc, NULL); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d1b6f4ed1f9..65c2f8c7008 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -393,6 +393,9 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif +#ifdef CONFIG_LOCKDEP + int ioc_release_depth; +#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 2c2b6da96b3..01e86312878 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,6 +3,7 @@ #include #include +#include struct cfq_queue; struct cfq_ttime { @@ -33,8 +34,8 @@ struct cfq_io_context { unsigned long changed; - void (*dtor)(struct io_context *); /* destructor */ - void (*exit)(struct io_context *); /* called on task exit */ + void (*exit)(struct cfq_io_context *); + void (*release)(struct cfq_io_context *); struct rcu_head rcu_head; }; @@ -61,6 +62,8 
@@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; void __rcu *ioc_data; + + struct work_struct release_work; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) @@ -79,7 +82,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc); +void put_io_context(struct io_context *ioc, struct request_queue *locked_q); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -87,7 +90,8 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc) { } +static inline void put_io_context(struct io_context *ioc, + struct request_queue *locked_q) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 5bcfc739bb7..2753449f203 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -887,7 +887,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); + put_io_context(new_ioc, NULL); } #endif return 0; -- cgit v1.2.3-70-g09d2 From 54848d73f9f254631303d6eab9b976855988b266 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 5 Apr 2011 13:21:19 -0600 Subject: writeback: charge leaked page dirties to active tasks It is a years-long problem that a large number of short-lived dirtiers (eg. gcc instances in a fast kernel build) may starve long-run dirtiers (eg. dd) as well as push the dirty pages to the global hard limit. The solution is to charge the pages dirtied by the exited gcc to the other random dirtying tasks. This is not perfect, but it should behave well enough in practice: since throttled tasks aren't actually running, the tasks that are running are more likely to pick up the leaked count and get throttled, promoting an equal spread. Randy: fix compile error: 'dirty_throttle_leaks' undeclared in exit.c Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Randy Dunlap Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 2 ++ kernel/exit.c | 3 +++ mm/page-writeback.c | 27 +++++++++++++++++++++++++++ 3 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a378c295851..05eaf5e3aad 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,8 @@ #include #include +DECLARE_PER_CPU(int, dirty_throttle_leaks); + /* * The 1/4 region under the global dirty thresh is for smooth dirty throttling: * diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f87..d4aac24cc46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1037,6 +1038,8 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch().
*/ tsk->state = TASK_DEAD; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 50f08241f98..619c445fc03 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1214,6 +1214,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) static DEFINE_PER_CPU(int, bdp_ratelimits); +/* + * Normal tasks are throttled by + * loop { + * dirty tsk->nr_dirtied_pause pages; + * take a snap in balance_dirty_pages(); + * } + * However there is a worst case. If every task exit immediately when dirtied + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be + * called to throttle the page dirties. The solution is to save the not yet + * throttled page dirties in dirty_throttle_leaks on task exit and charge them + * randomly into the running tasks. This works well for the above worst case, + * as the new task will pick up and accumulate the old task's leaked dirty + * count and eventually get throttled. + */ +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -1261,6 +1277,17 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, ratelimit = 0; } } + /* + * Pick up the dirtied pages by the exited tasks. This avoids lots of + * short-lived tasks (eg. gcc invocations in a kernel build) escaping + * the dirty throttling and livelock other long-run dirtiers. + */ + p = &__get_cpu_var(dirty_throttle_leaks); + if (*p > 0 && current->nr_dirtied < ratelimit) { + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); + *p -= nr_pages_dirtied; + current->nr_dirtied += nr_pages_dirtied; + } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) -- cgit v1.2.3-70-g09d2 From 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 19:25:42 -0600 Subject: writeback: dirty ratelimit - think time compensation Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be honored accurately. think time := time spent outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (resulting in a negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hit.
Pseudo code:

	period = pages_dirtied / task_ratelimit;
	think = jiffies - dirty_paused_when;
	pause = period - think;

1) normal case: period > think

	pause = period - think
	dirty_paused_when = jiffies + pause
	nr_dirtied = 0

	                    period time
	      |===============================>|
	          think time      pause time
	      |===============>|==============>|
	------|----------------|---------------|------------------------
	dirty_paused_when   jiffies

2) no pause case: period <= think

	don't pause; reduce future pause time by:
	dirty_paused_when += period
	nr_dirtied = 0

	                   period time
	      |===============================>|
	                          think time
	      |===================================================>|
	------|--------------------------------+-------------------|----
	dirty_paused_when                      jiffies

Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/sched.h | 1 + include/trace/events/writeback.h | 14 +++++++++++--- kernel/fork.c | 1 + mm/page-writeback.c | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc..984c3b29597 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,7 @@ struct task_struct { */ int nr_dirtied; int nr_dirtied_pause; + unsigned long dirty_paused_when; /* start of a write-and-pause period */ #ifdef CONFIG_LATENCYTOP int latency_record_count; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 99d1d0decf8..8588a891802 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, + unsigned long period, long pause, unsigned long start_time), TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, - dirtied, pause, start_time), + dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) @@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) + __field(unsigned long, period) + __field( long, think) ), TP_fast_assign( @@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->dirty_paused_when == 0 ? 0 : + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; + __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; ), @@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld", + "paused=%lu pause=%ld period=%lu think=%ld", __entry->bdi, __entry->limit, __entry->setpoint, @@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ - __entry->pause /* ms */ + __entry->pause, /* ms */ + __entry->period, /* ms */ + __entry->think /* ms */ ) ); diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..f8668cf6a32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96b3e7aa705..49193215582 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + long period; long pause = 0; long uninitialized_var(max_pause); bool dirty_exceeded = false; @@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long start_time = jiffies; for (;;) { + unsigned long now = jiffies; + /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); - if (nr_dirty <= freerun) + if (nr_dirty <= freerun) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; break; + } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); @@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping, task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; if (unlikely(task_ratelimit == 0)) { + period = max_pause; pause = max_pause; goto pause; } - pause = HZ * pages_dirtied / task_ratelimit; + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. + */ if (unlikely(pause <= 0)) { trace_balance_dirty_pages(bdi, dirty_thresh, @@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping, dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } pause = 1; /* avoid resetting nr_dirtied_pause below */ break; } @@ -1135,11 +1160,15 @@ pause: dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + /* * This is typically equal to (nr_dirty < dirty_thresh) and can * also keep "1000+ dd on a slow USB stick" under control. @@ -1167,11 +1196,10 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - current->nr_dirtied = 0; if (pause == 0) { /* in freerun area */ current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (pause <= max_pause / 4 && + } else if (period <= max_pause / 4 && pages_dirtied >= current->nr_dirtied_pause) { current->nr_dirtied_pause = clamp_val( dirty_ratelimit * (max_pause / 2) / HZ, -- cgit v1.2.3-70-g09d2 From 30fb6aa74011dcf595f306ca2727254d708b786e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 5 Dec 2011 18:22:48 +0100 Subject: ftrace: Fix unregister ftrace_ops accounting Multiple users of the function tracer can register their functions with the ftrace_ops structure. The accounting within ftrace will update the counter on each function record that is being traced. 
When the ftrace_ops filtering adds or removes functions, the function records will be updated accordingly if the ftrace_ops is still registered. When an ftrace_ops is removed, the counters of the function records that the ftrace_ops traces are decremented. When they reach zero, the functions that they represent are modified to stop calling the mcount code. When changes are made, the code is updated via stop_machine() with a command passed to the function to tell it what to do. There is an ENABLE and a DISABLE command that tells the called function to enable or disable the functions. But ENABLE is really a misnomer as it should just update the records, because records that have been enabled and now have a count of zero should be disabled. The DISABLE command is used to disable all functions regardless of their counter values. This is the big off switch and is not the complement of the ENABLE command. To make matters worse, when an ftrace_ops is unregistered while another ftrace_ops is still registered, neither the DISABLE nor the ENABLE command is set when calling into the stop_machine() function, and the records will not be updated to match their counters. A command is passed to that function that will update the mcount code to call the registered callback directly if it is the only one left. This means that the ftrace_ops that is still registered will have its callback called by all functions that have been set for it as well as by those set for the ftrace_ops that was just unregistered. Here's a way to trigger this bug. Compile the kernel with CONFIG_FUNCTION_PROFILER set and with CONFIG_FUNCTION_GRAPH not set:

CONFIG_FUNCTION_PROFILER=y
# CONFIG_FUNCTION_GRAPH is not set

This will force the function profiler to use the function tracer instead of the function graph tracer.

# cd /sys/kernel/debug/tracing
# echo schedule > set_ftrace_filter
# echo function > current_tracer
# cat set_ftrace_filter
schedule
# cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 692/68108025   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
     kworker/0:2-909   [000] ....   531.235574: schedule <-worker_thread
          <idle>-0     [001] .N..   531.235575: schedule <-cpu_idle
     kworker/0:2-909   [000] ....   531.235597: schedule <-worker_thread
            sshd-2563  [001] ....   531.235647: schedule <-schedule_hrtimeout_range_clock

# echo 1 > function_profile_enabled
# echo 0 > function_profile_enabled
# cat set_ftrace_filter
schedule
# cat trace
# tracer: function
#
# entries-in-buffer/entries-written: 159701/118821262   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
          <idle>-0     [002] ...1   604.870655: local_touch_nmi <-cpu_idle
          <idle>-0     [002] d..1   604.870655: enter_idle <-cpu_idle
          <idle>-0     [002] d..1   604.870656: atomic_notifier_call_chain <-enter_idle
          <idle>-0     [002] d..1   604.870656: __atomic_notifier_call_chain <-atomic_notifier_call_chain

The same problem could have happened with the trace_probe_ops, but they are modified via the set_ftrace_filter file, which does the update when the file is closed. The simple solution is to change ENABLE to UPDATE and call it every time an ftrace_ops is unregistered.
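To make the UPDATE semantics concrete, here is a small, self-contained C sketch. It is a simplification under assumed names (struct rec and update_calls are hypothetical); the real logic lives in ftrace_check_record()/__ftrace_replace_code() in the diff below. The key property is that an update pass derives each record's desired state purely from its reference count, so a record whose count dropped to zero during an unregister is turned back into a NOP even though nothing was "disabled" globally.

	#include <stdio.h>

	/* Simplified stand-in for struct dyn_ftrace and its flag bits. */
	struct rec {
		unsigned long refcnt;	/* how many registered ops trace this site */
		int enabled;		/* 1 = mcount call patched in, 0 = NOP */
	};

	/* UPDATE: bring every record in line with its current ref count. */
	static void update_calls(struct rec *recs, int n)
	{
		for (int i = 0; i < n; i++) {
			int want = recs[i].refcnt > 0;

			if (recs[i].enabled == want)
				continue;	/* cf. FTRACE_UPDATE_IGNORE */
			recs[i].enabled = want;	/* cf. ftrace_make_call()/ftrace_make_nop() */
			printf("rec %d -> %s\n", i, want ? "call" : "nop");
		}
	}

	int main(void)
	{
		/* rec 0 is still referenced, rec 1 just lost its last ops. */
		struct rec recs[2] = { { 1, 0 }, { 0, 1 } };

		update_calls(recs, 2);	/* prints: rec 0 -> call, rec 1 -> nop */
		return 0;
	}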
Link: http://lkml.kernel.org/r/1323105776-26961-3-git-send-email-jolsa@redhat.com Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1e8943fed1..25b4f4da0fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -948,7 +948,7 @@ struct ftrace_func_probe { }; enum { - FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_START_FUNC_RET = (1 << 3), @@ -1519,7 +1519,7 @@ int ftrace_text_reserved(void *start, void *end) static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int update) { unsigned long ftrace_addr; unsigned long flag = 0UL; @@ -1527,17 +1527,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If we are enabling tracing: + * If we are updating calls: * * If the record has a ref count, then we need to enable it * because someone is using it. * * Otherwise we make sure its disabled. * - * If we are disabling tracing, then disable all records that + * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (update && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ @@ -1553,7 +1553,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_nop(NULL, rec, ftrace_addr); } -static void ftrace_replace_code(int enable) +static void ftrace_replace_code(int update) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1567,7 +1567,7 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FREE) continue; - failed = __ftrace_replace_code(rec, enable); + failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1623,7 +1623,7 @@ static int __ftrace_modify_code(void *data) */ function_trace_stop++; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); @@ -1691,7 +1691,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return -ENODEV; ftrace_start_up++; - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; /* ops marked global share the filter hashes */ if (ops->flags & FTRACE_OPS_FL_GLOBAL) { @@ -1743,8 +1743,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - if (!ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -1766,7 +1765,7 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); } static void ftrace_shutdown_sysctl(void) @@ -2919,7 +2918,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - 
ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); @@ -3107,7 +3106,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash, iter->hash); if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-70-g09d2 From c88fd8634ea68e74c7d19fd2621b4078fd22864c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Aug 2011 09:53:39 -0400 Subject: ftrace: Allow archs to modify code without stop machine The stop machine method to modify all functions in the kernel (some 20,000 of them) is the safest way to do so across all archs. But some archs may not need this big hammer approach to modify code on SMP machines, and can simply update just the code they need to. Adding a weak function arch_ftrace_update_code(), which does the stop_machine() by default, will let any arch override this method. If the arch needs to check the system and then decide if it can avoid stop machine, it can still call ftrace_run_stop_machine() to use the old method. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 31 ++++++ kernel/trace/ftrace.c | 253 +++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 246 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 26eafcef75b..4f0b6fec379 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -133,6 +133,8 @@ struct ftrace_func_command { int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); +void ftrace_bug(int err, unsigned long ip); + struct seq_file; struct ftrace_probe_ops { @@ -190,6 +192,35 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); +enum { + FTRACE_UPDATE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), +}; + +enum { + FTRACE_UPDATE_IGNORE, + FTRACE_UPDATE_MAKE_CALL, + FTRACE_UPDATE_MAKE_NOP, +}; + +void arch_ftrace_update_code(int command); + +struct ftrace_rec_iter; + +struct ftrace_rec_iter *ftrace_rec_iter_start(void); +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); + +int ftrace_update_record(struct dyn_ftrace *rec, int enable); +int ftrace_test_record(struct dyn_ftrace *rec, int enable); +void ftrace_run_stop_machine(int command); +int ftrace_location(unsigned long ip); + +extern ftrace_func_t ftrace_trace_function; + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25b4f4da0fe..655b432fb89 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -947,13 +947,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; -enum { - FTRACE_UPDATE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), -}; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -1307,6 +1300,28 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } +/** + * ftrace_location - return
true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns 1 if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_location(unsigned long ip) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip == ip) + return 1; + } while_for_each_ftrace_rec(); + + return 0; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1475,7 +1490,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -1517,15 +1544,10 @@ int ftrace_text_reserved(void *start, void *end) return 0; } - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { - unsigned long ftrace_addr; unsigned long flag = 0UL; - ftrace_addr = (unsigned long)FTRACE_ADDR; - /* * If we are updating calls: * @@ -1537,20 +1559,74 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (update && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return 0; + return FTRACE_UPDATE_IGNORE; if (flag) { - rec->flags |= FTRACE_FL_ENABLED; + if (update) + rec->flags |= FTRACE_FL_ENABLED; + return FTRACE_UPDATE_MAKE_CALL; + } + + if (update) + rec->flags &= ~FTRACE_FL_ENABLED; + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. 
+ */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); } - rec->flags &= ~FTRACE_FL_ENABLED; - return ftrace_make_nop(NULL, rec, ftrace_addr); + return -1; /* unknow ftrace bug */ } static void ftrace_replace_code(int update) @@ -1576,6 +1652,78 @@ static void ftrace_replace_code(int update) } while_for_each_ftrace_rec(); } +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { @@ -1617,12 +1765,6 @@ static int __ftrace_modify_code(void *data) { int *command = data; - /* - * Do not call function tracer while we update the code. - * We are in stop machine, no worrying about races. - */ - function_trace_stop++; - if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1636,21 +1778,33 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. 
- */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - return 0; } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + static void ftrace_run_update_code(int command) { int ret; @@ -1659,8 +1813,31 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; + /* + * Do not call function tracer while we update the code. + * We are in stop machine. + */ + function_trace_stop++; - stop_machine(__ftrace_modify_code, &command, NULL); + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); -- cgit v1.2.3-70-g09d2 From 3208230983a0ee3d95be22d463257e530c684956 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 14:42:37 -0500 Subject: ftrace: Remove usage of "freed" records Records that are added to the function trace table are permanently there, except for modules. By separating out the modules to their own pages that can be freed in one shot we can remove the "freed" flag and simplify some of the record management. Another benefit of doing this is that we can also move the records around; sort them. 
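To illustrate why grouping records into per-module pages removes the need for a "freed" flag, here is a self-contained C sketch under assumed names (struct rec_page, add_page, and release_mod are hypothetical, and the owner field is an illustration; in the kernel a module's pages are contiguous in the list rather than filtered by owner). The real unlinking is done in ftrace_release_mod() in the diff below.

	#include <stdio.h>
	#include <stdlib.h>

	/* Simplified stand-in for struct ftrace_page: a linked list of record
	 * pages, where each module's records start on a fresh page. */
	struct rec_page {
		int owner;		/* 0 = core kernel, otherwise a module id */
		struct rec_page *next;
	};

	static struct rec_page *pages_head;

	static void add_page(int owner)
	{
		struct rec_page *pg = calloc(1, sizeof(*pg));

		pg->owner = owner;
		pg->next = pages_head;
		pages_head = pg;
	}

	/* Free a module's records in one shot by unlinking its pages; no
	 * per-record "freed" marker is needed and no holes are left behind. */
	static void release_mod(int owner)
	{
		struct rec_page **last = &pages_head;
		struct rec_page *pg;

		while ((pg = *last)) {
			if (pg->owner == owner) {
				*last = pg->next;
				free(pg);
			} else {
				last = &pg->next;
			}
		}
	}

	int main(void)
	{
		add_page(0);		/* core kernel records */
		add_page(1);		/* records of module 1 */
		release_mod(1);		/* module gone, core pages untouched */
		printf("remaining head owner: %d\n", pages_head->owner);
		return 0;
	}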
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 100 ++++++++++++++++++++++++------------------------- 2 files changed, 49 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4f0b6fec379..3f79bc458bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -163,7 +163,6 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_ENABLED = (1 << 30), - FTRACE_FL_FREE = (1 << 31), }; #define FTRACE_FL_MASK (0x3UL << 30) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 655b432fb89..be6888f40d2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -996,8 +996,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; - static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1421,32 +1419,8 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - rec->freelist = ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - struct dyn_ftrace *rec; - - /* First check for freed records */ - if (ftrace_free_records) { - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - FTRACE_WARN_ON_ONCE(1); - ftrace_free_records = NULL; - return NULL; - } - - ftrace_free_records = rec->freelist; - memset(rec, 0, sizeof(*rec)); - return rec; - } - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) { /* allocate another page */ @@ -1639,10 +1613,6 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - /* Skip over free records */ - if (rec->flags & FTRACE_FL_FREE) - continue; - failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); @@ -2007,11 +1977,8 @@ static int ftrace_update_code(struct module *mod) * Do the initial record conversion from mcount jump * to the NOP instructions. 
*/ - if (!ftrace_code_disable(mod, p)) { - ftrace_free_rec(p); - /* Game over */ + if (!ftrace_code_disable(mod, p)) break; - } ftrace_update_cnt++; @@ -2026,10 +1993,8 @@ static int ftrace_update_code(struct module *mod) */ if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); - if (failed) { + if (failed) ftrace_bug(failed, p->ip); - ftrace_free_rec(p); - } } } @@ -2223,9 +2188,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FREE) || - - ((iter->flags & FTRACE_ITER_FILTER) && + if (((iter->flags & FTRACE_ITER_FILTER) && !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && @@ -2602,7 +2565,6 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { ret = enter_record(hash, rec, not); if (ret < 0) { @@ -3446,9 +3408,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FREE) - continue; - if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; @@ -3566,6 +3525,27 @@ static int ftrace_process_locs(struct module *mod, unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (mod) { + if (!ftrace_pages) + return -ENOMEM; + + /* + * If the last page was full, it will be + * allocated anyway. + */ + if (ftrace_pages->index != ENTRIES_PER_PAGE) { + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return -ENOMEM; + ftrace_pages = ftrace_pages->next; + } + } + p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -3599,9 +3579,13 @@ static int ftrace_process_locs(struct module *mod, } #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; + struct ftrace_page **last_pg; struct ftrace_page *pg; mutex_lock(&ftrace_lock); @@ -3609,16 +3593,30 @@ void ftrace_release_mod(struct module *mod) if (ftrace_disabled) goto out_unlock; - do_for_each_ftrace_rec(pg, rec) { + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; if (within_module_core(rec->ip, mod)) { /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. + * As core pages are first, the first + * page should never be a module page. 
*/ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + free_page((unsigned long)pg); + } else + last_pg = &pg->next; + } out_unlock: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-70-g09d2 From a79008755497daff157f5294c02e3b940641cc11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:23:44 -0500 Subject: ftrace: Allocate the mcount record pages as groups Allocate the mcount record pages as a group of pages as big as can be allocated and waste no more than a single page. Grouping the mcount pages as much as possible helps with cache locality, as we do not need to redirect with descriptors as we cross from page to page. It also allows us to do more with the records later on (sort them with bigger benefits). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 179 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index be6888f40d2..2e7218869fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -983,12 +983,13 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; + struct dyn_ftrace *records; int index; - struct dyn_ftrace records[]; + int size; }; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) /* estimate from running different kernels */ #define NR_TO_INIT 10000 @@ -1421,14 +1422,10 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) { - /* allocate another page */ - ftrace_pages->next = - (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return NULL; - } + if (ftrace_pages->index == ftrace_pages->size) { + /* We should have allocated enough */ + if (WARN_ON(!ftrace_pages->next)) + return NULL; ftrace_pages = ftrace_pages->next; } @@ -2005,47 +2002,106 @@ static int ftrace_update_code(struct module *mod) return 0; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count) { - struct ftrace_page *pg; + int order; int cnt; - int i; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. + * We want to fill as much as possible. No more than a page + * may be empty. 
*/ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; - pg = ftrace_pages = ftrace_pages_start; + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; - /* If we fail, we'll try later anyway */ - if (!pg->next) + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) break; + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + pg = pg->next; } + return start_pg; + + free_pages: + while (start_pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; +} + +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + int cnt; + + if (!num_to_init) { + pr_info("ftrace: No functions to be traced?\n"); + return -1; + } + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + return 0; } @@ -3520,30 +3576,45 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg; + unsigned long count; unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + pg = ftrace_allocate_pages(count); + if (!pg) + return -ENOMEM; mutex_lock(&ftrace_lock); + /* * Core and each module needs their own pages, as * modules will free them when they are removed. * Force a new page to be allocated for modules. */ - if (mod) { + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = pg; + } else { if (!ftrace_pages) - return -ENOMEM; + goto out; - /* - * If the last page was full, it will be - * allocated anyway. - */ - if (ftrace_pages->index != ENTRIES_PER_PAGE) { - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return -ENOMEM; - ftrace_pages = ftrace_pages->next; + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? 
*/ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; } + + ftrace_pages->next = pg; + ftrace_pages = pg; } p = start; @@ -3557,7 +3628,8 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - ftrace_record_ip(addr); + if (!ftrace_record_ip(addr)) + break; } /* @@ -3573,9 +3645,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_update_code(mod); if (!mod) local_irq_restore(flags); + ret = 0; + out: mutex_unlock(&ftrace_lock); - return 0; + return ret; } #ifdef CONFIG_MODULES @@ -3587,6 +3661,7 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page **last_pg; struct ftrace_page *pg; + int order; mutex_lock(&ftrace_lock); @@ -3613,7 +3688,9 @@ void ftrace_release_mod(struct module *mod) ftrace_pages = next_to_ftrace_page(last_pg); *last_pg = pg->next; - free_page((unsigned long)pg); + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); } else last_pg = &pg->next; } -- cgit v1.2.3-70-g09d2 From 85ae32ae019bc1c2cc22e5f51fe0c9f2812ef68c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:30:31 -0500 Subject: ftrace: Replace record newlist with record page list As new functions come in to be initalized from mcount to nop, they are done by groups of pages. Whether it is the core kernel or a module. There's no need to keep track of these on a per record basis. At startup, and as any module is loaded, the functions to be traced are stored in a group of pages and added to the function list at the end. We just need to keep a pointer to the first page of the list that was added, and use that to know where to start on the list for initializing functions. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 5 +--- kernel/trace/ftrace.c | 68 +++++++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3f79bc458bf..31b9fd7aedc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -173,10 +173,7 @@ struct dyn_ftrace { unsigned long ip; /* address of mcount call-site */ struct dyn_ftrace *freelist; }; - union { - unsigned long flags; - struct dyn_ftrace *newlist; - }; + unsigned long flags; struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e7218869fe..366d7881f18 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -977,8 +977,6 @@ static struct ftrace_ops global_ops = { .filter_hash = EMPTY_HASH, }; -static struct dyn_ftrace *ftrace_new_addrs; - static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { @@ -988,6 +986,8 @@ struct ftrace_page { int size; }; +static struct ftrace_page *ftrace_new_pgs; + #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -1445,8 +1445,6 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->newlist = ftrace_new_addrs; - ftrace_new_addrs = rec; return rec; } @@ -1936,9 +1934,11 @@ static int ops_traces_mod(struct ftrace_ops *ops) static int ftrace_update_code(struct module *mod) { + struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + int i; /* * When adding a module, we need to check if tracers are @@ -1960,41 +1960,44 @@ static int ftrace_update_code(struct module *mod) start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - while (ftrace_new_addrs) { + for (pg = ftrace_new_pgs; pg; 
pg = pg->next) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; + for (i = 0; i < pg->index; i++) { + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - p = ftrace_new_addrs; - ftrace_new_addrs = p->newlist; - p->flags = ref; + p = &pg->records[i]; + p->flags = ref; - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) - break; + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; - ftrace_update_cnt++; + ftrace_update_cnt++; - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p->ip); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p->ip); + } } } + ftrace_new_pgs = NULL; + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; @@ -3632,6 +3635,9 @@ static int ftrace_process_locs(struct module *mod, break; } + /* These new locations need to be initialized */ + ftrace_new_pgs = pg; + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3-70-g09d2 From 68950619f8c82e468d8976130462617abea605a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 17:06:45 -0500 Subject: ftrace: Sort the mcount records on each page Sort records by ip locations of the ftrace mcount calls on each of the set of pages in the function list. This helps in localizing cache usuage when updating the function locations, as well as gives us the ability to quickly find an ip location in the list. 
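The per-page ordering can be exercised in plain user space with libc's
qsort(); this is an illustrative sketch only (the kernel instead uses
its own sort() with a swap callback, as the diff below shows), but the
comparison rule is the same:

	#include <stdio.h>
	#include <stdlib.h>

	struct rec { unsigned long ip; };

	/* Same ordering rule as the kernel's ftrace_cmp_recs comparator. */
	static int cmp_recs(const void *a, const void *b)
	{
		const struct rec *ra = a, *rb = b;

		if (ra->ip > rb->ip)
			return 1;
		if (ra->ip < rb->ip)
			return -1;
		return 0;
	}

	int main(void)
	{
		struct rec recs[] = { { 0x4000 }, { 0x1000 }, { 0x3000 }, { 0x2000 } };
		size_t i, n = sizeof(recs) / sizeof(recs[0]);

		qsort(recs, n, sizeof(recs[0]), cmp_recs);
		for (i = 0; i < n; i++)
			printf("0x%lx\n", recs[i].ip);
		return 0;
	}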
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 366d7881f18..2d6f8bcd188 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -3575,6 +3576,29 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } +static void ftrace_swap_recs(void *a, void *b, int size) +{ + struct dyn_ftrace *reca = a; + struct dyn_ftrace *recb = b; + struct dyn_ftrace t; + + t = *reca; + *reca = *recb; + *recb = t; +} + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -3638,6 +3662,11 @@ static int ftrace_process_locs(struct module *mod, /* These new locations need to be initialized */ ftrace_new_pgs = pg; + /* Make each individual set of pages sorted by ips */ + for (; pg; pg = pg->next) + sort(pg->records, pg->index, sizeof(struct dyn_ftrace), + ftrace_cmp_recs, ftrace_swap_recs); + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3-70-g09d2 From 5855fead9cc358adebd6bdeec202d040c623ae38 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 19:27:42 -0500 Subject: ftrace: Use bsearch to find record ip Now that each set of pages in the function list are sorted by ip, we can use bsearch to find a record within each set of pages. This speeds up the ftrace_location() function by magnitudes. For archs (like x86) that need to add a breakpoint at every function that will be converted from a nop to a callback and vice versa, the breakpoint callback needs to know if the breakpoint was for ftrace or not. It requires finding the breakpoint ip within the records. Doing a linear search is extremely inefficient. It is a must to be able to do a fast binary search to find these locations. 
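The lookup that this enables is the textbook bsearch() pattern; a
stand-alone sketch (illustrative only) of searching one sorted set of
records, which is what ftrace_location() now does per page in the diff
below. Each set of pages is sorted on its own, so the kernel runs one
binary search per page group:

	#include <stdio.h>
	#include <stdlib.h>

	struct rec { unsigned long ip; };

	static int cmp_recs(const void *a, const void *b)
	{
		const struct rec *ra = a, *rb = b;

		if (ra->ip > rb->ip)
			return 1;
		if (ra->ip < rb->ip)
			return -1;
		return 0;
	}

	int main(void)
	{
		/* Must already be sorted by ip for bsearch() to work. */
		struct rec recs[] = { { 0x1000 }, { 0x2000 }, { 0x3000 }, { 0x4000 } };
		struct rec key = { 0x3000 };
		struct rec *hit;

		hit = bsearch(&key, recs, 4, sizeof(recs[0]), cmp_recs);
		printf("0x3000 %s a traced location\n", hit ? "is" : "is not");
		return 0;
	}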
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d6f8bcd188..dcd3a814d39 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1300,6 +1301,19 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + /** * ftrace_location - return true if the ip giving is a traced location * @ip: the instruction pointer to check @@ -1313,11 +1327,17 @@ int ftrace_location(unsigned long ip) { struct ftrace_page *pg; struct dyn_ftrace *rec; + struct dyn_ftrace key; - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip == ip) + key.ip = ip; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) return 1; - } while_for_each_ftrace_rec(); + } return 0; } @@ -3587,18 +3607,6 @@ static void ftrace_swap_recs(void *a, void *b, int size) *recb = t; } -static int ftrace_cmp_recs(const void *a, const void *b) -{ - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; - - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) - return -1; - return 0; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) -- cgit v1.2.3-70-g09d2 From c842e975520f8ab09e293cc92f51a1f396251fd5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 18:44:44 -0500 Subject: ftrace: Fix ftrace hash record update with notrace When disabling the "notrace" records, that means we want to trace them. If the notrace_hash is zero, it means that we want to trace all records. But to disable a zero notrace_hash means nothing. The check for the notrace_hash count was incorrect with: if (hash && !hash->count) return With the correct comment above it that states that we do nothing if the notrace_hash has zero count. But !hash also means that the notrace hash has zero count. I think this was done to protect against dereferencing NULL. But if !hash is true, then we go through the following loop without doing a single thing. Fix it to: if (!hash || !hash->count) return; Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcd3a814d39..a383d6c67bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1381,7 +1381,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (hash && !hash->count) + if (!hash || !hash->count) return; } -- cgit v1.2.3-70-g09d2 From 06a51d9307380c78bb5c92e68fc80ad2c7d7f890 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 19:07:36 -0500 Subject: ftrace: Create ftrace_hash_empty() helper routine There are two types of hashes in the ftrace_ops; one type is the filter_hash and the other is the notrace_hash. Either one may be null, meaning it has no elements. But when elements are added, the hash is allocated. 
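In miniature, the helper this change converges on looks like the
following user-space sketch (names invented for the example); the NULL
test must come first so an absent hash is never dereferenced:

	#include <stdbool.h>
	#include <stdio.h>

	struct hash { int count; };

	static bool hash_empty(const struct hash *h)
	{
		return !h || !h->count;	/* !h first: never dereference NULL */
	}

	int main(void)
	{
		struct hash zero = { 0 };

		printf("%d %d\n", hash_empty(NULL), hash_empty(&zero)); /* 1 1 */
		return 0;
	}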
Throughout the code, a check needs to be made to see if a hash exists or the hash has elements, but the check if the hash exists is usually missing causing the possible "NULL pointer dereference bug". Add a helper routine called "ftrace_hash_empty()" that returns true if the hash doesn't exist or its count is zero. As they mean the same thing. Last-bug-reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a383d6c67bf..e1ee07f81ca 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -999,6 +999,11 @@ static struct ftrace_page *ftrace_new_pgs; static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; +static bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1007,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) struct hlist_head *hhd; struct hlist_node *n; - if (!hash->count) + if (ftrace_hash_empty(hash)) return NULL; if (hash->size_bits > 0) @@ -1151,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; /* Empty hash? */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return new_hash; size = 1 << hash->size_bits; @@ -1276,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); - if ((!filter_hash || !filter_hash->count || + if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && - (!notrace_hash || !notrace_hash->count || + (ftrace_hash_empty(notrace_hash) || !ftrace_lookup_ip(notrace_hash, ip))) ret = 1; else @@ -1371,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; @@ -1381,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return; } @@ -1398,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1407,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash && in_hash && !in_other_hash) match = 1; else if (!filter_hash && in_hash && - (in_other_hash || !other_hash->count)) + (in_other_hash || ftrace_hash_empty(other_hash))) match = 1; } if (!match) @@ -1950,7 +1955,7 @@ static int ops_traces_mod(struct ftrace_ops *ops) struct ftrace_hash *hash; hash = ops->filter_hash; - return !!(!hash || !hash->count); + return ftrace_hash_empty(hash); } static int ftrace_update_code(struct module *mod) @@ -2320,7 +2325,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. 
*/ - if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; -- cgit v1.2.3-70-g09d2 From fc13cb0ce45296f331263a6034aa1814203e1ac3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:41:25 -0500 Subject: ftrace: Allow other users of function tracing to use the output listing The function tracer is set up to allow any other subsystem (like perf) to use it. Ftrace already has a way to list what functions are enabled by the global_ops. It would be very helpful to let other users of the function tracer to be able to use the same code. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 31b9fd7aedc..aa7559f0a22 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -202,6 +202,14 @@ enum { FTRACE_UPDATE_MAKE_NOP, }; +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_HASH = (1 << 3), + FTRACE_ITER_ENABLED = (1 << 4), +}; + void arch_ftrace_update_code(int command); struct ftrace_rec_iter; @@ -217,6 +225,15 @@ int ftrace_location(unsigned long ip); extern ftrace_func_t ftrace_trace_function; +int ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file); +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); +int ftrace_regex_release(struct inode *inode, struct file *file); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -311,6 +328,24 @@ static inline int ftrace_text_reserved(void *start, void *end) { return 0; } + +/* + * Again users of functions that have ftrace_ops may not + * have them defined when ftrace is not enabled, but these + * functions may still be called. Use a macro instead of inline. 
+ */ +#define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) + +static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) { return -ENODEV; } +static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) { return -ENODEV; } +static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + return -ENODEV; +} +static inline int +ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e1ee07f81ca..5b105c5ddc0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2134,14 +2134,6 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) return 0; } -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), -}; - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2249,7 +2241,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -2305,7 +2297,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -2414,6 +2406,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2442,6 +2435,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) iter->pg = ftrace_pages_start; iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2462,7 +2456,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) mutex_unlock(&ftrace_lock); } -static int +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * ftrace_regex_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). 
+ */ +int ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { @@ -2542,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static loff_t +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -3095,14 +3105,14 @@ out_unlock: return ret; } -static ssize_t +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return ftrace_regex_write(file, ubuf, cnt, ppos, 1); } -static ssize_t +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3292,8 +3302,7 @@ static void __init set_ftrace_early_filters(void) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } -static int -ftrace_regex_release(struct inode *inode, struct file *file) +int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; -- cgit v1.2.3-70-g09d2 From 69a3083c4a7df0322d97bb2b43a33cb12af8131a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 15:21:16 -0500 Subject: ftrace: Decouple hash items from showing filtered functions The set_ftrace_filter shows "hashed" functions, which are functions that are added with operations to them (like traceon and traceoff). As other subsystems may be able to show what functions they are using for function tracing, the hash items should no longer be shown just because the FILTER flag is set. As they have nothing to do with other subsystems filters. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 5 +++-- kernel/trace/ftrace.c | 16 ++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index aa7559f0a22..d1ff0de1897 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -206,8 +206,9 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), + FTRACE_ITER_DO_HASH = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_ENABLED = (1 << 5), }; void arch_ftrace_update_code(int command); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b105c5ddc0..5728d9aa632 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2198,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + if (iter->func_pos > *pos) return NULL; @@ -2343,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p) { - if (iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); - - return NULL; - } + if (!p) + return t_hash_start(m, pos); return iter; } @@ -2541,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, - inode, file); + return ftrace_regex_open(&global_ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); } static int -- cgit v1.2.3-70-g09d2 From d2d45c7a03a2b1a14159cbb665e9dd60991a7d4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:44:09 -0500 Subject: tracing: Have stack_tracer use a separate list of functions The stack_tracer is used to look at every function and check if the current stack is bigger than the last recorded max stack size. 
When a new max is found, then it saves that stack off. Currently the stack tracer is limited by the global_ops of the function tracer. As the stack tracer has nothing to do with the ftrace function tracer, except that it uses it as its internal engine, the stack tracer should have its own list. A new file is added to the tracing debugfs directory called: stack_trace_filter that can be used to select which functions you want to check the stack on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d9..0398b7c7afd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,7 +133,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t @@ -311,6 +310,21 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -358,6 +372,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); + trace_create_file("stack_trace_filter", 0444, d_tracer, + NULL, &stack_trace_filter_fops); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3-70-g09d2 From 2a85a37f168d2b4d74d493b578af4dc9032be92e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 21:57:44 -0500 Subject: ftrace: Allow access to the boot time function enabling Change set_ftrace_early_filter() to ftrace_set_early_filter() and make it a global function. This will allow other subsystems in the kernel to be able to enable function tracing at start up and reuse the ftrace function parsing code. 
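A hedged sketch of what such a subsystem could look like, using only
interfaces shown in this series (the ops, callback, and filter string
here are hypothetical names; compare the stack tracer patch that
follows, which does exactly this):

	#include <linux/ftrace.h>
	#include <linux/init.h>

	static void my_trace_call(unsigned long ip, unsigned long parent_ip)
	{
		/* runs for every function left in by the filter */
	}

	static struct ftrace_ops my_ops __read_mostly = {
		.func = my_trace_call,
	};

	/* e.g. filled in from a kernel command-line parameter */
	static char my_filter_buf[] __initdata = "schedule,vfs_read";

	static __init int my_tracer_init(void)
	{
		if (my_filter_buf[0])
			ftrace_set_early_filter(&my_ops, my_filter_buf, 1);

		return register_ftrace_function(&my_ops);
	}
	device_initcall(my_tracer_init);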
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d1ff0de1897..41df6f50165 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -235,6 +235,9 @@ ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin); int ftrace_regex_release(struct inode *inode, struct file *file); +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5728d9aa632..683d559a0ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3279,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init -set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; @@ -3293,9 +3293,9 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); -- cgit v1.2.3-70-g09d2 From 762e1207889b3451c50d365b741af6f9ce958886 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 22:01:00 -0500 Subject: tracing: Have stack tracing set filtered functions at boot Add stacktrace_filter= to the kernel command line that lets the user pick specific functions to check the stack on. Signed-off-by: Steven Rostedt --- Documentation/kernel-parameters.txt | 8 ++++++++ kernel/trace/trace_stack.c | 11 +++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fd5c913c33c..fde2ae06539 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2435,6 +2435,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. stacktrace [FTRACE] Enabled the stack tracer on boot up. + stacktrace_filter=[function-list] + [FTRACE] Limit the functions that the stack tracer + will trace at boot up. function-list is a comma separated + list of functions. This list can be changed at run + time by the stack_trace_filter file in the debugfs + tracing directory. Note, this enables stack tracing + and the stacktrace above is not needed. 
+ sti= [PARISC,HW] Format: Set the STI (builtin display/keyboard on the HP-PARISC diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0398b7c7afd..d4545f49242 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,9 @@ #include #include #include + +#include + #include "trace.h" #define STACK_TRACE_ENTRIES 500 @@ -352,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + static __init int enable_stacktrace(char *str) { + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; return 1; @@ -375,6 +383,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace_filter", 0444, d_tracer, NULL, &stack_trace_filter_fops); + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3-70-g09d2 From 38b78eb855409a05f9d370228bec1955e6878e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Dec 2011 14:31:35 -0800 Subject: tracing: Factorize filter creation There are four places where new filter for a given filter string is created, which involves several different steps. This patch factors those steps into create_[system_]filter() functions which in turn make use of create_filter_{start|finish}() for common parts. The only functional change is that if replace_filter_string() is requested and fails, creation fails without any side effect instead of being ignored. Note that system filter is now installed after the processing is complete which makes freeing before and then restoring filter string on error unncessary. -v2: Rebased to resolve conflict with 49aa29513e and updated both create_filter() functions to always set *filterp instead of requiring the caller to clear it to %NULL on entry. 
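On the caller side, the -v2 convention (always set *filterp) makes
error handling uniform; a sketch of the expected pattern, mirroring
what ftrace_profile_set_filter() becomes in the diff below
(example_set_filter is a made-up wrapper, not part of the patch):

	static int example_set_filter(struct ftrace_event_call *call, char *str,
				      struct event_filter **dst)
	{
		struct event_filter *filter;
		int err;

		err = create_filter(call, str, false, &filter);
		if (!err)
			*dst = filter;		/* caller now owns the filter */
		else
			__free_filter(filter);	/* may carry error info; free it */

		return err;
	}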
Link: http://lkml.kernel.org/r/1323988305-1469-2-git-send-email-tj@kernel.org Signed-off-by: Tejun Heo Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 283 +++++++++++++++++++------------------ 1 file changed, 142 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f04cc3136bd..24aee712745 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system, return -ENOMEM; } +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, filter_str, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. 
+ */ +static int create_system_filter(struct event_subsystem *system, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(system, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; - struct event_filter *tmp; int err = 0; mutex_lock(&event_mutex); @@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) { - kfree(ps); - goto out_unlock; - } - - replace_filter_string(filter, filter_string); - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) { - append_filter_err(ps, filter); - goto out; - } + err = create_filter(call, filter_string, true, &filter); - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; -out: /* * Always swap the call filter with the new filter * even if there was an error. If there was an error * in the filter, we disable the filter and show the error * string */ - tmp = call->filter; - rcu_assign_pointer(call->filter, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); + if (filter) { + struct event_filter *tmp = call->filter; + + if (!err) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + filter_disable(call); + + rcu_assign_pointer(call->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } } - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); out_unlock: mutex_unlock(&event_mutex); @@ -1811,7 +1902,6 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; int err = 0; @@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) - goto out; - - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - - /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). - */ - __free_filter(system->filter); - system->filter = filter; - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) - goto err_filter; - - err = replace_system_preds(system, ps, filter_string); - if (err) - goto err_filter; - -out: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); + err = create_system_filter(system, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). 
+ */ + __free_filter(system->filter); + system->filter = filter; + } out_unlock: mutex_unlock(&event_mutex); return err; - -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; } #ifdef CONFIG_PERF_EVENTS @@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct filter_parse_state *ps; struct ftrace_event_call *call; mutex_lock(&event_mutex); @@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_filter(); - if (!filter) { - err = PTR_ERR(filter); - goto out_unlock; - } - - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); + err = create_filter(call, filter_str, false, &filter); if (!err) event->filter = filter; - -free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - -free_filter: - if (err) + else __free_filter(filter); out_unlock: @@ -1954,43 +1991,6 @@ out_unlock: #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" -static int test_get_filter(char *filter_str, struct ftrace_event_call *call, - struct event_filter **pfilter) -{ - struct event_filter *filter; - struct filter_parse_state *ps; - int err = -ENOMEM; - - filter = __alloc_filter(); - if (!filter) - goto out; - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) - *pfilter = filter; - - free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - - free_filter: - if (err) - __free_filter(filter); - - out: - return err; -} - #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ @@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = test_get_filter(d->filter, &event_ftrace_test_filter, - &filter); + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); + __free_filter(filter); break; } -- cgit v1.2.3-70-g09d2 From b7e724d303b684655e4ca3dabd5a6840ad19012d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: reverse arguments to security_capable security_capable takes ns, cred, cap. But the LSM capable() hook takes cred, ns, cap. The capability helper functions also take cred, ns, cap. Rather than flip argument order just to flip it back, leave them alone. Heck, this should be a little faster since argument will be in the right place! 
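A small sketch of a check under the new convention, cred first, then
namespace, then capability, mirroring the kernel/capability.c hunk
below (example_priv_check is an invented name for illustration):

	/* Returns true if the current task may perform the privileged action. */
	static bool example_priv_check(void)
	{
		return security_capable(current_cred(), &init_user_ns,
					CAP_SYS_ADMIN) == 0;
	}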
Signed-off-by: Eric Paris --- drivers/pci/pci-sysfs.c | 2 +- include/linux/security.h | 6 +++--- kernel/capability.c | 2 +- security/security.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 7bcf12adced..a4457ab6134 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -431,7 +431,7 @@ pci_read_config(struct file *filp, struct kobject *kobj, u8 *data = (u8*) buf; /* Several chips lock up trying to read undefined config space */ - if (security_capable(&init_user_ns, filp->f_cred, CAP_SYS_ADMIN) == 0) { + if (security_capable(filp->f_cred, &init_user_ns, CAP_SYS_ADMIN) == 0) { size = dev->cfg_size; } else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { size = 128; diff --git a/include/linux/security.h b/include/linux/security.h index 4921163b275..ee969ff40a2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1666,7 +1666,7 @@ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -int security_capable(struct user_namespace *ns, const struct cred *cred, +int security_capable(const struct cred *cred, struct user_namespace *ns, int cap); int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap); @@ -1863,8 +1863,8 @@ static inline int security_capset(struct cred *new, return cap_capset(new, old, effective, inheritable, permitted); } -static inline int security_capable(struct user_namespace *ns, - const struct cred *cred, int cap) +static inline int security_capable(const struct cred *cred, + struct user_namespace *ns, int cap) { return cap_capable(cred, ns, cap, SECURITY_CAP_AUDIT); } diff --git a/kernel/capability.c b/kernel/capability.c index 283c529f8b1..d98392719ad 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -374,7 +374,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(ns, current_cred(), cap) == 0) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } diff --git a/security/security.c b/security/security.c index 9ae68c64455..b9e57f4fc44 100644 --- a/security/security.c +++ b/security/security.c @@ -154,7 +154,7 @@ int security_capset(struct cred *new, const struct cred *old, effective, inheritable, permitted); } -int security_capable(struct user_namespace *ns, const struct cred *cred, +int security_capable(const struct cred *cred, struct user_namespace *ns, int cap) { return security_ops->capable(cred, ns, cap, SECURITY_CAP_AUDIT); -- cgit v1.2.3-70-g09d2 From 2920a8409de5a51575d03deca07e5bb2be6fc98d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: remove all _real_ interfaces The name security_real_capable and security_real_capable_noaudit just don't make much sense to me. Convert them to use security_capable and security_capable_noaudit. Signed-off-by: Eric Paris Acked-by: Serge E. 
Hallyn --- include/linux/security.h | 25 ------------------------- kernel/capability.c | 18 +++++++++++++++--- security/security.c | 24 ------------------------ 3 files changed, 15 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index caff54eee68..e345a9313a6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1670,10 +1670,6 @@ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap); int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, int cap); -int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, - int cap); -int security_real_capable_noaudit(struct task_struct *tsk, - struct user_namespace *ns, int cap); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); @@ -1876,27 +1872,6 @@ static inline int security_capable_noaudit(const struct cred *cred, return cap_capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); } -static inline int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap) -{ - int ret; - - rcu_read_lock(); - ret = cap_capable(__task_cred(tsk), ns, cap, SECURITY_CAP_AUDIT); - rcu_read_unlock(); - return ret; -} - -static inline -int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap) -{ - int ret; - - rcu_read_lock(); - ret = cap_capable(__task_cred(tsk), ns, cap, SECURITY_CAP_NOAUDIT); - rcu_read_unlock(); - return ret; -} - static inline int security_quotactl(int cmds, int type, int id, struct super_block *sb) { diff --git a/kernel/capability.c b/kernel/capability.c index d98392719ad..ff50ab62cfc 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -298,7 +298,11 @@ error: */ bool has_capability(struct task_struct *t, int cap) { - int ret = security_real_capable(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), &init_user_ns, cap); + rcu_read_unlock(); return (ret == 0); } @@ -317,7 +321,11 @@ bool has_capability(struct task_struct *t, int cap) bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { - int ret = security_real_capable(t, ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), ns, cap); + rcu_read_unlock(); return (ret == 0); } @@ -335,7 +343,11 @@ bool has_ns_capability(struct task_struct *t, */ bool has_capability_noaudit(struct task_struct *t, int cap) { - int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap); + rcu_read_unlock(); return (ret == 0); } diff --git a/security/security.c b/security/security.c index b7edaae77d1..8900c5c4db5 100644 --- a/security/security.c +++ b/security/security.c @@ -166,30 +166,6 @@ int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, return security_ops->capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); } -int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, - int cap) -{ - const struct cred *cred; - int ret; - - cred = get_task_cred(tsk); - ret = security_ops->capable(cred, ns, cap, SECURITY_CAP_AUDIT); - put_cred(cred); - return ret; -} - -int security_real_capable_noaudit(struct task_struct *tsk, - struct user_namespace *ns, int cap) -{ - const struct cred *cred; - int ret; - - cred = get_task_cred(tsk); - ret = 
security_ops->capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); - put_cred(cred); - return ret; -} - int security_quotactl(int cmds, int type, int id, struct super_block *sb) { return security_ops->quotactl(cmds, type, id, sb); -- cgit v1.2.3-70-g09d2 From 25e75703410a84b80623da3653db6b70282e5c6a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: call has_ns_capability from has_capability Declare the more specific has_ns_capability first in the code and then call it from has_capability. The declaration reversal isn't strictly necessary since they are both declared in header files, but it just makes sense to put more specific functions first in the code. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index ff50ab62cfc..fb815d1b9ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -287,47 +287,41 @@ error: } /** - * has_capability - Does a task have a capability in init_user_ns + * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. + * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability(struct task_struct *t, int cap) +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), &init_user_ns, cap); + ret = security_capable(__task_cred(t), ns, cap); rcu_read_unlock(); return (ret == 0); } /** - * has_capability - Does a task have a capability in a specific user ns + * has_capability - Does a task have a capability in init_user_ns * @t: The task in question - * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the specified user namespace, false if not. + * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_ns_capability(struct task_struct *t, - struct user_namespace *ns, int cap) +bool has_capability(struct task_struct *t, int cap) { - int ret; - - rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); - rcu_read_unlock(); - - return (ret == 0); + return has_ns_capability(t, &init_user_ns, cap); } /** -- cgit v1.2.3-70-g09d2 From 7b61d648499e74dbec3d4ce645675e0ae040ae78 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: introduce new has_ns_capability_noaudit For consistency in interfaces, introduce a new interface called has_ns_capability_noaudit. It checks if the given task has the given capability in the given namespace. Use this new function from has_capability_noaudit. Signed-off-by: Eric Paris Acked-by: Serge E.
Hallyn --- include/linux/capability.h | 2 ++ kernel/capability.c | 30 +++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index c4211235000..63f59fa8769 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -543,6 +543,8 @@ extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); +extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); diff --git a/kernel/capability.c b/kernel/capability.c index fb815d1b9ea..d8398e96247 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -325,27 +325,47 @@ bool has_capability(struct task_struct *t, int cap) } /** - * has_capability_noaudit - Does a task have a capability (unaudited) + * has_ns_capability_noaudit - Does a task have a capability (unaudited) + * in a specific user ns. * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to init_user_ns, false if not. Don't write an - * audit message for the check. + * currently in effect to the specified user namespace, false if not. + * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability_noaudit(struct task_struct *t, int cap) +bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap); + ret = security_capable_noaudit(__task_cred(t), ns, cap); rcu_read_unlock(); return (ret == 0); } +/** + * has_capability_noaudit - Does a task have a capability (unaudited) in the + * initial user ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + return has_ns_capability_noaudit(t, &init_user_ns, cap); +} + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3-70-g09d2 From 105ddf49cd301b7929a92f269440e8e562ef19db Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: style only - move capable below ns_capable Although the current code is fine for consistency this moves the capable code below the function it calls in the c file. It doesn't actually change code. Signed-off-by: Eric Paris Acked-by: Serge E. 
Hallyn --- kernel/capability.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index d8398e96247..5f99e5d68e1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -366,22 +366,6 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -408,6 +392,22 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + /** * task_ns_capable - Determine whether current task has a superior * capability targeted at a specific task's user namespace. -- cgit v1.2.3-70-g09d2 From d2a7009f0bb03fa22ad08dd25472efa0568126b9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: ns_capable can use the cap helpers rather than lsm call Just to reduce the number of places to change if we ever change the LSM hook, use the capability helpers internally when possible. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 5f99e5d68e1..47626446c39 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(current_cred(), ns, cap) == 0) { + if (has_ns_capability(current, ns, cap)) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3-70-g09d2 From f1c84dae0ecc51aa35c81f19a0ebcd6c0921ddcb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: remove task_ns_* functions task_ in the front of a function, in the security subsystem anyway, means to me at least, that we are operating with that task as the subject of the security decision. In this case what it means is that we are using current as the subject but we use the task to get the right namespace. Who in the world would ever realize that's what task_ns_capability means just by the name? This patch eliminates the task_ns functions entirely and uses the ns_capable function instead. This means we explicitly open code the ns in question in the caller. I think it makes the caller a LOT clearer about what is going on. Signed-off-by: Eric Paris Acked-by: Serge E.
Hallyn --- include/linux/capability.h | 1 - include/linux/cred.h | 6 ++++-- kernel/capability.c | 14 -------------- kernel/ptrace.c | 4 ++-- kernel/sched.c | 2 +- 5 files changed, 7 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 63f59fa8769..e3e8d9cb9b0 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -547,7 +547,6 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); -extern bool task_ns_capable(struct task_struct *t, int cap); extern bool nsown_capable(int cap); /* audit system wants to get cap info from files as well */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 40308969ed0..adadf71a732 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -358,10 +358,12 @@ static inline void put_cred(const struct cred *_cred) #define current_security() (current_cred_xxx(security)) #ifdef CONFIG_USER_NS -#define current_user_ns() (current_cred_xxx(user_ns)) +#define current_user_ns() (current_cred_xxx(user_ns)) +#define task_user_ns(task) (task_cred_xxx((task), user_ns)) #else extern struct user_namespace init_user_ns; -#define current_user_ns() (&init_user_ns) +#define current_user_ns() (&init_user_ns) +#define task_user_ns(task) (&init_user_ns) #endif diff --git a/kernel/capability.c b/kernel/capability.c index 47626446c39..74fb3b60304 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -408,20 +408,6 @@ bool capable(int cap) } EXPORT_SYMBOL(capable); -/** - * task_ns_capable - Determine whether current task has a superior - * capability targeted at a specific task's user namespace. - * @t: The task whose user namespace is targeted. - * @cap: The capability in question. - * - * Return true if it does, false otherwise. 
- */ -bool task_ns_capable(struct task_struct *t, int cap) -{ - return ns_capable(task_cred_xxx(t, user)->user_ns, cap); -} -EXPORT_SYMBOL(task_ns_capable); - /** * nsown_capable - Check superior capability to one's own user_ns * @cap: The capability in question diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a70d2a5d8c7..210bbf045ee 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -196,7 +196,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) + if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -266,7 +266,7 @@ static int ptrace_attach(struct task_struct *task, long request, task->ptrace = PT_PTRACED; if (seize) task->ptrace |= PT_SEIZED; - if (task_ns_capable(task, CAP_SYS_PTRACE)) + if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/kernel/sched.c b/kernel/sched.c index b50b0f0c9aa..5670028a9c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5409,7 +5409,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) + if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); -- cgit v1.2.3-70-g09d2 From 69f594a38967f4540ce7a29b3fd214e68a8330bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: ptrace: do not audit capability check when outputting /proc/pid/stat Reading /proc/pid/stat of another process checks if one has ptrace permissions on that process. If one does have permissions, it outputs some data about the process which might have security and attack implications. If the current task does not have ptrace permissions, the read still works, but those fields are filled with innocuous (0) values. Since this check and a subsequent denial is not a violation of the security policy, we should not audit such denials. This can be quite useful when removing ptrace broadly across a system without flooding the logs when ps is run or something which harmlessly walks proc. Signed-off-by: Eric Paris Acked-by: Serge E.
Hallyn --- fs/proc/array.c | 2 +- include/linux/ptrace.h | 5 +++-- kernel/ptrace.c | 12 ++++++++++-- security/selinux/hooks.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..ddffd7a88b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 800f113bea6..a27e56ca41a 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -127,8 +127,9 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -#define PTRACE_MODE_READ 1 -#define PTRACE_MODE_ATTACH 2 +#define PTRACE_MODE_READ 0x01 +#define PTRACE_MODE_ATTACH 0x02 +#define PTRACE_MODE_NOAUDIT 0x04 /* Returns 0 on success, -errno on denial. */ extern int __ptrace_may_access(struct task_struct *task, unsigned int mode); /* Returns true on success, false on denial. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 210bbf045ee..c890ac9a796 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -161,6 +161,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } +static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +{ + if (mode & PTRACE_MODE_NOAUDIT) + return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + else + return has_ns_capability(current, ns, CAP_SYS_PTRACE); +} + int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; @@ -187,7 +195,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) cred->gid == tcred->sgid && cred->gid == tcred->gid)) goto ok; - if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + if (ptrace_has_cap(tcred->user->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -196,7 +204,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c9605c4a2e0..14f94cd29c8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1809,7 +1809,7 @@ static int selinux_ptrace_access_check(struct task_struct *child, if (rc) return rc; - if (mode == PTRACE_MODE_READ) { + if (mode & PTRACE_MODE_READ) { u32 sid = current_sid(); u32 csid = task_sid(child); return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL); -- cgit v1.2.3-70-g09d2 From fd778461524849afd035679030ae8e8873c72b81 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:16 -0500 Subject: security: remove the security_netlink_recv hook as it is equivalent to capable() Once upon a time netlink was not sync and we had to get the effective capabilities from the skb that was being received. Today we instead get the capabilities from the current task. This has rendered the entire purpose of the hook moot as it is now functionally equivalent to the capable() call. 
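Every converted receive path follows the same shape; a minimal sketch, assuming a hypothetical handler name (the real call sites are in the diff below): the per-skb credential check becomes a plain check on current's credentials via capable().

	/*
	 * Hypothetical netlink receive handler, for illustration only.
	 */
	static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
	{
		/* before: if (security_netlink_recv(skb, CAP_NET_ADMIN)) */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;

		/* ... process the privileged netlink request ... */
		return 0;
	}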
Signed-off-by: Eric Paris --- drivers/scsi/scsi_netlink.c | 2 +- include/linux/security.h | 14 -------------- kernel/audit.c | 4 ++-- net/core/rtnetlink.c | 2 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/ipv4/netfilter/ip_queue.c | 2 +- net/ipv6/netfilter/ip6_queue.c | 2 +- net/netfilter/nfnetlink.c | 2 +- net/netlink/genetlink.c | 2 +- net/xfrm/xfrm_user.c | 2 +- security/capability.c | 1 - security/commoncap.c | 8 -------- security/security.c | 6 ------ security/selinux/hooks.c | 19 ------------------- 14 files changed, 10 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 26a8a45584e..feee1cc39ea 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -111,7 +111,7 @@ scsi_nl_rcv_msg(struct sk_buff *skb) goto next_msg; } - if (security_netlink_recv(skb, CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN)) { err = -EPERM; goto next_msg; } diff --git a/include/linux/security.h b/include/linux/security.h index e345a9313a6..ba2d531c123 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -95,7 +95,6 @@ struct xfrm_user_sec_ctx; struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); -extern int cap_netlink_recv(struct sk_buff *skb, int cap); void reset_security_ops(void); @@ -792,12 +791,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @skb contains the sk_buff structure for the netlink message. * Return 0 if the information was successfully saved and message * is allowed to be transmitted. - * @netlink_recv: - * Check permission before processing the received netlink message in - * @skb. - * @skb contains the sk_buff structure for the netlink message. - * @cap indicates the capability required - * Return 0 if permission is granted. * * Security hooks for Unix domain networking. * @@ -1556,7 +1549,6 @@ struct security_operations { struct sembuf *sops, unsigned nsops, int alter); int (*netlink_send) (struct sock *sk, struct sk_buff *skb); - int (*netlink_recv) (struct sk_buff *skb, int cap); void (*d_instantiate) (struct dentry *dentry, struct inode *inode); @@ -1803,7 +1795,6 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode); int security_getprocattr(struct task_struct *p, char *name, char **value); int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); -int security_netlink_recv(struct sk_buff *skb, int cap); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); @@ -2478,11 +2469,6 @@ static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb) return cap_netlink_send(sk, skb); } -static inline int security_netlink_recv(struct sk_buff *skb, int cap) -{ - return cap_netlink_recv(skb, cap); -} - static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return -EOPNOTSUPP; diff --git a/kernel/audit.c b/kernel/audit.c index 0a1355ca3d7..f3ba55fa0b7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: - if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... 
AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) + if (!capable(CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 99d9e953fe3..d3a62819671 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1931,7 +1931,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + if (kind != 2 && !capable(CAP_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 69975e0bcde..1531135130d 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -108,7 +108,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index e59aabd0eae..ffabb267471 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -430,7 +430,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); spin_lock_bh(&queue_lock); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index e63c3972a73..5e5ce778be7 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -431,7 +431,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); spin_lock_bh(&queue_lock); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 1905976b513..e6c2b8f3218 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -130,7 +130,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) const struct nfnetlink_subsystem *ss; int type, err; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) return -EPERM; /* All the messages must at least contain nfgenmsg */ diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 482fa571b4e..05fedbf489a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -516,7 +516,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - security_netlink_recv(skb, CAP_NET_ADMIN)) + !capable(CAP_NET_ADMIN)) return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0256b8a0a7c..71de86698ef 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2290,7 +2290,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || diff --git a/security/capability.c b/security/capability.c index 2984ea4f776..a2c064d1044 100644 --- a/security/capability.c +++ b/security/capability.c @@ -999,7 +999,6 @@ void __init security_fixup_ops(struct security_operations *ops) 
set_to_cap_if_null(ops, sem_semctl); set_to_cap_if_null(ops, sem_semop); set_to_cap_if_null(ops, netlink_send); - set_to_cap_if_null(ops, netlink_recv); set_to_cap_if_null(ops, d_instantiate); set_to_cap_if_null(ops, getprocattr); set_to_cap_if_null(ops, setprocattr); diff --git a/security/commoncap.c b/security/commoncap.c index 89f02ff66af..7817a763444 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -56,14 +56,6 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) return 0; } -int cap_netlink_recv(struct sk_buff *skb, int cap) -{ - if (!cap_raised(current_cap(), cap)) - return -EPERM; - return 0; -} -EXPORT_SYMBOL(cap_netlink_recv); - /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use diff --git a/security/security.c b/security/security.c index 8900c5c4db5..85481a9c563 100644 --- a/security/security.c +++ b/security/security.c @@ -922,12 +922,6 @@ int security_netlink_send(struct sock *sk, struct sk_buff *skb) return security_ops->netlink_send(sk, skb); } -int security_netlink_recv(struct sk_buff *skb, int cap) -{ - return security_ops->netlink_recv(skb, cap); -} -EXPORT_SYMBOL(security_netlink_recv); - int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return security_ops->secid_to_secctx(secid, secdata, seclen); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 14f94cd29c8..3e37d25a9bb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4713,24 +4713,6 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) return selinux_nlmsg_perm(sk, skb); } -static int selinux_netlink_recv(struct sk_buff *skb, int capability) -{ - int err; - struct common_audit_data ad; - u32 sid; - - err = cap_netlink_recv(skb, capability); - if (err) - return err; - - COMMON_AUDIT_DATA_INIT(&ad, CAP); - ad.u.cap = capability; - - security_task_getsecid(current, &sid); - return avc_has_perm(sid, sid, SECCLASS_CAPABILITY, - CAP_TO_MASK(capability), &ad); -} - static int ipc_alloc_security(struct task_struct *task, struct kern_ipc_perm *perm, u16 sclass) @@ -5459,7 +5441,6 @@ static struct security_operations selinux_ops = { .vm_enough_memory = selinux_vm_enough_memory, .netlink_send = selinux_netlink_send, - .netlink_recv = selinux_netlink_recv, .bprm_set_creds = selinux_bprm_set_creds, .bprm_committing_creds = selinux_bprm_committing_creds, -- cgit v1.2.3-70-g09d2 From 9b9fb610f6800e0db46cccd8618dd7e609c9bb5a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 10 Jan 2012 09:24:05 +0900 Subject: sched: Remove empty #ifdefs Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F0B8525.8070901@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbfd04a214..457c881873c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7136,10 +7136,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_RT_GROUP_SCHED -#else /* !CONFIG_RT_GROUP_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); @@ -7248,9 +7244,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -#endif - #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long 
to_ratio(u64 period, u64 runtime) { -- cgit v1.2.3-70-g09d2 From c6968e73b90c2a2fb9a32d4bad249f8f70f70125 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:31 -0800 Subject: PM/Hibernate: do not count debug pages as savable When debugging with CONFIG_DEBUG_PAGEALLOC and debug_guardpage_minorder > 0, we have a lot of free pages that are not marked as such. The snapshot code accounts them as savable, which causes hibernate memory preallocation to fail. It is pretty hard to make hibernate allocation succeed with debug_guardpage_minorder=1. This change at least makes it possible when the system has a relatively large amount of RAM. Signed-off-by: Stanislaw Gruszka Acked-by: Rafael J. Wysocki Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c144139..1cf88900ec4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } -- cgit v1.2.3-70-g09d2 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from the OOM killer. One problem is that it's inherited at fork(). When a daemon sets oom_score_adj and makes children, it's hard to know where the value was set. This patch adds 3 tracepoints useful for debugging: - creating a new task - renaming a task (exec) - setting oom_score_adj To debug, users need to enable the relevant trace points. Filtering may be useful, as in: # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable The output will look like this.
# grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +++ fs/proc/base.c | 3 +++ include/trace/events/oom.h | 33 ++++++++++++++++++++++++ include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 +++++ mm/oom_kill.c | 6 +++++ 6 files changed, 113 insertions(+) create mode 100644 include/trace/events/oom.h create mode 100644 include/trace/events/task.h (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7..aeb135c7ff5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f..1aab5fe05a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 00000000000..dd4ba3b9200 --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 00000000000..b53add02e92 --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM task + +#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c1..5e1391b5ade 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce..7c122faa05c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); 
spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; -- cgit v1.2.3-70-g09d2 From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal.c | 6 +----- include/linux/signal.h | 1 + kernel/signal.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb221c..46a01bdc27e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/include/linux/signal.h b/include/linux/signal.h index a822300a253..7987ce74874 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern void block_sigmask(struct k_sigaction *ka, int signr); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep; diff --git a/kernel/signal.c b/kernel/signal.c index bb0efa5705e..d532f1709fb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2318,6 +2318,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. 
Other threads should be notified now to take -- cgit v1.2.3-70-g09d2 From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:11:35 -0800 Subject: workqueue: make alloc_workqueue() take printf fmt and args for name alloc_workqueue() currently expects the passed in @name pointer to remain accessible. This is inconvenient and a bit silly given that the whole wq is being dynamically allocated. This patch updates alloc_workqueue() and friends to take printf format string instead of opaque string and matching varargs at the end. The name is allocated together with the wq and formatted. alloc_ordered_workqueue() is converted to a macro to unify varargs handling with alloc_workqueue(), and, while at it, add comment to alloc_workqueue(). None of the current in-kernel users pass in string with '%' as constant name and this change shouldn't cause any problem. [akpm@linux-foundation.org: use __printf] Signed-off-by: Tejun Heo Suggested-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 47 +++++++++++++++++++++++++++++++---------------- kernel/workqueue.c | 32 ++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0d556deb497..eb8b9f15f2e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct * -__alloc_workqueue_key(const char *name, unsigned int flags, int max_active, - struct lock_class_key *key, const char *lock_name); +__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, + struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); +/** + * alloc_workqueue - allocate a workqueue + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @args: args for @fmt + * + * Allocate a workqueue with the specified parameters. For detailed + * information on WQ_* flags, please refer to Documentation/workqueue.txt. + * + * The __lock_name macro dance is to guarantee that single lock_class_key + * doesn't end up with different namesm, which isn't allowed by lockdep. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ #ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ +#define alloc_workqueue(fmt, flags, max_active, args...) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ + if (__builtin_constant_p(fmt)) \ + __lock_name = (fmt); \ else \ - __lock_name = #name; \ + __lock_name = #fmt; \ \ - __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + &__key, __lock_name, ##args); \ }) #else -#define alloc_workqueue(name, flags, max_active) \ - __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL) +#define alloc_workqueue(fmt, flags, max_active, args...) 
\ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + NULL, NULL, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue - * @name: name of the workqueue + * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are @@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -static inline struct workqueue_struct * -alloc_ordered_workqueue(const char *name, unsigned int flags) -{ - return alloc_workqueue(name, WQ_UNBOUND | flags, 1); -} +#define alloc_ordered_workqueue(fmt, flags, args...) \ + alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a81..bec7b5b53e0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; -- cgit v1.2.3-70-g09d2 From 6b550f9495947fc279d12c38feaf98500e8d0646 Mon Sep 17 00:00:00 2001 From: "Serge E. 
Hallyn" Date: Tue, 10 Jan 2012 15:11:37 -0800 Subject: user namespace: make signal.c respect user namespaces ipc/mqueue.c: for __SI_MESQ, convert the uid being sent to recipient's user namespace. (new, thanks Oleg) __send_signal: convert current's uid to the recipient's user namespace for any siginfo which is not SI_FROMKERNEL (patch from Oleg, thanks again :) do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's user namespace ptrace_signal maps parent's uid into current's user namespace before including in signal to current. IIUC Oleg has argued that this shouldn't matter as the debugger will play with it, but it seems like not converting the value currently being set is misleading. Changelog: Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to simplify callers and help make clear what we are translating (which uid into which namespace). Passing the target task would make callers even easier to read, but we pass in user_ns because current_user_ns() != task_cred_xxx(current, user_ns). Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock in ptrace_signal(). Sep 23: In send_signal(), detect when (user) signal is coming from an ancestor or unrelated user namespace. Pass that on to __send_signal, which sets si_uid to 0 or overflowuid if needed. Oct 12: Base on Oleg's fixup_uid() patch. On top of that, handle all SI_FROMKERNEL cases at callers, because we can't assume sender is current in those cases. Nov 10: (mhelsley) rename fixup_uid to more meaningful usern_fixup_signal_uid Nov 10: (akpm) make the !CONFIG_USER_NS case clearer Signed-off-by: Serge Hallyn Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" From: Serge Hallyn Subject: __send_signal: pass q->info, not info, to userns_fixup_signal_uid (v2) Eric Biederman pointed out that passing info is a bug and could lead to a NULL pointer deref to boot. A collection of signal, securebits, filecaps, cap_bounds, and a few other ltp tests passed with this kernel. Changelog: Nov 18: previous patch missed a leading '&' Signed-off-by: Serge Hallyn Cc: "Eric W. Biederman" From: Dan Carpenter Subject: ipc/mqueue: lock() => unlock() typo There was a double lock typo introduced in b085f4bd6b21 "user namespace: make signal.c respect user namespaces" Signed-off-by: Dan Carpenter Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. 
Biederman" Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 7 ++++++- kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9a142a29074..9b7c8ab7d75 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -542,9 +543,13 @@ static void __do_notify(struct mqueue_inode_info *info) sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; + /* map current pid/uid into info->owner's namespaces */ + rcu_read_lock(); sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); - sig_i.si_uid = current_uid(); + sig_i.si_uid = user_ns_map_uid(info->user->user_ns, + current_cred(), current_uid()); + rcu_read_unlock(); kill_pid_info(info->notify.sigev_signo, &sig_i, info->notify_owner); diff --git a/kernel/signal.c b/kernel/signal.c index d532f1709fb..c73c4284160 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. 
*/ -- cgit v1.2.3-70-g09d2 From bced76aeaca03b45e3b4bdb868cada328e497847 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2012 13:11:12 +0100 Subject: sched: Fix lockup by limiting load-balance retries on lock-break Eric and David reported dead machines and traced it to commit a195f004 ("sched: Fix load-balance lock-breaking"), it turns out there's still a scenario where we can end up re-trying forever. Since there is no strict forward progress guarantee in the load-balance iteration we can get stuck re-retrying the same task-set over and over. Creating a forward progress guarantee with the existing structure is somewhat non-trivial, for now simply terminate the retry loop after a few tries. Reported-by: Eric Dumazet Tested-by: Eric Dumazet Reported-by: David Ahern [ logic cleanup as suggested by Eric ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Frederic Weisbecker Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1326297936.2442.157.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e42de9105f..84adb2d66cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) } #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -4508,7 +4510,9 @@ redo: goto out_balanced; if (lb_flags & LBF_NEED_BREAK) { - lb_flags &= ~LBF_NEED_BREAK; + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; goto redo; } -- cgit v1.2.3-70-g09d2 From 70b1e9161e903a9e1682aca3a832ed29ef876a4d Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sat, 12 Nov 2011 19:08:55 -0800 Subject: module: Add comments describing how the "strmap" logic works Signed-off-by: Kevin Cernekee Signed-off-by: Rusty Russell --- kernel/module.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 178333c48d1..cf9f1b6b326 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2193,6 +2193,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); + + /* + * info->strmap has a '1' bit for each byte of .strtab we want to + * keep resident in mod->core_strtab. Everything else in .strtab + * is unreferenced by the symbols in mod->core_symtab, and will be + * discarded when add_kallsyms() compacts the string table. + */ for (ndst = i = 1; i < nsrc; ++i, ++src) if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; @@ -2215,6 +2222,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Append room for core symbols' strings at end of core part. */ info->stroffs = mod->core_size; + + /* First strtab byte (and first symtab entry) are zeroes. 
*/ __set_bit(0, info->strmap); mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -- cgit v1.2.3-70-g09d2 From 48fd11880b5ef04270be8a87d9a9a9ee2fdae338 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Fri, 13 Jan 2012 09:32:14 +1030 Subject: module: Fix performance regression on modules with large symbol tables Looking at /proc/kallsyms, one starts to ponder whether all of the extra strtab-related complexity in module.c is worth the memory savings. Instead of making the add_kallsyms() loop even more complex, I tried the other route of deleting the strmap logic and naively copying each string into core_strtab with no consideration for consolidating duplicates. Performance on an "already exists" insmod of nvidia.ko (runs add_kallsyms() but does not actually initialize the module): Original scheme: 1.230s With naive copying: 0.058s Extra space used: 35k (of a 408k module). Signed-off-by: Kevin Cernekee Signed-off-by: Rusty Russell LKML-Reference: <73defb5e4bca04a6431392cc341112b1@localhost> --- kernel/module.c | 65 +++++++++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cf9f1b6b326..4928cffc3dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -138,7 +138,6 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long *strmap; unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; @@ -2178,12 +2177,19 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ static void layout_symtab(struct module *mod, struct load_info *info) { Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst; + unsigned int i, nsrc, ndst, strtab_size; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2194,38 +2200,23 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* - * info->strmap has a '1' bit for each byte of .strtab we want to - * keep resident in mod->core_strtab. Everything else in .strtab - * is unreferenced by the symbols in mod->core_symtab, and will be - * discarded when add_kallsyms() compacts the string table. - */ - for (ndst = i = 1; i < nsrc; ++i, ++src) + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - unsigned int j = src->st_name; - - while (!__test_and_set_bit(j, info->strmap) - && info->strtab[j]) - ++j; - ++ndst; + strtab_size += strlen(&info->strtab[src->st_name]) + 1; + ndst++; } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_size += strtab_size; /* Put string table section at end of init part of module. 
*/ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); - - /* Append room for core symbols' strings at end of core part. */ - info->stroffs = mod->core_size; - - /* First strtab byte (and first symtab entry) are zeroes. */ - __set_bit(0, info->strmap); - mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2246,22 +2237,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); mod->core_symtab = dst = mod->module_core + info->symoffs; + mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; *dst = *src; + *s++ = 0; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; + dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(info->strmap, - dst[ndst].st_name); - ++ndst; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; } mod->core_num_syms = ndst; - - mod->core_strtab = s = mod->module_core + info->stroffs; - for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) - if (test_bit(i, info->strmap)) - *++s = mod->strtab[i]; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2751,27 +2739,18 @@ static struct module *layout_and_allocate(struct load_info *info) this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, info); - - info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) * sizeof(long), GFP_KERNEL); - if (!info->strmap) { - err = -ENOMEM; - goto free_percpu; - } layout_symtab(mod, info); /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_strmap; + goto free_percpu; /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; -free_strmap: - kfree(info->strmap); free_percpu: percpu_modfree(mod); out: @@ -2781,7 +2760,6 @@ out: /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { - kfree(info->strmap); percpu_modfree(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); @@ -2911,8 +2889,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto unlink; - /* Get rid of temporary copy and strmap. */ - kfree(info.strmap); + /* Get rid of temporary copy. */ free_copy(&info); /* Done! */ -- cgit v1.2.3-70-g09d2 From bd77c04772da38fca510c81f78e51f727123b919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Jan 2012 09:32:14 +1030 Subject: module: struct module_ref should contain long fields module_ref contains two "unsigned int" fields. That's now too small, since some machines can open more than 2^32 files. Check commit 518de9b39e8 (fs: allow for more than 2^31 files) for reference. We can add an aligned(2 * sizeof(unsigned long)) attribute to force alloc_percpu() to allocate module_ref areas in single cache lines. 
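As a rough standalone illustration of what the attribute buys (plain userspace C, not part of the patch; the struct and program names here are made up): on an LP64 machine the struct below reports size 16 and alignment 16, so any allocator that honours the requested alignment keeps both counters inside a single 64-byte cache line.

#include <stdio.h>
#include <stdalign.h>

/* Illustrative mirror of the patched struct: two per-CPU counters,
 * aligned to their combined size so they cannot straddle a
 * cache-line boundary when placed at their natural alignment. */
struct module_ref_demo {
	unsigned long incs;	/* module_get() count */
	unsigned long decs;	/* module_put() count */
} __attribute__((aligned(2 * sizeof(unsigned long))));

int main(void)
{
	printf("size=%zu align=%zu\n",
	       sizeof(struct module_ref_demo),
	       alignof(struct module_ref_demo));
	return 0;
}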
Signed-off-by: Eric Dumazet CC: Rusty Russell CC: Tejun Heo CC: Robin Holt CC: David Miller Signed-off-by: Rusty Russell --- include/linux/module.h | 21 ++++++++++++++++----- kernel/debug/kdb/kdb_main.c | 2 +- kernel/module.c | 8 ++++---- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 3cb7839a60b..4598bf03e98 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -205,6 +205,20 @@ enum module_state MODULE_STATE_GOING, }; +/** + * struct module_ref - per cpu module reference counts + * @incs: number of module get on this cpu + * @decs: number of module put on this cpu + * + * We force an alignment on 8 or 16 bytes, so that alloc_percpu() + * put @incs/@decs in same cache line, with no extra memory cost, + * since alloc_percpu() is fine grained. + */ +struct module_ref { + unsigned long incs; + unsigned long decs; +} __attribute((aligned(2 * sizeof(unsigned long)))); + struct module { enum module_state state; @@ -347,10 +361,7 @@ struct module /* Destruction function. */ void (*exit)(void); - struct module_ref { - unsigned int incs; - unsigned int decs; - } __percpu *refptr; + struct module_ref __percpu *refptr; #endif #ifdef CONFIG_CONSTRUCTORS @@ -434,7 +445,7 @@ extern void __module_put_and_exit(struct module *mod, long code) #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code); #ifdef CONFIG_MODULE_UNLOAD -unsigned int module_refcount(struct module *mod); +unsigned long module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x) void symbol_put_addr(void *addr); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3c..e2ae7349437 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); + kdb_printf("%4ld ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/module.c b/kernel/module.c index 4928cffc3dc..14b8e82e05d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -725,9 +725,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) } } -unsigned int module_refcount(struct module *mod) +unsigned long module_refcount(struct module *mod) { - unsigned int incs = 0, decs = 0; + unsigned long incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) @@ -853,7 +853,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %u ", module_refcount(mod)); + seq_printf(m, " %lu ", module_refcount(mod)); /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ @@ -903,7 +903,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mk->mod)); + return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } static struct module_attribute refcnt = { -- cgit v1.2.3-70-g09d2 From 5e12416927975aa3c58394cea15db6c3e488a033 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 6 Dec 2011 12:11:31 -0700 Subject: module: replace DEBUGP with pr_debug Use more flexible pr_debug. 
This allows: echo "module module +p" > /dbg/dynamic_debug/control to turn on debug messages when needed. Signed-off-by: Jim Cromie Signed-off-by: Rusty Russell --- kernel/module.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 14b8e82e05d..b02d6335f8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -62,12 +62,6 @@ #define CREATE_TRACE_POINTS #include -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt , a...) -#endif - #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 #endif @@ -409,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, return fsa.sym; } - DEBUGP("Failed to find symbol %s\n", name); + pr_debug("Failed to find symbol %s\n", name); return NULL; } EXPORT_SYMBOL_GPL(find_symbol); @@ -599,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) list_for_each_entry(use, &b->source_list, source_list) { if (use->source == a) { - DEBUGP("%s uses %s!\n", a->name, b->name); + pr_debug("%s uses %s!\n", a->name, b->name); return 1; } } - DEBUGP("%s does not use %s!\n", a->name, b->name); + pr_debug("%s does not use %s!\n", a->name, b->name); return 0; } @@ -618,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - DEBUGP("Allocating new usage for %s.\n", a->name); + pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { printk(KERN_WARNING "%s: out of memory loading\n", a->name); @@ -662,7 +656,7 @@ static void module_unload_free(struct module *mod) mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; - DEBUGP("%s unusing %s\n", mod->name, i->name); + pr_debug("%s unusing %s\n", mod->name, i->name); module_put(i); list_del(&use->source_list); list_del(&use->target_list); @@ -760,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { - DEBUGP("Looking at refcount...\n"); + pr_debug("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); if (module_refcount(mod) == 0) break; @@ -803,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, if (mod->state != MODULE_STATE_LIVE) { /* FIXME: if (force), slam module count and wake up waiter --RR */ - DEBUGP("%s already dying\n", mod->name); + pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; } @@ -1056,7 +1050,7 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; - DEBUGP("Found checksum %lX vs module %lX\n", + pr_debug("Found checksum %lX vs module %lX\n", maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -1833,7 +1827,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. 
*/ - DEBUGP("Common symbol: %s\n", name); + pr_debug("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1841,7 +1835,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_ABS: /* Don't need to do anything */ - DEBUGP("Absolute symbol: 0x%08lx\n", + pr_debug("Absolute symbol: 0x%08lx\n", (long)sym[i].st_value); break; @@ -1965,7 +1959,7 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) info->sechdrs[i].sh_entsize = ~0UL; - DEBUGP("Core section allocation order:\n"); + pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -1977,7 +1971,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", name); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -1994,7 +1988,7 @@ static void layout_sections(struct module *mod, struct load_info *info) } } - DEBUGP("Init section allocation order:\n"); + pr_debug("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -2007,7 +2001,7 @@ static void layout_sections(struct module *mod, struct load_info *info) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", sname); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -2195,7 +2189,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, info->index.sym) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); @@ -2216,7 +2210,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2618,7 +2612,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); + pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; @@ -2636,8 +2630,8 @@ static int move_module(struct module *mod, struct load_info *info) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. 
*/ shdr->sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", - shdr->sh_addr, info->secstrings + shdr->sh_name); + pr_debug("\t0x%lx %s\n", + (long)shdr->sh_addr, info->secstrings + shdr->sh_name); } return 0; @@ -2798,7 +2792,7 @@ static struct module *load_module(void __user *umod, struct module *mod; long err; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); /* Copy in the blobs from userspace, check they are vaguely sane. */ -- cgit v1.2.3-70-g09d2 From 8487bfd954928660a52e91384a9b1f1049217e35 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 6 Dec 2011 12:11:31 -0700 Subject: kernel/params: replace DEBUGP with pr_debug Use more flexible pr_debug. This allows: echo "module params +p" > /dbg/dynamic_debug/control to turn on debug messages when needed. Signed-off-by: Jim Cromie Signed-off-by: Rusty Russell --- kernel/params.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 65aae11eb93..9240664af11 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -25,12 +25,6 @@ #include #include -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt, a...) -#endif - /* Protects all parameters, and incidentally kmalloced_param list. */ static DEFINE_MUTEX(param_lock); @@ -105,7 +99,7 @@ static int parse_one(char *param, /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; - DEBUGP("They are equal! Calling %p\n", + pr_debug("They are equal! Calling %p\n", params[i].ops->set); mutex_lock(&param_lock); err = params[i].ops->set(val, &params[i]); @@ -115,11 +109,11 @@ static int parse_one(char *param, } if (handle_unknown) { - DEBUGP("Unknown argument: calling %p\n", handle_unknown); + pr_debug("Unknown argument: calling %p\n", handle_unknown); return handle_unknown(param, val); } - DEBUGP("Unknown argument `%s'\n", param); + pr_debug("Unknown argument `%s'\n", param); return -ENOENT; } @@ -184,7 +178,7 @@ int parse_args(const char *name, { char *param, *val; - DEBUGP("Parsing ARGS: %s\n", args); + pr_debug("Parsing ARGS: %s\n", args); /* Chew leading spaces */ args = skip_spaces(args); -- cgit v1.2.3-70-g09d2 From cca3e707301862ca9b9327e6a732463982f8cd1b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 13 Jan 2012 09:32:15 +1030 Subject: modules: sysfs - export: taint, coresize, initsize Recent tools do not want to use /proc to retrieve module information. A few values are currently missing from sysfs to replace the information available in /proc/modules. This adds /sys/module/*/{coresize,initsize,taint} attributes. TAINT_PROPRIETARY_MODULE (P) and TAINT_OOT_MODULE (O) flags are both always shown now, and no longer exclude each other, also in /proc/modules. Replace the open-coded sysfs attribute initializers with the __ATTR() macro. Add the new attributes to Documentation/ABI. 
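For reference, the __ATTR() conversion is purely mechanical; taking the refcnt attribute from the diff below, the removed open-coded initializer and the added macro form declare an equivalent object:

/* Open-coded initializer, as removed by this patch: */
static struct module_attribute refcnt = {
	.attr = { .name = "refcnt", .mode = 0444 },
	.show = show_refcnt,
};

/* Equivalent declaration using the __ATTR() macro: */
static struct module_attribute modinfo_refcnt =
	__ATTR(refcnt, 0444, show_refcnt, NULL);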
Cc: Lucas De Marchi Signed-off-by: Kay Sievers Signed-off-by: Rusty Russell --- Documentation/ABI/testing/sysfs-module | 16 ++++++ kernel/module.c | 93 +++++++++++++++++++++++----------- 2 files changed, 80 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module index 9489ea8e294..47064c2b1f7 100644 --- a/Documentation/ABI/testing/sysfs-module +++ b/Documentation/ABI/testing/sysfs-module @@ -33,3 +33,19 @@ Description: Maximum time allowed for periodic transfers per microframe (μs) Beware, non-standard modes are usually not thoroughly tested by hardware designers, and the hardware can malfunction when this setting differ from default 100. + +What: /sys/module/*/{coresize,initsize} +Date: Jan 2012 +KernelVersion: 3.3 +Contact: Kay Sievers +Description: Module size in bytes. + +What: /sys/module/*/taint +Date: Jan 2012 +KernelVersion: 3.3 +Contact: Kay Sievers +Description: Module taint flags: + P - proprietary module + O - out-of-tree module + F - force-loaded module + C - staging driver module diff --git a/kernel/module.c b/kernel/module.c index b02d6335f8a..acf6ed3ebe8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -842,6 +842,26 @@ out: return ret; } +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; @@ -900,10 +920,8 @@ static ssize_t show_refcnt(struct module_attribute *mattr, return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } -static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444 }, - .show = show_refcnt, -}; +static struct module_attribute modinfo_refcnt = + __ATTR(refcnt, 0444, show_refcnt, NULL); void module_put(struct module *module) { @@ -963,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, return sprintf(buffer, "%s\n", state); } -static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444 }, - .show = show_initstate, -}; +static struct module_attribute modinfo_initstate = + __ATTR(initstate, 0444, show_initstate, NULL); static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, @@ -979,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, return count; } -struct module_attribute module_uevent = { - .attr = { .name = "uevent", .mode = 0200 }, - .store = store_uevent, -}; +struct module_attribute module_uevent = + __ATTR(uevent, 0200, NULL, store_uevent); + +static ssize_t show_coresize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->core_size); +} + +static struct module_attribute modinfo_coresize = + __ATTR(coresize, 0444, show_coresize, NULL); + +static ssize_t show_initsize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->init_size); +} + +static struct module_attribute modinfo_initsize = + __ATTR(initsize, 0444, show_initsize, NULL); + +static ssize_t 
show_taint(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + size_t l; + + l = module_flags_taint(mk->mod, buffer); + buffer[l++] = '\n'; + return l; +} + +static struct module_attribute modinfo_taint = + __ATTR(taint, 0444, show_taint, NULL); static struct module_attribute *modinfo_attrs[] = { + &module_uevent, &modinfo_version, &modinfo_srcversion, - &initstate, - &module_uevent, + &modinfo_initstate, + &modinfo_coresize, + &modinfo_initsize, + &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD - &refcnt, + &modinfo_refcnt, #endif NULL, }; @@ -3236,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[bx++] = 'P'; - else if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[bx++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[bx++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[bx++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - + bx += module_flags_taint(mod, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; -- cgit v1.2.3-70-g09d2 From 69116f279a9eaf4c540934269342d9149538fc79 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: module_param: avoid bool abuse, add bint for special cases. For historical reasons, we allow module_param(bool) to take an int (or an unsigned int). That's going away. A few drivers really want an int: they set it to -1 and a parameter will set it to 0 or 1. This sucks: reading them from sysfs will give 'Y' for both -1 and 1, but if we change it to an int, then the users might be broken (if they did "param" instead of "param=1"). Use a new 'bint' parser for them. (ntfs has a different problem: it needs an int for debug_msgs because it's also exposed via sysctl.) Cc: Steve Glendinning Cc: Jean Delvare Cc: Guenter Roeck Cc: Hoang-Nam Nguyen Cc: Christoph Raisch Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Cc: linux390@de.ibm.com Cc: Anton Altaparmakov Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: lm-sensors@lm-sensors.org Cc: linux-rdma@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: alsa-devel@alsa-project.org Acked-by: Takashi Iwai (For the sound part) Acked-by: Guenter Roeck (For the hwmon driver) Signed-off-by: Rusty Russell --- drivers/hwmon/emc2103.c | 2 +- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- drivers/s390/cio/cmf.c | 2 +- fs/ntfs/super.c | 2 +- include/linux/moduleparam.h | 6 ++++++ kernel/params.c | 24 ++++++++++++++++++++++++ sound/pci/intel8x0.c | 4 ++-- 7 files changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/hwmon/emc2103.c b/drivers/hwmon/emc2103.c index 848a2b0bc83..865063914d7 100644 --- a/drivers/hwmon/emc2103.c +++ b/drivers/hwmon/emc2103.c @@ -55,7 +55,7 @@ static const u8 REG_TEMP_MAX[4] = { 0x34, 0x30, 0x31, 0x32 }; * it. Default is to leave the device in the state it's already in (-1). 
* This parameter allows APD mode to be optionally forced on or off */ static int apd = -1; -module_param(apd, bool, 0); +module_param(apd, bint, 0); MODULE_PARM_DESC(init, "Set to zero to disable anti-parallel diode mode"); struct temperature { diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index c240e9972cb..8af8d4f7bdb 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -82,7 +82,7 @@ module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); -module_param_named(lock_hcalls, ehca_lock_hcalls, bool, S_IRUGO); +module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO); module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 2985eb43948..204ca728e7f 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -98,7 +98,7 @@ enum cmb_format { * enum cmb_format. */ static int format = CMF_AUTODETECT; -module_param(format, bool, 0444); +module_param(format, bint, 0444); /** * struct cmb_operations - functions to use depending on cmb_format diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 608be451609..5a4a8af5c40 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG -module_param(debug_msgs, bool, 0); +module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); #endif diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 794d4b0f121..6bdde0c3bcc 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -367,6 +367,12 @@ extern int param_set_invbool(const char *val, const struct kernel_param *kp); extern int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) +/* An int, which can only be set like a bool (though it shows as an int). */ +extern struct kernel_param_ops param_ops_bint; +extern int param_set_bint(const char *val, const struct kernel_param *kp); +#define param_get_bint param_get_int +#define param_check_bint param_check_int + /** * module_param_array - a parameter which is an array of some type * @name: the name of the array variable diff --git a/kernel/params.c b/kernel/params.c index 9240664af11..32ee0430828 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -363,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = { }; EXPORT_SYMBOL(param_ops_invbool); +int param_set_bint(const char *val, const struct kernel_param *kp) +{ + struct kernel_param boolkp; + bool v; + int ret; + + /* Match bool exactly, by re-using it. */ + boolkp = *kp; + boolkp.arg = &v; + boolkp.flags |= KPARAM_ISBOOL; + + ret = param_set_bool(val, &boolkp); + if (ret == 0) + *(int *)kp->arg = v; + return ret; +} +EXPORT_SYMBOL(param_set_bint); + +struct kernel_param_ops param_ops_bint = { + .set = param_set_bint, + .get = param_get_int, +}; +EXPORT_SYMBOL(param_ops_bint); + /* We break the rule and mangle the string. 
*/ static int param_array(const char *name, const char *val, diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c index 40b181bab93..9f3b01bb72c 100644 --- a/sound/pci/intel8x0.c +++ b/sound/pci/intel8x0.c @@ -95,13 +95,13 @@ module_param(ac97_quirk, charp, 0444); MODULE_PARM_DESC(ac97_quirk, "AC'97 workaround for strange hardware."); module_param(buggy_semaphore, bool, 0444); MODULE_PARM_DESC(buggy_semaphore, "Enable workaround for hardwares with problematic codec semaphores."); -module_param(buggy_irq, bool, 0444); +module_param(buggy_irq, bint, 0444); MODULE_PARM_DESC(buggy_irq, "Enable workaround for buggy interrupts on some motherboards."); module_param(xbox, bool, 0444); MODULE_PARM_DESC(xbox, "Set to 1 for Xbox, if you have problems with the AC'97 codec detection."); module_param(spdif_aclink, int, 0444); MODULE_PARM_DESC(spdif_aclink, "S/PDIF over AC-link."); -module_param(inside_vm, bool, 0444); +module_param(inside_vm, bint, 0444); MODULE_PARM_DESC(inside_vm, "KVM/Parallels optimization."); /* just for backward compatibility */ -- cgit v1.2.3-70-g09d2 From 29d4d6df107b9d86982dc759f5b1ddfe2c6b29c0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: printk: fix unnecessary module_param_name. You don't need module_param_name if the name is the same! Cc: Yanmin Zhang Cc: Andrew Morton Signed-off-by: Rusty Russell --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 989e4a52da7..63b3bc31fe3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str) } early_param("ignore_loglevel", ignore_loglevel_setup); -module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); -- cgit v1.2.3-70-g09d2 From 6d6a55ec0877393f467067d44b9a2a8c2e4a82d2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: kernel/async: remove redundant declaration. It's in linux/init.h, and I'm about to change it to a bool. Cc: Arjan van de Ven Signed-off-by: Rusty Russell --- kernel/async.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 80b74b88fef..bd0c168a3bb 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; -extern int initcall_debug; - /* * MUST be called with the lock held! -- cgit v1.2.3-70-g09d2 From 2329abfa344a9a824bc4c71f2415528777265510 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: module_param: make bool parameters really bool (core code) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. 
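A minimal sketch of the form that stays valid after this change (an illustrative out-of-tree module, not taken from this patch; the parameter name is made up):

#include <linux/module.h>
#include <linux/moduleparam.h>

/* The variable backing a 'bool' parameter must now really be bool;
 * declaring it as int is exactly what triggers the new warning. */
static bool enable_debug;
module_param(enable_debug, bool, 0644);
MODULE_PARM_DESC(enable_debug, "Example flag, set with enable_debug=Y/N");

static int __init demo_init(void)
{
	pr_info("demo: enable_debug=%d\n", enable_debug);
	return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");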
Signed-off-by: Rusty Russell --- include/linux/init.h | 3 ++- init/main.c | 2 +- kernel/irq/internals.h | 2 +- kernel/irq/spurious.c | 2 +- kernel/printk.c | 8 ++++---- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/init.h b/include/linux/init.h index 9146f39cddd..6b951095a42 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -2,6 +2,7 @@ #define _LINUX_INIT_H #include +#include /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) @@ -156,7 +157,7 @@ void prepare_namespace(void); extern void (*late_time_init)(void); -extern int initcall_debug; +extern bool initcall_debug; #endif diff --git a/init/main.c b/init/main.c index 415548e808d..ff49a6dacfb 100644 --- a/init/main.c +++ b/init/main.c @@ -648,7 +648,7 @@ static void __init do_ctors(void) #endif } -int initcall_debug; +bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); static char msgbuf[64]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a73dd6c7372..b7952316016 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,7 +15,7 @@ #define istate core_internal_state__do_not_mess_with_it -extern int noirqdebug; +extern bool noirqdebug; /* * Bits used by threaded handlers: diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dc813a948be..611cd6003c4 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->irqs_unhandled = 0; } -int noirqdebug __read_mostly; +bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { diff --git a/kernel/printk.c b/kernel/printk.c index 63b3bc31fe3..13c0a1143f4 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end) } } -static int __read_mostly ignore_loglevel; +static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { @@ -696,9 +696,9 @@ static void zap_locks(void) } #if defined(CONFIG_PRINTK_TIME) -static int printk_time = 1; +static bool printk_time = 1; #else -static int printk_time = 0; +static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); @@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -int console_suspend_enabled = 1; +bool console_suspend_enabled = 1; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) -- cgit v1.2.3-70-g09d2 From efeb156e7275c5b6c6e0f96aceb3c6abf98fc392 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 12 Jan 2012 17:17:11 -0800 Subject: kprobes: silence DEBUG_STRICT_USER_COPY_CHECKS=y warning Enabling DEBUG_STRICT_USER_COPY_CHECKS causes the following warning: In file included from arch/x86/include/asm/uaccess.h:573, from kernel/kprobes.c:55: In function 'copy_from_user', inlined from 'write_enabled_file_bool' at kernel/kprobes.c:2191: arch/x86/include/asm/uaccess_64.h:65: warning: call to 'copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct presumably due to buf_size being signed causing GCC to fail to see that buf_size can't become negative. Signed-off-by: Stephen Boyd Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Acked-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823..95dd7212e61 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) -- cgit v1.2.3-70-g09d2 From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:17 -0800 Subject: treewide: remove useless NORET_TYPE macro and uses It's a very old and now unused prototype marking so just delete it. Neaten panic pointer argument style to keep checkpatch quiet. Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/include/asm/system.h | 2 +- arch/avr32/kernel/traps.c | 2 +- arch/ia64/kernel/machine_kexec.c | 2 +- arch/m68k/amiga/config.c | 2 +- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/traps.c | 2 +- arch/powerpc/kernel/machine_kexec_32.c | 2 +- arch/powerpc/kernel/machine_kexec_64.c | 6 +++--- arch/s390/kernel/nmi.c | 2 +- arch/tile/kernel/machine_kexec.c | 4 ++-- include/linux/kernel.h | 6 +++--- include/linux/linkage.h | 1 - include/linux/sched.h | 2 +- kernel/exit.c | 6 +++--- kernel/panic.c | 2 +- 15 files changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h index 9702c2213e1..62d9ded0163 100644 --- a/arch/avr32/include/asm/system.h +++ b/arch/avr32/include/asm/system.h @@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) struct pt_regs; -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err); +void die(const char *str, struct pt_regs *regs, long err); void _exception(long signr, struct pt_regs *regs, int code, unsigned long addr); diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 7aa25756412..3d760c06f02 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -24,7 +24,7 @@ static DEFINE_SPINLOCK(die_lock); -void NORET_TYPE die(const char *str, struct pt_regs *regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { static int die_counter; diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 3d3aeef4694..581a16d5e85 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -27,7 +27,7 @@ #include #include -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long start_address, struct ia64_boot_param *boot_param, diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 82a4bb51d5d..a3b0558328b 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -511,7 +511,7 @@ static unsigned long amiga_gettimeoffset(void) return ticks + offset; } -static NORET_TYPE void 
amiga_reset(void) +static void amiga_reset(void) ATTRIB_NORET; static void amiga_reset(void) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f343e..3d913259e50 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child, extern asmlinkage void syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); -extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET; +extern void die(const char *, struct pt_regs *) ATTRIB_NORET; static inline void die_if_kernel(const char *str, struct pt_regs *regs) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 5c8a49d5505..725e9a5ca96 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs) /* * NMI exception handler. */ -NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) +void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs) { bust_spinlocks(1); printk("NMI taken!!!!\n"); diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c index e63f2e7d2ef..026e7f15394 100644 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -16,7 +16,7 @@ #include #include -typedef NORET_TYPE void (*relocate_new_kernel_t)( +typedef void (*relocate_new_kernel_t)( unsigned long indirection_page, unsigned long reboot_code_buffer, unsigned long start_address) ATTRIB_NORET; diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 26ccbf77dd4..5fbbf814923 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data = struct paca_struct kexec_paca; /* Our assembly helper, in kexec_stub.S */ -extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, - void *image, void *control, - void (*clear_all)(void)) ATTRIB_NORET; +extern void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) ATTRIB_NORET; /* too late to fail here */ void default_machine_kexec(struct kimage *image) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index fab88431a06..0fd2e863e11 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -30,7 +30,7 @@ struct mcck_struct { static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static NORET_TYPE void s390_handle_damage(char *msg) +static void s390_handle_damage(char *msg) { smp_send_stop(); disabled_wait((unsigned long) __builtin_return_address(0)); diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c index e00d7179989..b0c90705906 100644 --- a/arch/tile/kernel/machine_kexec.c +++ b/arch/tile/kernel/machine_kexec.c @@ -248,10 +248,10 @@ static void setup_quasi_va_is_pa(void) } -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { void *reboot_code_buffer; - NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long) + void (*rnk)(unsigned long, void *, unsigned long) ATTRIB_NORET; /* Mask all interrupts before starting to reboot. 
*/ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 60934395e36..aaf1753dd2b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,16 +185,16 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE __printf(1, 2) +__printf(1, 2) void panic(const char *fmt, ...) ATTRIB_NORET __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -NORET_TYPE void do_exit(long error_code) +void do_exit(long error_code) ATTRIB_NORET; -NORET_TYPE void complete_and_exit(struct completion *, long) +void complete_and_exit(struct completion *, long) ATTRIB_NORET; /* Internal, do not use. */ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index c75074cb8ad..6a8f252e49e 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,7 +88,6 @@ #endif -#define NORET_TYPE /**/ #define ATTRIB_NORET __attribute__((noreturn)) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 21cd0303af5..4032ec1cf83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); extern void flush_itimer_signals(void); -extern NORET_TYPE void do_group_exit(int); +extern void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb5..c44738267be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -887,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c..6fd09ed6fd9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { static char buf[1024]; va_list args; -- cgit v1.2.3-70-g09d2 From a3dd3323058d281abd584b15ad4c5b65064d7a61 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 12 Jan 2012 17:20:11 -0800 Subject: kexec: remove KMSG_DUMP_KEXEC KMSG_DUMP_KEXEC is useless because we already save kernel messages inside /proc/vmcore, and it is unsafe to allow modules to do other stuffs in a crash dump scenario. [akpm@linux-foundation.org: fix powerpc build] Signed-off-by: WANG Cong Reported-by: Vivek Goyal Acked-by: Vivek Goyal Acked-by: Jarod Wilson Cc: "Eric W. 
Biederman" Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/nvram.c | 1 - drivers/char/ramoops.c | 3 +-- drivers/mtd/mtdoops.c | 3 +-- include/linux/kmsg_dump.h | 1 - kernel/kexec.c | 3 --- 5 files changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 330a57b7c17..36f957f3184 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: - case KMSG_DUMP_KEXEC: break; case KMSG_DUMP_PANIC: panicking = true; diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index 7c7f42a1f88..feda90cdac9 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, struct timeval timestamp; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index db8e8272d69..3ce99e00a49 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, char *dst; if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC && - reason != KMSG_DUMP_KEXEC) + reason != KMSG_DUMP_PANIC) return; /* Only dump oopses if dump_oops is set */ diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index ee0c952188d..fee66317e07 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,7 +18,6 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, - KMSG_DUMP_KEXEC, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10d960..20ed47ae252 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); -- cgit v1.2.3-70-g09d2 From 6480e5a0923756b500634d9777ec4189492fbbfe Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:14 -0800 Subject: kdump: add missing RAM resource in crash_shrink_memory() When shrinking crashkernel memory using /sys/kernel/kexec_crash_size for the newly added memory no RAM resource is created at the moment. Example: $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : Crash kernel d0000000-ffffffff : System RAM $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss <<-- here is System RAM missing d0000000-ffffffff : System RAM One result of this bug is that the memory chunk can never be set offline using memory hotplug. With this patch I insert a new "System RAM" resource for the released memory. 
Then the upper example looks like the following: $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : System RAM <<-- new resource d0000000-ffffffff : System RAM And now I can set chunk c0000000-cfffffff offline. Signed-off-by: Michael Holzheu Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 20ed47ae252..60bf181b3ea 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1146,6 +1147,12 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); @@ -1154,7 +1161,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: -- cgit v1.2.3-70-g09d2 From bec013c40bc89671d8d457944fdf7d2b8e79d651 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:15 -0800 Subject: kdump: crashk_res init check for /sys/kernel/kexec_crash_size Currently it is possible to set the crash_size via the sysfs /sys/kernel/kexec_crash_size even if no crash kernel memory has been defined with the "crashkernel" parameter. In this case "crashk_res" is not initialized and crashk_res.start = crashk_res.end = 0. Unfortunately resource_size(&crashk_res) returns 1 in this case. This breaks the s390 implementation of crash_(un)map_reserved_pages(). To fix the problem the correct "old_size" is now calculated in crash_shrink_memory(). "old_size" is set to "0" if crashk_res is not initialized. With this change crash_shrink_memory() will do nothing when "crashk_res" is not initialized. It will return "0" for "echo 0 > /sys/kernel/kexec_crash_size" and -EINVAL for "echo [not zero] > /sys/kernel/kexec_crash_size". In addition to that this patch also simplifies the "ret = -EINVAL" vs. "ret = 0" logic as suggested by Simon Horman. Signed-off-by: Michael Holzheu Reviewed-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Cc: Vivek Goyal Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bf181b3ea..7b088678670 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1139,11 +1140,9 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; - - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } -- cgit v1.2.3-70-g09d2 From 93e13a360ba331915220f82f6e9543df961ffa1f Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:18 -0800 Subject: kdump: fix crash_kexec()/smp_send_stop() race in panic() When two CPUs call panic at the same time there is a possible race condition that can stop kdump. The first CPU calls crash_kexec() and the second CPU calls smp_send_stop() in panic() before crash_kexec() finished on the first CPU. So the second CPU stops the first CPU and therefore kdump fails: 1st CPU: panic()->crash_kexec()->mutex_trylock(&kexec_mutex)-> do kdump 2nd CPU: panic()->crash_kexec()->kexec_mutex already held by 1st CPU ->smp_send_stop()-> stop 1st CPU (stop kdump) This patch fixes the problem by introducing a spinlock in panic that allows only one CPU to process crash_kexec() and the subsequent panic code. All other CPUs call the weak function panic_smp_self_stop() that stops the CPU itself. This function can be overloaded by architecture code. For example "tile" can use their lower-power "nap" instruction for that. Signed-off-by: Michael Holzheu Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6fd09ed6fd9..5dce5404eee 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -59,6 +68,7 @@ EXPORT_SYMBOL(panic_blink); */ void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ void panic(const char *fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). 
*/ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); -- cgit v1.2.3-70-g09d2 From b8f566b04d3cddd192cfd2418ae6d54ac6353792 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 12 Jan 2012 17:20:27 -0800 Subject: sysctl: add the kernel.ns_last_pid control The sysctl works on the current task's pid namespace, getting and setting its last_pid field. Writing is allowed for CAP_SYS_ADMIN-capable tasks thus making it possible to create a task with desired pid value. This ability is required badly for the checkpoint/restore in userspace. This approach suits all the parties for now. Signed-off-by: Pavel Emelyanov Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 8 ++++++++ kernel/pid.c | 4 +++- kernel/pid_namespace.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d8cd8b2c30..8c20fbd8b42 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated. ============================================================== +ns_last_pid: + +The last pid allocated in the current (the one task using this sysctl +lives in) pid namespace. When selecting a pid for a next task on fork +kernel tries to allocate a number starting from this one. + +============================================================== + powersave-nap: (PPC only) If set, Linux-PPC will use the 'nap' mode of powersaving, diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5..ce8e00deacc 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca..a8968396046 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. 
+ */ + tmp.data = &current->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } -- cgit v1.2.3-70-g09d2 From 6e6f0a1f0fa6bba1493c296eb30d1e176e1f8530 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:30 -0800 Subject: panic: don't print redundant backtraces on oops When an oops causes a panic and panic prints another backtrace it's pretty common to have the original oops data be scrolled away on an 80x50 screen. The second backtrace is quite redundant and not needed anyways. So don't print the panic backtrace when oops_in_progress is true. [akpm@linux-foundation.org: add comment] Signed-off-by: Andi Kleen Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 5dce5404eee..80aed44e345 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -94,7 +94,11 @@ void panic(const char *fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* -- cgit v1.2.3-70-g09d2 From 028ee4be34a09a6d48bdf30ab991ae933a7bc036 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:55 -0800 Subject: c/r: prctl: add PR_SET_MM codes to set up mm_struct entries When we restore a task we need to set up text, data and data heap sizes from userspace to the values a task had at checkpoint time. This patch adds auxiliary prctl codes for that. While most of them have a statistical nature (their values are involved in the calculation of /proc/<pid>/statm output) the start_brk and brk values are used to compute an allowed size of program data segment expansion. Which means arbitrary changes of these values might be a dangerous operation. So to restrict access the following requirements are applied to prctl calls: - The process has to have CAP_SYS_ADMIN capability granted. - For all opcodes except start_brk/brk members an appropriate VMA area must exist and should fit certain VMA flags, such as: - code segment must be executable but not writable; - data segment must not be executable. start_brk/brk values must not intersect with the data segment and must not exceed the RLIMIT_DATA resource limit. Still the main guard is the CAP_SYS_ADMIN capability check. Note the kernel should be compiled with CONFIG_CHECKPOINT_RESTORE support, otherwise these prctl calls will return -EINVAL. 
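From userspace the new codes would be exercised roughly as follows (a hedged sketch: the address is made up, and the PR_* values are spelled out for pre-3.3 headers, matching the prctl.h hunk below):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35	/* values from the prctl.h hunk below */
#define PR_SET_MM_START_BRK	6
#endif

int main(void)
{
	/* Illustrative address only; a real restorer passes the brk
	 * start recorded at checkpoint time. Needs CAP_SYS_ADMIN and
	 * CONFIG_CHECKPOINT_RESTORE, and arg4/arg5 must be zero. */
	unsigned long start_brk = 0x02000000UL;

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, start_brk, 0, 0) != 0)
		perror("prctl(PR_SET_MM, PR_SET_MM_START_BRK)");
	return 0;
}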
[akpm@linux-foundation.org: cache current->mm in a local, saving 200 bytes text] Signed-off-by: Cyrill Gorcunov Reviewed-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 12 +++++ kernel/sys.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a3baeb2c216..7ddc7f1b480 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -102,4 +102,16 @@ #define PR_MCE_KILL_GET 34 +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f..40701538fbd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; 
+} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3-70-g09d2 From dae5cbc2440b1d21a15715d0f1fb20f632dd38ee Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 14 Jan 2012 00:33:03 +0100 Subject: PM: Make sysrq-o be available for CONFIG_PM unset After commit 1eb208aea3179dd2fc0cdeea45ef869d75b4fe70, "PM: Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME)", the files under kernel/power are not built unless CONFIG_PM_SLEEP or CONFIG_PM_RUNTIME is set. In particular, this causes kernel/power/poweroff.c to be omitted, even though it should be compiled, because CONFIG_MAGIC_SYSRQ is set. Fix the problem by causing kernel/power/Makefile to be processed for CONFIG_PM unset too. Reported-and-tested-by: Phil Oester Signed-off-by: Rafael J. Wysocki --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f70396e5a24..2d9de86b7e7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o @@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o -obj-$(CONFIG_PM) += power/ -obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o -- cgit v1.2.3-70-g09d2 From ee34a37049114303011e154478c63b977bcff24c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Jan 2012 12:56:23 +0800 Subject: PM / Hibernate: Drop the check of swap space size for compressed image For a compressed image, the space required is not known until we finish compressing and writing all pages. This patch drops the check; if swap space eventually turns out to be insufficient, the system can still return to normal after the swap writes fail for a compressed image. Signed-off-by: Barry Song Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3739ecced08..8742fd013a9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -773,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) pr_debug("PM: Free swap pages: %u\n", free_swap); - required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
- nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + required = PAGES_FOR_IO + nr_pages; return free_swap > required; } @@ -802,10 +801,12 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out_finish; + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); -- cgit v1.2.3-70-g09d2 From 53999bf34d55981328f8ba9def558d3e104d6e36 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Sun, 15 Jan 2012 19:32:55 -0400 Subject: error: implicit declaration of function 'module_flags_taint' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent changes to kernel/module.c caused the following compile error: kernel/module.c: In function ‘show_taint’: kernel/module.c:1024:2: error: implicit declaration of function ‘module_flags_taint’ [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Correct this error by moving the definition of module_flags_taint outside of the #ifdef CONFIG_MODULE_UNLOAD section. Signed-off-by: Kevin Winchester Signed-off-by: Linus Torvalds --- kernel/module.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index acf6ed3ebe8..2c932760fd3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -842,26 +842,6 @@ out: return ret; } -static size_t module_flags_taint(struct module *mod, char *buf) -{ - size_t l = 0; - - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - return l; -} - static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; @@ -962,6 +942,26 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static ssize_t show_initstate(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { -- cgit v1.2.3-70-g09d2 From c10076c4304083af15a41f6bc5e657e781c1f9a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Jan 2012 21:40:59 -0500 Subject: tracepoints/module: Fix disabling tracepoints with taint CRAP or OOT Tracepoints are disabled for tainted modules, which is usually because the module is either proprietary or was forced, and we don't want either of them using kernel tracepoints. But, a module can also be tainted by being in the staging directory or compiled out of tree. 
Either is fine for use with tracepoints, no need to punish them. I found this out when I noticed that my sample trace event module, when done out of tree, stopped working. Cc: stable@vger.kernel.org # 3.2 Cc: Mathieu Desnoyers Cc: Ben Hutchings Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index db110b8ae03..f1539decd99 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod) int ret = 0; /* - * We skip modules that tain the kernel, especially those with different - * module header (for forced load), to make sure we don't cause a crash. + * We skip modules that taint the kernel, especially those with different + * module headers (for forced load), to make sure we don't cause a crash. + * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints) + if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3-70-g09d2 From d8e8ed95cda1dfd6813588333d36552935eba4a1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 15 Dec 2011 13:43:24 +1030 Subject: rcu: Make rcutorture bool parameters really bool (core code) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. This commit makes this change to rcutorture. Signed-off-by: Rusty Russell Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88f17b8a3b1..e29edc374ce 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool verbose; /* Print more debug info. */ +static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ -- cgit v1.2.3-70-g09d2 From 4410030646be072b82ec1892ad5cc7d91af384d8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 27 Dec 2011 15:04:26 +0100 Subject: rcu: Add missing __cpuinit annotation in rcutorture code "rcu: Add rcutorture CPU-hotplug capability" adds cpu hotplug operations to the rcutorture code but produces a false positive warning about section mismatches: WARNING: vmlinux.o(.text+0x1e420c): Section mismatch in reference from the function rcu_torture_onoff() to the function .cpuinit.text:cpu_up() The function rcu_torture_onoff() references the function __cpuinit cpu_up(). This is often because rcu_torture_onoff lacks a __cpuinit annotation or the annotation of cpu_up is wrong. 
This commit therefore adds a __cpuinit annotation so the warning goes away. Signed-off-by: Heiko Carstens Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e29edc374ce..a58ac285fc6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1399,7 +1399,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int +static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; @@ -1447,7 +1447,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int +static int __cpuinit rcu_torture_onoff_init(void) { if (onoff_interval <= 0) -- cgit v1.2.3-70-g09d2 From 951880e634a79884236a575b896abf55c39ae0bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Jan 2012 10:19:41 -0800 Subject: Revert "capabitlies: ns_capable can use the cap helpers rather than lsm call" This reverts commit d2a7009f0bb03fa22ad08dd25472efa0568126b9. J. R. Okajima explains: "After this commit, I am afraid access(2) on NFS may not work correctly. The scenario based upon my guess. - access(2) overrides the credentials. - calls inode_permission() -- ... -- generic_permission() -- ns_capable(). - while the old ns_capable() calls security_capable(current_cred()), the new ns_capable() calls has_ns_capability(current) -- security_capable(__task_cred(t)). current_cred() returns current->cred which is effective (overridden) credentials, but __task_cred(current) returns current->real_cred (the NFSD's credential). And the overridden credentials by access(2) lost." Requested-by: J. R. Okajima Acked-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 0fcf1c14a29..3f1adb6c647 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (has_ns_capability(current, ns, cap)) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3-70-g09d2 From 5ef30ee53b187786e64bdc1f8109e39d17f2ce58 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: make filetype matching consistent with other filters Every other filter that matches part of the inodes list collected by audit will match against any of the inodes on that list. The filetype matching, however, had a strange way of doing things. It allowed userspace to indicate whether it should match on the first or the second name collected by the kernel. Name collection ordering seems like a kernel internal and making userspace rules get that right just seems like a bad idea. As it turns out the userspace audit writers had no idea it was doing this and thus never overloaded the value field. The kernel always checked the first name collected, which for the tested rules was always correct. This patch just makes the filetype matching like the major, minor, inode, and LSM rules in that it will match against any of the names collected. It also changes the rule validation to reject the old unused rule types. No one knew it was there. No one used it. Why keep around the extra code?
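To make the new validation concrete, here is a small sketch of what a filetype rule's value may now carry (illustrative assignments only; the S_IF* constants come from the kernel's stat definitions):

	/* accepted: f->val is a pure S_IFMT file type */
	f->val = S_IFREG;	/* 0100000: matches any collected name that is a regular file */

	/* rejected: the old "which name" overload no longer validates */
	f->val = 1 | S_IFLNK;	/* low bits set, so f->val & ~S_IFMT is non-zero -> -EINVAL */

In auditctl terms this corresponds to a plain rule such as -F filetype=file (assuming the usual userspace field name); the kernel now walks every collected name rather than a single indexed entry.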
Signed-off-by: Eric Paris --- kernel/auditfilter.c | 4 ++-- kernel/auditsc.c | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d67..d94dde82c3c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -385,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -536,7 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; default: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b..a09c5031705 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -305,21 +305,20 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int which) +static int audit_match_filetype(struct audit_context *ctx, int val) { - unsigned index = which & ~S_IFMT; - umode_t mode = which & S_IFMT; + int index; + umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; + for (index = 0; index < ctx->name_count; index++) { + if ((ctx->names[index].ino != -1) && + ((ctx->names[index].mode & S_IFMT) == mode)) + return 1; + } + return 0; } /* -- cgit v1.2.3-70-g09d2 From 5195d8e217a78697152d64fc09a16e063a022465 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: dynamically allocate audit_names when not enough space is in the names array This patch does 2 things. First it reduces the number of audit_names allocated in every audit context from 20 to 5. 5 should be enough for all 'normal' syscalls (rename being the worst). Some syscalls can still touch more than 5 inodes, such as mount. When the rpc filesystem is mounted it will create inodes, and those can exceed 5. To handle that problem this patch will dynamically allocate audit_names if it needs more than 5. This should decrease the typical memory usage while still supporting all the possible kernel operations. Signed-off-by: Eric Paris --- kernel/auditsc.c | 403 +++++++++++++++++++++++++++++-------------------- 1 file changed, 215 insertions(+), 188 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a09c5031705..1a92d61ddd2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -71,8 +71,9 @@ #include "audit.h" /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -101,9 +102,8 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device.
*/ struct audit_names { + struct list_head list; /* audit_context->names_list */ const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -113,6 +113,14 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + bool name_put; /* call __putname() for this name */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit + */ + bool should_free; }; struct audit_aux_data { @@ -174,8 +182,17 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - int name_count; - struct audit_names names[AUDIT_NAMES]; + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -307,17 +324,18 @@ static int audit_match_perm(struct audit_context *ctx, int mask) static int audit_match_filetype(struct audit_context *ctx, int val) { - int index; + struct audit_names *n; umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - for (index = 0; index < ctx->name_count; index++) { - if ((ctx->names[index].ino != -1) && - ((ctx->names[index].mode & S_IFMT) == mode)) + list_for_each_entry(n, &ctx->names_list, list) { + if ((n->ino != -1) && + ((n->mode & S_IFMT) == mode)) return 1; } + return 0; } @@ -456,13 +474,14 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, j, need_sid = 1; + int i, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + struct audit_names *n; int result = 0; switch (f->type) { @@ -525,8 +544,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(MAJOR(name->dev), f->op, f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val)) { ++result; break; } @@ -538,8 +557,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(MINOR(name->dev), f->op, f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MINOR(n->dev), f->op, f->val)) { ++result; break; } @@ -550,8 +569,8 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + list_for_each_entry(n, 
&ctx->names_list, list) { + if (audit_comparator(n->ino, f->op, f->val)) { ++result; break; } @@ -606,11 +625,10 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (security_audit_rule_match(n->osid, f->type, + f->op, f->lsm_rule, + ctx)) { ++result; break; } @@ -721,40 +739,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* At syscall exit time, this filter is called if any audit_names[] have been +/* + * Given an audit_name check the inode hash table to see if they match. + * Called holding the rcu read lock to protect the use of audit_inode_hash + */ +static int audit_filter_inode_name(struct task_struct *tsk, + struct audit_names *n, + struct audit_context *ctx) { + int word, bit; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + struct audit_entry *e; + enum audit_state state; + + word = AUDIT_WORD(ctx->major); + bit = AUDIT_BIT(ctx->major); + + if (list_empty(list)) + return 0; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { + ctx->current_state = state; + return 1; + } + } + + return 0; +} + +/* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. + * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
*/ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - int i; - struct audit_entry *e; - enum audit_state state; + struct audit_names *n; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - if (list_empty(list)) - continue; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_filter_inode_name(tsk, n, ctx)) + break; } rcu_read_unlock(); } @@ -798,7 +829,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - int i; + struct audit_names *n, *next; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -809,10 +840,9 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) { + list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); + n->name, n->name ?: "(null)"); } dump_stack(); return; @@ -823,9 +853,12 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); + list_for_each_entry_safe(n, next, &context->names_list, list) { + list_del(&n->list); + if (n->name && n->name_put) + __putname(n->name); + if (n->should_free) + kfree(n); } context->name_count = 0; path_put(&context->pwd); @@ -863,6 +896,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); + INIT_LIST_HEAD(&context->names_list); return context; } @@ -1323,6 +1357,68 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static void audit_log_name(struct audit_context *context, struct audit_names *n, + int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", record_num); + + if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, "name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + 
MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1330,6 +1426,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; + struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1469,66 +1566,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); - } + i = 0; + list_for_each_entry(n, &context->names_list, list) + audit_log_name(context, n, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1820,6 +1861,30 @@ retry: #endif } +static struct audit_names *audit_alloc_name(struct audit_context *context) +{ + struct audit_names *aname; + + if (context->name_count < AUDIT_NAMES) { + aname = &context->preallocated_names[context->name_count]; + memset(aname, 0, sizeof(*aname)); + } else { + aname = kzalloc(sizeof(*aname), GFP_NOFS); + if (!aname) + return NULL; + aname->should_free = true; + } + + aname->ino = (unsigned long)-1; + list_add_tail(&aname->list, &context->names_list); + + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return aname; +} + /** * audit_getname - add a name to the list * @name: name to add @@ -1830,6 +1895,7 @@ retry: void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; + struct audit_names *n; if (IS_ERR(name) || !name) return; @@ -1842,13 +1908,15 @@ void __audit_getname(const char *name) #endif return; } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - 
context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; + + n = audit_alloc_name(context); + if (!n) + return; + + n->name = name; + n->name_len = AUDIT_NAME_FULL; + n->name_put = true; + if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -1870,12 +1938,13 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { + struct audit_names *n; int i; - for (i = 0; i < context->name_count; i++) + + list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } + n->name, n->name ?: "(null)"); + } #endif __putname(name); } @@ -1896,39 +1965,11 @@ void audit_putname(const char *name) #endif } -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - - static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; - memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); - memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); - name->fcap.fE = 0; - name->fcap_ver = 0; - if (!dentry) return 0; @@ -1968,30 +2009,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { - int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; + struct audit_names *n; if (!context->in_syscall) return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? 
*/ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; + + list_for_each_entry_reverse(n, &context->names_list, list) { + if (n->name && (n->name == name)) + goto out; } + + /* unable to find the name from a previous getname() */ + n = audit_alloc_name(context); + if (!n) + return; +out: handle_path(dentry); - audit_copy_inode(&context->names[idx], dentry, inode); + audit_copy_inode(n, dentry, inode); } /** @@ -2010,11 +2046,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { - int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; + struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2024,9 +2060,7 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2039,9 +2073,7 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2059,34 +2091,29 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - if (audit_inc_name_count(context, parent)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], NULL, parent); + audit_copy_inode(n, NULL, parent); } if (!found_child) { - if (audit_inc_name_count(context, inode)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; + n->name = found_parent; + n->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; + n->name_put = false; } if (inode) - audit_copy_inode(&context->names[idx], NULL, inode); - else - context->names[idx].ino = (unsigned long)-1; + audit_copy_inode(n, NULL, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); -- cgit v1.2.3-70-g09d2 From 3035c51e8ac0512686ceb9f2bd1d13bdc6e4fb29 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: drop the meaningless and format breaking word 'user' userspace audit messages look like so: type=USER msg=audit(1271170549.415:24710): user pid=14722 uid=0 auid=500 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 msg='' That third field just says 'user'. That's useless and doesn't follow the key=value pair we are trying to enforce. We already know it came from the user based on the record type. Kill that word. Die. 
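With the patch applied the same record would read (illustrative; simply the sample above minus the dropped word):

type=USER msg=audit(1271170549.415:24710): pid=14722 uid=0 auid=500 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 msg=''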
Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2c1d6ab7106..00efe4758c8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); -- cgit v1.2.3-70-g09d2 From 16c174bd95cb07c9d0ad3fcd8c70f9cea7214c9d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: check current inode and containing object when filtering on major and minor The audit system has the ability to filter on the major and minor number of the device containing the inode being operated upon. Let's say that /dev/sda1 has major,minor 8,1 and that we mount /dev/sda1 on /boot. Now let's say we add a watch with a filter on 8,1. If we proceed to open an inode inside /boot, such as /boot/vmlinuz, we will match the major,minor filter. Let's instead assume that one were to use a tool like debugfs and were to open /dev/sda1 directly and to modify its contents. We might hope that this would also be logged, but it isn't. The rules will check the major,minor of the device containing /dev/sda1. In other words the rule would match on the major/minor of the tmpfs mounted at /dev. I believe these rules should trigger on either device. The man page is devoid of useful information about the intended semantics. It only seems logical that if you want to know everything that happened on a major,minor that would include things that happened to the device itself...
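The dev/rdev distinction behind this change is visible from userspace with plain stat(2). A minimal sketch, reusing the paths from the example above (hypothetical machine, of course):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(void)
{
	struct stat st;

	if (stat("/boot/vmlinuz", &st) == 0)	/* a file on the watched filesystem */
		printf("file: dev=%u:%u\n", major(st.st_dev), minor(st.st_dev));

	if (stat("/dev/sda1", &st) == 0)	/* the device node itself */
		printf("node: dev=%u:%u rdev=%u:%u\n",
		       major(st.st_dev), minor(st.st_dev),
		       major(st.st_rdev), minor(st.st_rdev));
	return 0;
}

For the file, st_dev is 8:1; for the node, st_dev names the filesystem holding /dev while st_rdev is 8:1. With this patch a rule on major 8, minor 1 (e.g. auditctl -a always,exit -S open -F devmajor=8 -F devminor=1, assuming the usual userspace field names) matches both opens, since the kernel now compares both dev and rdev.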
Signed-off-by: Eric Paris --- kernel/auditsc.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1a92d61ddd2..7c495147c3d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -540,12 +540,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) - result = audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { + if (name) { + if (audit_comparator(MAJOR(name->dev), f->op, f->val) || + audit_comparator(MAJOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MAJOR(n->dev), f->op, f->val)) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val) || + audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -553,12 +555,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { + if (name) { + if (audit_comparator(MINOR(name->dev), f->op, f->val) || + audit_comparator(MINOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MINOR(n->dev), f->op, f->val)) { + if (audit_comparator(MINOR(n->dev), f->op, f->val) || + audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } -- cgit v1.2.3-70-g09d2 From 85e7bac33b8d5edafc4e219c7dfdb3d48e0b4e31 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: seccomp: audit abnormal end to a process due to seccomp The audit system likes to collect information about processes that end abnormally (SIGSEGV) as this may be useful intrusion detection information. This patch adds audit support to collect information when seccomp forces a task to exit because of misbehavior in a similar way.
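A minimal way to exercise the new record from userspace (a sketch; assumes audit is enabled and the kernel carries this patch):

#include <fcntl.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>

int main(void)
{
	/* strict mode permits only read(), write(), exit() and sigreturn() */
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);

	/*
	 * Any other syscall enters __secure_computing(), which with this
	 * patch emits an AUDIT_ANOM_ABEND record before the SIGKILL.
	 */
	open("/dev/null", O_RDONLY);

	return 0;	/* never reached: the open() above is fatal */
}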
Signed-off-by: Eric Paris --- include/linux/audit.h | 8 ++++++++ kernel/auditsc.c | 50 +++++++++++++++++++++++++++++--------------------- kernel/seccomp.c | 2 ++ 3 files changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 426ab9f4dd8..6e1c533f9b4 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -430,6 +430,7 @@ extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); extern void __audit_inode_child(const struct dentry *dentry, const struct inode *parent); +extern void __audit_seccomp(unsigned long syscall); extern void __audit_ptrace(struct task_struct *t); static inline int audit_dummy_context(void) @@ -453,6 +454,12 @@ static inline void audit_inode_child(const struct dentry *dentry, } void audit_core_dumps(long signr); +static inline void audit_seccomp(unsigned long syscall) +{ + if (unlikely(!audit_dummy_context())) + __audit_seccomp(syscall); +} + static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) @@ -558,6 +565,7 @@ extern int audit_signals; #define audit_inode(n,d) do { (void)(d); } while (0) #define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) +#define audit_seccomp(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) #define audit_get_sessionid(t) (-1) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7c495147c3d..e9bcb93800d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2529,6 +2529,25 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + uid_t auid, uid; + gid_t gid; + unsigned int sessionid; + + auid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + current_uid_gid(&uid, &gid); + + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + audit_log_task_context(ab); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " reason="); + audit_log_string(ab, reason); + audit_log_format(ab, " sig=%ld", signr); +} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2539,10 +2558,6 @@ void __audit_mmap_fd(int fd, int flags) void audit_core_dumps(long signr) { struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current), uid; - gid_t gid; - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2551,24 +2566,17 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - current_uid_gid(&uid, &gid); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; + audit_log_abend(ab, "memory violation", signr); + audit_log_end(ab); +} - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); +void __audit_seccomp(unsigned long syscall) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + 
audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_format(ab, " syscall=%ld", syscall); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631..e8d76c5895e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -6,6 +6,7 @@ * This defines a simple but solid secure-computing mode. */ +#include #include #include #include @@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + audit_seccomp(this_syscall); do_exit(SIGKILL); } -- cgit v1.2.3-70-g09d2 From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: Audit: push audit success and retcode into arch ptrace.h The audit system previously expected arches calling to audit_syscall_exit to supply as arguments if the syscall was a success and what the return code was. Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things by converting from negative retcodes to an audit internal magic value stating success or failure. This helper was wrong and could indicate that a valid pointer returned to userspace was a failed syscall. The fix is to fix the layering foolishness. We now pass audit_syscall_exit a struct pt_reg and it in turns calls back into arch code to collect the return value and to determine if the syscall was a success or failure. We also define a generic is_syscall_success() macro which determines success/failure based on if the value is < -MAX_ERRNO. This works for arches like x86 which do not use a separate mechanism to indicate syscall failure. We make both the is_syscall_success() and regs_return_value() static inlines instead of macros. The reason is because the audit function must take a void* for the regs. (uml calls theirs struct uml_pt_regs instead of just struct pt_regs so audit_syscall_exit can't take a struct pt_regs). Since the audit function takes a void* we need to use static inlines to cast it back to the arch correct structure to dereference it. The other major change is that on some arches, like ia64, MIPS and ppc, we change regs_return_value() to give us the negative value on syscall failure. THE only other user of this macro, kretprobe_example.c, won't notice and it makes the value signed consistently for the audit functions across all archs. In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old audit code as the return value. But the ptrace_64.h code defined the macro regs_return_value() as regs[3]. I have no idea which one is correct, but this patch now uses the regs_return_value() function, so it now uses regs[3]. For powerpc we previously used regs->result but now use the regs_return_value() function which uses regs->gprs[3]. regs->gprs[3] is always positive so the regs_return_value(), much like ia64 makes it negative before calling the audit code when appropriate. Signed-off-by: Eric Paris Acked-by: H. Peter Anvin [for x86 portion] Acked-by: Tony Luck [for ia64] Acked-by: Richard Weinberger [for uml] Acked-by: David S. 
Miller [for sparc] Acked-by: Ralf Baechle [for mips] Acked-by: Benjamin Herrenschmidt [for ppc] --- arch/ia64/include/asm/ptrace.h | 13 ++++++++++++- arch/ia64/kernel/ptrace.c | 9 +-------- arch/microblaze/include/asm/ptrace.h | 5 +++++ arch/microblaze/kernel/ptrace.c | 3 +-- arch/mips/include/asm/ptrace.h | 14 +++++++++++++- arch/mips/kernel/ptrace.c | 4 +--- arch/powerpc/include/asm/ptrace.h | 13 ++++++++++++- arch/powerpc/kernel/ptrace.c | 4 +--- arch/s390/include/asm/ptrace.h | 6 +++++- arch/s390/kernel/ptrace.c | 4 +--- arch/sh/include/asm/ptrace_32.h | 5 ++++- arch/sh/include/asm/ptrace_64.h | 5 ++++- arch/sh/kernel/ptrace_32.c | 4 +--- arch/sh/kernel/ptrace_64.c | 4 +--- arch/sparc/include/asm/ptrace.h | 10 +++++++++- arch/sparc/kernel/ptrace_64.c | 11 +---------- arch/um/kernel/ptrace.c | 4 ++-- arch/x86/ia32/ia32entry.S | 10 +++++----- arch/x86/kernel/entry_32.S | 8 ++++---- arch/x86/kernel/entry_64.S | 10 +++++----- arch/x86/kernel/ptrace.c | 3 +-- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/um/shared/sysdep/ptrace.h | 5 +++++ include/linux/audit.h | 22 ++++++++++++++-------- include/linux/ptrace.h | 10 ++++++++++ kernel/auditsc.c | 16 ++++++++++++---- 26 files changed, 132 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index f5cb27614e3..68c98f5b3ca 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->ar_bspstore; } -#define regs_return_value(regs) ((regs)->r8) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return regs->r10 != -1; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->r8; + else + return -regs->r8; +} /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. 
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 8848f43d819..2c154088cce 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1268,14 +1268,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, { int step; - if (unlikely(current->audit_context)) { - int success = AUDITSC_RESULT(regs.r10); - long result = regs.r8; - - if (success != AUDITSC_SUCCESS) - result = -result; - audit_syscall_exit(success, result); - } + audit_syscall_exit(®s); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index 816bee64b19..94e92c80585 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h @@ -61,6 +61,11 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->r3; +} + #else /* __KERNEL__ */ /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index 043cb58f9c4..f564b1bfd38 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -159,8 +159,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); + audit_syscall_exit(regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f343e..7d409505df2 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child, */ #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) -#define regs_return_value(_regs) ((_regs)->regs[2]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !regs->regs[7]; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->regs[2]; + else + return -regs->regs[2]; +} + #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 4e6ea1ffad4..ab0f1963a7b 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -572,9 +572,7 @@ out: */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), - -regs->regs[2]); + audit_syscall_exit(regs); if (!(current->ptrace & PT_PTRACED)) return; diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 48223f9b872..78a205162fd 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -86,7 +86,18 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define kernel_stack_pointer(regs) ((regs)->gpr[1]) -#define regs_return_value(regs) ((regs)->gpr[3]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->ccr & 0x10000000); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->gpr[3]; + else + return -regs->gpr[3]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct 
pt_regs *regs); diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 5de73dbd15c..09d31c12a5e 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1748,9 +1748,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->result); diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 56da355678f..aeb77f01798 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -541,9 +541,13 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 573bc29551e..f5275860098 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -751,9 +751,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), - regs->gprs[2]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h index 6c2239cca1a..2d3e906aa72 100644 --- a/arch/sh/include/asm/ptrace_32.h +++ b/arch/sh/include/asm/ptrace_32.h @@ -76,7 +76,10 @@ struct pt_dspregs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) -#define regs_return_value(_regs) ((_regs)->regs[0]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[0]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h index bf9be7764d6..eb3fcceaf64 100644 --- a/arch/sh/include/asm/ptrace_64.h +++ b/arch/sh/include/asm/ptrace_64.h @@ -13,7 +13,10 @@ struct pt_regs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -#define regs_return_value(_regs) ((_regs)->regs[3]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[3]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 92b3c276339..c0b5c179d27 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -530,9 +530,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), - regs->regs[0]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[0]); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index c8f97649f35..ba720d68643 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -548,9 +548,7 @@ asmlinkage void 
do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), - regs->regs[9]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index a0e1bcf843a..c00c3b5c280 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -207,7 +207,15 @@ do { current_thread_info()->syscall_noerror = 1; \ #define instruction_pointer(regs) ((regs)->tpc) #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) -#define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->u_regs[UREG_I0]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *); #else diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96ee50a8066..c73c8c50f11 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1086,17 +1086,8 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) { - unsigned long tstate = regs->tstate; - int result = AUDITSC_SUCCESS; + audit_syscall_exit(regs); - if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) - result = AUDITSC_FAILURE; - - audit_syscall_exit(result, regs->u_regs[UREG_I0]); - } -#endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->u_regs[UREG_G1]); diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index c9da32b0c70..2ccf25c42fe 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -175,8 +175,8 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) UPT_SYSCALL_ARG2(regs), UPT_SYSCALL_ARG3(regs), UPT_SYSCALL_ARG4(regs)); - else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), - UPT_SYSCALL_RET(regs)); + else + audit_syscall_exit(regs); } /* Fake a debug trap */ diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3e274564f6b..64ced0b8f8f 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -208,12 +209,11 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21b4dd..a22facf06f0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -466,11 +467,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc8..e51393dd93a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -563,17 +564,16 @@ auditsys: jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? 
*/ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb..8b021875877 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0..af17e1c966d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b1621747..5ef9344a8b2 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -3,3 +3,8 @@ #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} diff --git a/include/linux/audit.h b/include/linux/audit.h index 6e1c533f9b4..3d65e4b3ba0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,6 +26,7 @@ #include #include +#include /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system @@ -408,10 +409,6 @@ struct audit_field { void *lsm_rule; }; -#define AUDITSC_INVALID 0 -#define AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 -#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); @@ -424,7 +421,7 @@ extern void audit_free(struct task_struct *task); extern void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); -extern void audit_syscall_exit(int failed, long return_code); +extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); @@ -438,6 +435,15 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_syscall_exit(void *pt_regs) +{ + if (unlikely(current->audit_context)) { + int success = is_syscall_success(pt_regs); + int return_code = regs_return_value(pt_regs); + + __audit_syscall_exit(success, return_code); + } +} static inline void audit_getname(const char *name) { if (unlikely(!audit_dummy_context())) @@ -551,12 +557,12 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int 
audit_signals; -#else +#else /* CONFIG_AUDITSYSCALL */ #define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(f,r) do { ; } while (0) +#define audit_syscall_exit(r) do { ; } while (0) #define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) @@ -587,7 +593,7 @@ extern int audit_signals; #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -#endif +#endif /* CONFIG_AUDITSYSCALL */ #ifdef CONFIG_AUDIT /* These are defined in audit.c */ diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 800f113bea6..dd4cefa6519 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -112,6 +112,7 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ +#include /* for IS_ERR_VALUE */ extern long arch_ptrace(struct task_struct *child, long request, @@ -265,6 +266,15 @@ static inline void ptrace_release_task(struct task_struct *task) #define force_successful_syscall_return() do { } while (0) #endif +#ifndef is_syscall_success +/* + * On most systems we can tell if a syscall is a success based on if the retval + * is an error value. On some systems like ia64 and powerpc they have different + * indicators of success/failure and must define their own. + */ +#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) +#endif + /* * should define the following things inside #ifdef __KERNEL__. * diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e9bcb93800d..3d285380818 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,6 +70,11 @@ #include "audit.h" +/* flags stating the success for a syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child) /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @pt_regs: syscall registers * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child) * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; + context = audit_get_context(tsk, success, return_code); if (likely(!context)) return; -- cgit v1.2.3-70-g09d2 From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: inline audit_syscall_entry to reduce burden on archs Every arch calls: if (unlikely(current->audit_context)) audit_syscall_entry() which requires knowledge about audit (the existence of audit_context) in the arch code.
Just do it all in static inline in audit.h so that arch's can remain blissfully ignorant. Signed-off-by: Eric Paris --- arch/ia64/kernel/ptrace.c | 9 +-------- arch/microblaze/kernel/ptrace.c | 6 ++---- arch/mips/kernel/ptrace.c | 7 +++---- arch/powerpc/kernel/ptrace.c | 26 ++++++++++++-------------- arch/s390/kernel/ptrace.c | 11 +++++------ arch/sh/kernel/ptrace_32.c | 7 +++---- arch/sh/kernel/ptrace_64.c | 7 +++---- arch/sparc/kernel/ptrace_64.c | 17 ++++++++--------- arch/um/kernel/ptrace.c | 20 +++++++++----------- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/ptrace.c | 22 ++++++++++------------ arch/xtensa/kernel/ptrace.c | 3 +-- include/linux/audit.h | 13 ++++++++++--- kernel/auditsc.c | 2 +- 16 files changed, 72 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 2c154088cce..dad91661ddf 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); - if (unlikely(current->audit_context)) { - long syscall; - int arch; - syscall = regs.r15; - arch = AUDIT_ARCH_IA64; - - audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); - } + audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); return 0; } diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index f564b1bfd38..6eb2aa927d8 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; - if (unlikely(current->audit_context)) - audit_syscall_entry(EM_MICROBLAZE, regs->r12, - regs->r5, regs->r6, - regs->r7, regs->r8); + audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, + regs->r7, regs->r8); return ret ?: regs->r12; } diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index ab0f1963a7b..7786b608d93 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) } out: - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[2], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[2], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); } /* diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 09d31c12a5e..5b43325402b 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); - if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(AUDIT_ARCH_PPC64, - regs->gpr[0], - regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, + regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else #endif - audit_syscall_entry(AUDIT_ARCH_PPC, - regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); - } + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + 
regs->gpr[6] & 0xffffffff); return ret ?: regs->gpr[0]; } diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index f5275860098..9d82ed4bcb2 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -740,12 +740,11 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - if (unlikely(current->audit_context)) - audit_syscall_entry(is_compat_task() ? - AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); + audit_syscall_entry(is_compat_task() ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); return ret ?: regs->gprs[2]; } diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index c0b5c179d27..a3e65156376 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[0]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[3], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[3], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); return ret ?: regs->regs[0]; } diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index ba720d68643..3d0080b5c97 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[9]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[1], - regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); + audit_syscall_entry(audit_arch(), regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); return ret ?: regs->regs[9]; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index c73c8c50f11..9388844cd88 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1071,15 +1071,14 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); - if (unlikely(current->audit_context) && !ret) - audit_syscall_entry((test_thread_flag(TIF_32BIT) ? - AUDIT_ARCH_SPARC : - AUDIT_ARCH_SPARC64), - regs->u_regs[UREG_G1], - regs->u_regs[UREG_I0], - regs->u_regs[UREG_I1], - regs->u_regs[UREG_I2], - regs->u_regs[UREG_I3]); + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? 
+ AUDIT_ARCH_SPARC : + AUDIT_ARCH_SPARC64), + regs->u_regs[UREG_G1], + regs->u_regs[UREG_I0], + regs->u_regs[UREG_I1], + regs->u_regs[UREG_I2], + regs->u_regs[UREG_I3]); return ret; } diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 2ccf25c42fe..06b19039050 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; int tracesysgood; - if (unlikely(current->audit_context)) { - if (!entryexit) - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(regs), - UPT_SYSCALL_ARG1(regs), - UPT_SYSCALL_ARG2(regs), - UPT_SYSCALL_ARG3(regs), - UPT_SYSCALL_ARG4(regs)); - else - audit_syscall_exit(regs); - } + if (!entryexit) + audit_syscall_entry(HOST_AUDIT_ARCH, + UPT_SYSCALL_NR(regs), + UPT_SYSCALL_ARG1(regs), + UPT_SYSCALL_ARG2(regs), + UPT_SYSCALL_ARG3(regs), + UPT_SYSCALL_ARG4(regs)); + else + audit_syscall_exit(regs); /* Fake a debug trap */ if (is_singlestep) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 025f0f01d25..cecfd9a8f73 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -192,7 +192,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a22facf06f0..1ccd742eba1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -456,7 +456,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e51393dd93a..1ca66b65012 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -549,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. 
*/ auditsys: @@ -559,7 +559,7 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8b021875877..50267386b76 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a0d042aa296..2dff698ab02 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); #if 0 - if (unlikely(current->audit_context)) - audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); + audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); #endif } diff --git a/include/linux/audit.h b/include/linux/audit.h index 3d65e4b3ba0..f56ce2669b8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -418,9 +418,9 @@ extern int audit_classify_arch(int arch); extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); extern void audit_free(struct task_struct *task); -extern void audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3); +extern void __audit_syscall_entry(int arch, + int major, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); @@ -435,6 +435,13 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_syscall_entry(int arch, int major, unsigned long a0, + unsigned long a1, unsigned long a2, + unsigned long a3) +{ + if (unlikely(!audit_dummy_context())) + __audit_syscall_entry(arch, major, a0, a1, a2, a3); +} static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(current->audit_context)) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3d285380818..b408100dd6e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). 
*/ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { -- cgit v1.2.3-70-g09d2 From 997f5b6444f4608692ec807fb802fd9767c80e76 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: remove AUDIT_SETUP_CONTEXT as it isn't used Audit contexts have 3 states. Disabled, which doesn't collect anything, build, which collects info but might not emit it, and record, which collects and emits. There is a 4th state, setup, which isn't used. Get rid of it. Signed-off-by: Eric Paris --- kernel/audit.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2..81676680337 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -36,12 +36,8 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall + * and fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it -- cgit v1.2.3-70-g09d2 From 56179a6ec65a56e0279a58e35cb450d38f061b94 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: drop some potentially inadvisable likely notations The audit code makes heavy use of likely() and unlikely() macros, but they don't always make sense. Drop any that seem questionable and let the computer do its thing. Signed-off-by: Eric Paris --- kernel/auditsc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b408100dd6e..d7382c2aaa9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -805,7 +805,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (likely(!context)) + if (!context) return NULL; context->return_valid = return_valid; @@ -928,7 +928,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing.
*/ state = audit_filter_task(tsk, &key); - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return 0; if (!(context = audit_alloc_context(state))) { @@ -1599,7 +1599,7 @@ void audit_free(struct task_struct *tsk) struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (likely(!context)) + if (!context) return; /* Check for system calls that do not go through the exit @@ -1640,7 +1640,7 @@ void __audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (unlikely(!context)) + if (!context) return; /* @@ -1697,7 +1697,7 @@ void __audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return; context->serial = 0; @@ -1748,7 +1748,7 @@ void __audit_syscall_exit(int success, long return_code) success = AUDITSC_FAILURE; context = audit_get_context(tsk, success, return_code); - if (likely(!context)) + if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) -- cgit v1.2.3-70-g09d2 From 07c49417877f8658a6aa0ad9b4e21e4fd4df11b6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline checks for not needing to collect aux records A number of audit hooks make function calls before they determine that auxiliary records do not need to be collected. Do those checks as static inlines since the most common case is going to be that records are not needed and we can skip the function call overhead. Signed-off-by: Eric Paris --- include/linux/audit.h | 23 ++++++++++++++++++++--- kernel/auditsc.c | 15 +++------------- 2 files changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index f56ce2669b8..cf16faff6b8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -489,9 +489,9 @@ extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); extern void audit_log_task_context(struct audit_buffer *ab); extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int audit_bprm(struct linux_binprm *bprm); -extern void audit_socketcall(int nargs, unsigned long *args); -extern int audit_sockaddr(int len, void *addr); +extern int __audit_bprm(struct linux_binprm *bprm); +extern void __audit_socketcall(int nargs, unsigned long *args); +extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern int audit_set_macxattr(const char *name); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); @@ -519,6 +519,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } +static inline int audit_bprm(struct linux_binprm *bprm) +{ + if (unlikely(!audit_dummy_context())) + return __audit_bprm(bprm); + return 0; +} +static inline void audit_socketcall(int nargs, unsigned long *args) +{ + if (unlikely(!audit_dummy_context())) + __audit_socketcall(nargs, args); +} +static inline int audit_sockaddr(int len, void *addr) +{ + if (unlikely(!audit_dummy_context())) + return __audit_sockaddr(len, addr); + return 0; +} static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context()))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d7382c2aaa9..e1062f66b01 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2309,14 +2309,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2337,13 +2334,10 @@ int audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void audit_socketcall(int nargs, unsigned long *args) +void __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return; - context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2369,13 +2363,10 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_sockaddr(int len, void *a) +int __audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return 0; - if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) -- cgit v1.2.3-70-g09d2 From a4ff8dba7d8ce5ceb43fb27df66292251cc73bdc Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline audit_free to simplify the look of generic code make the conditional a static inline instead of doing it in generic code. Signed-off-by: Eric Paris --- include/linux/audit.h | 7 ++++++- kernel/auditsc.c | 2 +- kernel/exit.c | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 4f1efe3e861..8eb8bda749b 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -417,7 +417,7 @@ extern int audit_classify_arch(int arch); /* Public API */ extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); -extern void audit_free(struct task_struct *task); +extern void __audit_free(struct task_struct *task); extern void __audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); @@ -435,6 +435,11 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_free(struct task_struct *task) +{ + if (unlikely(task->audit_context)) + __audit_free(task); +} static inline void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e1062f66b01..7aaeb38b262 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1594,7 +1594,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void audit_free(struct task_struct *tsk) +void __audit_free(struct task_struct *tsk) { struct audit_context *context; diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb5..88dcbbc446f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -964,8 +964,7 @@ NORET_TYPE void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if 
(unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); -- cgit v1.2.3-70-g09d2 From 7ff68e53ece8c175d2951bb8a30b3cce8f9c5579 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: reject entry,always rules We deprecated entry,always rules a long time ago. Reject those rules as invalid. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d94dde82c3c..903caa269b5 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: + if (rule->action == AUDIT_ALWAYS) + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { -- cgit v1.2.3-70-g09d2 From 6422e78de6880c66a82af512d9bd0c85eb62e661 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: remove audit_finish_fork as it can't be called Audit entry,always rules are not allowed and are automatically changed into exit,always rules in userspace. The kernel refuses to load such rules. Thus a task in the middle of a syscall (and thus in audit_finish_fork()) can only be in one of two states: AUDIT_BUILD_CONTEXT or AUDIT_DISABLED. Since the current task cannot be in AUDIT_RECORD_CONTEXT we aren't ever going to actually use the code in audit_finish_fork() since it will return without doing anything. Thus drop the code.
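The dead branch is easy to convince yourself of with a stand-alone C sketch of the guard that used to sit at the top of audit_finish_fork() (toy names only, not the kernel code): with entry,always rejected, neither reachable state ever triggers the copy.

#include <stdio.h>

enum toy_audit_state { TOY_DISABLED, TOY_BUILD_CONTEXT, TOY_RECORD_CONTEXT };

/* same shape as the removed guard: the copy only ran in RECORD_CONTEXT */
static int toy_finish_fork_copies(enum toy_audit_state state, int in_syscall)
{
    if (!in_syscall || state != TOY_RECORD_CONTEXT)
        return 0;   /* nothing copied */
    return 1;       /* would have copied the parent's context */
}

int main(void)
{
    /* a task in the middle of a syscall can only be in these two states */
    printf("%d %d\n", toy_finish_fork_copies(TOY_DISABLED, 1),
           toy_finish_fork_copies(TOY_BUILD_CONTEXT, 1)); /* prints: 0 0 */
    return 0;
}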
Signed-off-by: Eric Paris --- include/linux/audit.h | 2 -- kernel/auditsc.c | 20 -------------------- kernel/fork.c | 2 -- 3 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8eb8bda749b..67b66c37a25 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -415,7 +415,6 @@ extern int audit_classify_arch(int arch); #ifdef CONFIG_AUDITSYSCALL /* These are defined in auditsc.c */ /* Public API */ -extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_syscall_entry(int arch, @@ -586,7 +585,6 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ -#define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7aaeb38b262..4d8920f5ab8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1707,26 +1707,6 @@ void __audit_syscall_entry(int arch, int major, context->ppid = 0; } -void audit_finish_fork(struct task_struct *child) -{ - struct audit_context *ctx = current->audit_context; - struct audit_context *p = child->audit_context; - if (!p || !ctx) - return; - if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) - return; - p->arch = ctx->arch; - p->major = ctx->major; - memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); - p->ctime = ctx->ctime; - p->dummy = ctx->dummy; - p->in_syscall = ctx->in_syscall; - p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); - p->ppid = current->pid; - p->prio = ctx->prio; - p->current_state = ctx->current_state; -} - /** * audit_syscall_exit - deallocate audit context after a system call * @pt_regs: syscall registers diff --git a/kernel/fork.c b/kernel/fork.c index 443f5125f11..c1e5c21f48c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1525,8 +1525,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - audit_finish_fork(p); - /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that -- cgit v1.2.3-70-g09d2 From efaffd6e4417860c67576ac760dd6e8bbd15f006 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: allow matching on obj_uid Allow syscall exit filter matching based on the uid of the owner of an inode used in a syscall. 
aka: auditctl -a always,exit -S open -F obj_uid=0 -F perm=wa Signed-off-by: Eric Paris --- include/linux/audit.h | 1 + kernel/auditfilter.c | 1 + kernel/auditsc.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 67b66c37a25..55cb3daaf47 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -223,6 +223,7 @@ #define AUDIT_PERM 106 #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 +#define AUDIT_OBJ_UID 109 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 903caa269b5..13e997423dc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -461,6 +461,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: + case AUDIT_OBJ_UID: break; case AUDIT_ARCH: entry->rule.arch_f = f; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4d8920f5ab8..5cf3ecc0151 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -586,6 +586,18 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_OBJ_UID: + if (name) { + result = audit_comparator(name->uid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->uid, f->op, f->val)) { + ++result; + break; + } + } + } + break; case AUDIT_WATCH: if (name) result = audit_watch_compare(rule->watch, name->ino, name->dev); -- cgit v1.2.3-70-g09d2 From 54d3218b31aee5bc9c859ae60fbde933d922448b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: allow audit matching on inode gid Much like the ability to filter audit on the uid of an inode collected, we should be able to filter on the gid of the inode. Signed-off-by: Eric Paris --- include/linux/audit.h | 1 + kernel/auditfilter.c | 1 + kernel/auditsc.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 55cb3daaf47..e36aa37c88a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -224,6 +224,7 @@ #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 #define AUDIT_OBJ_UID 109 +#define AUDIT_OBJ_GID 110 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 13e997423dc..f10605c787e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -462,6 +462,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG2: case AUDIT_ARG3: case AUDIT_OBJ_UID: + case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5cf3ecc0151..87b375fb12f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -598,6 +598,18 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_OBJ_GID: + if (name) { + result = audit_comparator(name->gid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->gid, f->op, f->val)) { + ++result; + break; + } + } + } + break; case AUDIT_WATCH: if (name) result = audit_watch_compare(rule->watch, name->ino, name->dev); -- cgit v1.2.3-70-g09d2 From 0a300be6d5be8f66cd96609334710c268d0bfdce Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: remove task argument to audit_set_loginuid The function always deals with current. 
Don't expose an option pretending one can use it for something. You can't. Signed-off-by: Eric Paris --- fs/proc/base.c | 2 +- include/linux/audit.h | 2 +- kernel/auditsc.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb..e3cbebbabeb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1228,7 +1228,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(current, loginuid); + length = audit_set_loginuid(loginuid); if (likely(length == 0)) length = count; diff --git a/include/linux/audit.h b/include/linux/audit.h index e36aa37c88a..7cbd6fe4157 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -489,7 +489,7 @@ static inline void audit_ptrace(struct task_struct *t) extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); -extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); +extern int audit_set_loginuid(uid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) #define audit_get_sessionid(t) ((t)->sessionid) extern void audit_log_task_context(struct audit_buffer *ab); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 87b375fb12f..9d6dd7d869c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2163,16 +2163,16 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified + * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) +int audit_set_loginuid(uid_t loginuid) { + struct task_struct *task = current; unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; -- cgit v1.2.3-70-g09d2 From 633b45454503489209b0d9a45f9e3cd1b852c614 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: only allow tasks to set their loginuid if it is -1 At the moment we allow tasks to set their loginuid if they have CAP_AUDIT_CONTROL. In reality we want tasks to set the loginuid when they log in and have it be impossible to ever reset. We had to make it mutable even after it was once set (with the CAP) because on update an admin might have to restart sshd. Now sshd would get his loginuid and the next user which logged in using ssh would not be able to set his loginuid. Systemd has changed how userspace works and allowed us to make the kernel work the way it should. With systemd users (even admins) are not supposed to restart services directly. The system will restart the service for them. Thus since systemd is going to loginuid==-1, sshd would get -1, and sshd would be allowed to set a new loginuid without special permissions. If an admin in this system were to manually start an sshd he is inserting himself into the system chain of trust and thus, logically, it's his loginuid that should be used! Since we have old systems I make this a Kconfig option.
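A minimal userspace model of the policy split the option introduces (illustrative names only; the real code checks task->loginuid against -1 or falls back to capable(CAP_AUDIT_CONTROL)):

#include <stdio.h>

#define TOY_UNSET ((unsigned int)-1)   /* the -1 "never set" marker */

static unsigned int toy_loginuid = TOY_UNSET;

/* immutable flavor: writable exactly once, then locked for good */
static int toy_set_loginuid(unsigned int loginuid)
{
    if (toy_loginuid != TOY_UNSET)
        return -1;                     /* stands in for -EPERM */
    toy_loginuid = loginuid;
    return 0;
}

int main(void)
{
    printf("login: %d\n", toy_set_loginuid(500)); /* 0, first write wins */
    printf("reset: %d\n", toy_set_loginuid(0));   /* -1, refused */
    return 0;
}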
Signed-off-by: Eric Paris --- fs/proc/base.c | 3 --- init/Kconfig | 14 ++++++++++++++ kernel/auditsc.c | 11 ++++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index e3cbebbabeb..482df23036b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1197,9 +1197,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); diff --git a/init/Kconfig b/init/Kconfig index a075765d5fb..5ad8b775f2a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -372,6 +372,20 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY +config AUDIT_LOGINUID_IMMUTABLE + bool "Make audit loginuid immutable" + depends on AUDIT + help + The config option toggles if a task setting its loginuid requires + CAP_AUDIT_CONTROL or if that task should require no special permissions + but should instead only allow setting its loginuid if it was never + previously set. On systems which use systemd or a similar central + process to restart login services this should be set to true. On older + systems in which an admin would typically have to directly stop and + start processes this should be set to false. Setting this to true allows + one to drop potentially dangerous capabilities from the login tasks, + but may not be backwards compatible with older init systems. + source "kernel/irq/Kconfig" menu "RCU Subsystem" diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9d6dd7d869c..bd084a13c71 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2173,9 +2173,18 @@ int audit_set_loginuid(uid_t loginuid) { struct task_struct *task = current; - unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; + unsigned int sessionid; + +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (task->loginuid != -1) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; -- cgit v1.2.3-70-g09d2 From 4043cde8ecf7f7d880eb1133c201a3d392fd68c3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: do not call audit_getname on error Just a code cleanup really. We don't need to make a function call just for it to return on error. This also makes the VFS function even easier to follow and removes a conditional on a hot path.
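The shape of the cleanup shows up well in a small runnable analogue of getname_flags() (toy names, not the VFS code): the audit hook moves onto the success path, so the callee no longer needs an IS_ERR()/NULL check of its own.

#include <stdio.h>
#include <stdlib.h>

static void toy_audit_getname(const char *name)
{
    printf("audit: %s\n", name);       /* may assume a valid name */
}

static char *toy_getname(const char *user)
{
    char *result = malloc(32);

    if (!result)
        return NULL;                   /* -ENOMEM in the real code */
    if (!user || !*user) {             /* models do_getname() failing */
        free(result);
        return NULL;                   /* error path: audit never called */
    }
    snprintf(result, 32, "%s", user);
    toy_audit_getname(result);         /* success path only */
    return result;
}

int main(void)
{
    free(toy_getname("/etc/passwd"));  /* hook fires */
    toy_getname("");                   /* fails before the hook */
    return 0;
}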
Signed-off-by: Eric Paris --- fs/namei.c | 28 +++++++++++++--------------- kernel/auditsc.c | 3 --- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index c283a1ec008..208c6aa4a98 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - int retval = do_getname(filename, tmp); - - result = tmp; - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(tmp); - result = ERR_PTR(retval); - } + char *result = __getname(); + int retval; + + if (!result) + return ERR_PTR(-ENOMEM); + + retval = do_getname(filename, result); + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(retval); } } audit_getname(result); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bd084a13c71..9161e70a437 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1913,9 +1913,6 @@ void __audit_getname(const char *name) struct audit_context *context = current->audit_context; struct audit_names *n; - if (IS_ERR(name) || !name) - return; - if (!context->in_syscall) { #if AUDIT_DEBUG == 2 printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", -- cgit v1.2.3-70-g09d2 From 02d86a568c6d2d335256864451ac8ce781bc5652 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: allow interfield comparison in audit rules We wish to be able to audit when a uid=500 task accesses a file which is uid=0. Or vice versa. This patch introduces a new audit filter type AUDIT_FIELD_COMPARE which takes as an 'enum' which indicates which fields should be compared. At this point we only define the task->uid vs inode->uid, but other comparisons can be added. Signed-off-by: Eric Paris --- include/linux/audit.h | 4 ++++ kernel/auditfilter.c | 5 ++++- kernel/auditsc.c | 30 +++++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 7cbd6fe4157..838e05fc058 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -182,7 +182,10 @@ * AUDIT_UNUSED_BITS is updated if need be. 
*/ #define AUDIT_UNUSED_BITS 0x07FFFC00 +/* AUDIT_FIELD_COMPARE rule list */ +#define AUDIT_COMPARE_UID_TO_OBJ_UID 1 +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_UID_TO_OBJ_UID /* Rule fields */ /* These are useful when checking the * task structure at task creation time @@ -225,6 +228,7 @@ #define AUDIT_FILETYPE 108 #define AUDIT_OBJ_UID 109 #define AUDIT_OBJ_GID 110 +#define AUDIT_FIELD_COMPARE 111 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f10605c787e..a6c3f1abd20 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -526,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: - err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -543,6 +542,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (f->val & ~S_IFMT) goto exit_free; break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) + goto exit_free; + break; default: goto exit_free; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9161e70a437..8fb2c8e6d62 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -463,6 +463,32 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_field_compare(struct task_struct *tsk, + const struct cred *cred, + struct audit_field *f, + struct audit_context *ctx, + struct audit_names *name) +{ + struct audit_names *n; + + switch (f->val) { + case AUDIT_COMPARE_UID_TO_OBJ_UID: + if (name) { + return audit_comparator(cred->uid, f->op, name->uid); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(cred->uid, f->op, n->uid)) + return 1; + } + } + break; + default: + WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); + return 0; + } + return 0; +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
@@ -693,8 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; + case AUDIT_FIELD_COMPARE: + result = audit_field_compare(tsk, cred, f, ctx, name); + break; } - if (!result) return 0; } -- cgit v1.2.3-70-g09d2 From b34b039324bf081554ee8678f9b8c5d937e5206c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: complex interfield comparison helper Rather than code the same loop over and over implement a helper function which uses some pointer magic to make it generic enough to be used numerous places as we implement more audit interfield comparisons Signed-off-by: Eric Paris --- kernel/auditsc.c | 50 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8fb2c8e6d62..b12cc32fe37 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -463,25 +463,53 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_compare_id(uid_t uid1, + struct audit_names *name, + unsigned long name_offset, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + unsigned long addr; + uid_t uid2; + int rc; + + if (name) { + addr = (unsigned long)name; + addr += name_offset; + + uid2 = *(uid_t *)addr; + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + addr = (unsigned long)n; + addr += name_offset; + + uid2 = *(uid_t *)addr; + + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + } + return 0; +} + static int audit_field_compare(struct task_struct *tsk, const struct cred *cred, struct audit_field *f, struct audit_context *ctx, struct audit_names *name) { - struct audit_names *n; - switch (f->val) { case AUDIT_COMPARE_UID_TO_OBJ_UID: - if (name) { - return audit_comparator(cred->uid, f->op, name->uid); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(cred->uid, f->op, n->uid)) - return 1; - } - } - break; + return audit_compare_id(cred->uid, + name, offsetof(struct audit_names, uid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From c9fe685f7a17a0ee8bf3fbe51e40b1c8b8e65896 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: allow interfield comparison between gid and ogid Allow audit rules to compare the gid of the running task to the gid of the inode in question. 
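In the spirit of the obj_uid example above, such a rule might read: auditctl -a always,exit -S open -C gid!=obj_gid (the -C interfield syntax is assumed here and needs a new enough auditctl). On the kernel side the match reduces to the usual comparator contract, sketched stand-alone below (toy names, not the kernel code):

#include <stdio.h>

enum toy_op { TOY_EQ, TOY_NE };

/* same contract as audit_comparator(left, op, right): 1 on match, else 0 */
static int toy_comparator(unsigned int left, enum toy_op op, unsigned int right)
{
    switch (op) {
    case TOY_EQ:
        return left == right;
    case TOY_NE:
        return left != right;
    }
    return 0;
}

int main(void)
{
    unsigned int task_gid = 100, inode_gid = 0;

    /* AUDIT_COMPARE_GID_TO_OBJ_GID with a != op matches here */
    printf("match=%d\n", toy_comparator(task_gid, TOY_NE, inode_gid));
    return 0;
}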
Signed-off-by: Eric Paris --- include/linux/audit.h | 3 ++- kernel/auditsc.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 838e05fc058..fffbc2176ee 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -184,8 +184,9 @@ /* AUDIT_FIELD_COMPARE rule list */ #define AUDIT_COMPARE_UID_TO_OBJ_UID 1 +#define AUDIT_COMPARE_GID_TO_OBJ_GID 2 -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_UID_TO_OBJ_UID +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_GID_TO_OBJ_GID /* Rule fields */ /* These are useful when checking the * task structure at task creation time diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b12cc32fe37..861c7b9c565 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -474,6 +474,8 @@ static int audit_compare_id(uid_t uid1, uid_t uid2; int rc; + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + if (name) { addr = (unsigned long)name; addr += name_offset; @@ -510,6 +512,10 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->uid, name, offsetof(struct audit_names, uid), f, ctx); + case AUDIT_COMPARE_GID_TO_OBJ_GID: + return audit_compare_id(cred->gid, + name, offsetof(struct audit_names, gid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From 4a6633ed08af5ba67790b4d1adcdeb8ceb55677e Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Tue, 13 Dec 2011 16:17:51 -0800 Subject: audit: implement all object interfield comparisons This completes the matrix of interfield comparisons between uid/gid information for the current task and the uid/gid information for inodes. aka I can audit based on differences between the euid of the process and the uid of fs objects. 
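All of these object comparisons funnel through the one offset-taking helper added earlier in the series; a stand-alone fragment (toy struct, not the kernel's audit_names) mimics the pointer arithmetic:

#include <stddef.h>
#include <stdio.h>

/* toy stand-in for the audit_names fields the helper walks */
struct toy_names {
    unsigned int uid;
    unsigned int gid;
};

/* reads a uid-sized field at a given offset, as audit_compare_id() does */
static unsigned int field_at(const struct toy_names *n, size_t off)
{
    return *(const unsigned int *)((const char *)n + off);
}

int main(void)
{
    struct toy_names n = { .uid = 500, .gid = 100 };

    printf("uid=%u gid=%u\n",
           field_at(&n, offsetof(struct toy_names, uid)),
           field_at(&n, offsetof(struct toy_names, gid)));
    return 0;
}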
Signed-off-by: Peter Moody Signed-off-by: Eric Paris --- include/linux/audit.h | 10 +++++++++- kernel/auditsc.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index fffbc2176ee..67113cb4bc1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -185,8 +185,16 @@ /* AUDIT_FIELD_COMPARE rule list */ #define AUDIT_COMPARE_UID_TO_OBJ_UID 1 #define AUDIT_COMPARE_GID_TO_OBJ_GID 2 +#define AUDIT_COMPARE_EUID_TO_OBJ_UID 3 +#define AUDIT_COMPARE_EGID_TO_OBJ_GID 4 +#define AUDIT_COMPARE_AUID_TO_OBJ_UID 5 +#define AUDIT_COMPARE_SUID_TO_OBJ_UID 6 +#define AUDIT_COMPARE_SGID_TO_OBJ_GID 7 +#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 +#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 + +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_FSGID_TO_OBJ_GID -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_GID_TO_OBJ_GID /* Rule fields */ /* These are useful when checking the * task structure at task creation time diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 861c7b9c565..b8cee462b99 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -508,6 +508,7 @@ static int audit_field_compare(struct task_struct *tsk, struct audit_names *name) { switch (f->val) { + /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: return audit_compare_id(cred->uid, name, offsetof(struct audit_names, uid), @@ -516,6 +517,34 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->gid, name, offsetof(struct audit_names, gid), f, ctx); + case AUDIT_COMPARE_EUID_TO_OBJ_UID: + return audit_compare_id(cred->euid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_EGID_TO_OBJ_GID: + return audit_compare_id(cred->egid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_AUID_TO_OBJ_UID: + return audit_compare_id(tsk->loginuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SUID_TO_OBJ_UID: + return audit_compare_id(cred->suid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SGID_TO_OBJ_GID: + return audit_compare_id(cred->sgid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_FSUID_TO_OBJ_UID: + return audit_compare_id(cred->fsuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_FSGID_TO_OBJ_GID: + return audit_compare_id(cred->fsgid, + name, offsetof(struct audit_names, gid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From 10d68360871657204885371cdf2594412675d2f9 Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Wed, 4 Jan 2012 15:24:31 -0500 Subject: audit: comparison on interprocess fields This allows audit to specify rules in which we compare two fields of a process. Such as is the running process uid != to the running process euid? 
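For instance, AUDIT_COMPARE_UID_TO_EUID with a "!=" operator flags the classic setuid situation. A runnable userspace analogue of that one check (illustration only):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
    uid_t uid = getuid(), euid = geteuid();

    /* the rule matches exactly when these differ, e.g. while a
     * setuid binary is running */
    printf("uid=%u euid=%u differ=%d\n",
           (unsigned int)uid, (unsigned int)euid, uid != euid);
    return 0;
}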
Signed-off-by: Peter Moody Signed-off-by: Eric Paris --- include/linux/audit.h | 24 +++++++++++++++++++++++- kernel/auditsc.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 67113cb4bc1..9ff7a2c48b5 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -193,7 +193,29 @@ #define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 #define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 -#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_FSGID_TO_OBJ_GID +#define AUDIT_COMPARE_UID_TO_AUID 10 +#define AUDIT_COMPARE_UID_TO_EUID 11 +#define AUDIT_COMPARE_UID_TO_FSUID 12 +#define AUDIT_COMPARE_UID_TO_SUID 13 + +#define AUDIT_COMPARE_AUID_TO_FSUID 14 +#define AUDIT_COMPARE_AUID_TO_SUID 15 +#define AUDIT_COMPARE_AUID_TO_EUID 16 + +#define AUDIT_COMPARE_EUID_TO_SUID 17 +#define AUDIT_COMPARE_EUID_TO_FSUID 18 + +#define AUDIT_COMPARE_SUID_TO_FSUID 19 + +#define AUDIT_COMPARE_GID_TO_EGID 20 +#define AUDIT_COMPARE_GID_TO_FSGID 21 +#define AUDIT_COMPARE_GID_TO_SGID 22 + +#define AUDIT_COMPARE_EGID_TO_FSGID 23 +#define AUDIT_COMPARE_EGID_TO_SGID 24 +#define AUDIT_COMPARE_SGID_TO_FSGID 25 + +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_SGID_TO_FSGID /* Rule fields */ /* These are useful when checking the diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b8cee462b99..593237e3654 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -545,6 +545,45 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->fsgid, name, offsetof(struct audit_names, gid), f, ctx); + /* uid comparisons */ + case AUDIT_COMPARE_UID_TO_AUID: + return audit_comparator(cred->uid, f->op, tsk->loginuid); + case AUDIT_COMPARE_UID_TO_EUID: + return audit_comparator(cred->uid, f->op, cred->euid); + case AUDIT_COMPARE_UID_TO_SUID: + return audit_comparator(cred->uid, f->op, cred->suid); + case AUDIT_COMPARE_UID_TO_FSUID: + return audit_comparator(cred->uid, f->op, cred->fsuid); + /* auid comparisons */ + case AUDIT_COMPARE_AUID_TO_EUID: + return audit_comparator(tsk->loginuid, f->op, cred->euid); + case AUDIT_COMPARE_AUID_TO_SUID: + return audit_comparator(tsk->loginuid, f->op, cred->suid); + case AUDIT_COMPARE_AUID_TO_FSUID: + return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + /* euid comparisons */ + case AUDIT_COMPARE_EUID_TO_SUID: + return audit_comparator(cred->euid, f->op, cred->suid); + case AUDIT_COMPARE_EUID_TO_FSUID: + return audit_comparator(cred->euid, f->op, cred->fsuid); + /* suid comparisons */ + case AUDIT_COMPARE_SUID_TO_FSUID: + return audit_comparator(cred->suid, f->op, cred->fsuid); + /* gid comparisons */ + case AUDIT_COMPARE_GID_TO_EGID: + return audit_comparator(cred->gid, f->op, cred->egid); + case AUDIT_COMPARE_GID_TO_SGID: + return audit_comparator(cred->gid, f->op, cred->sgid); + case AUDIT_COMPARE_GID_TO_FSGID: + return audit_comparator(cred->gid, f->op, cred->fsgid); + /* egid comparisons */ + case AUDIT_COMPARE_EGID_TO_SGID: + return audit_comparator(cred->egid, f->op, cred->sgid); + case AUDIT_COMPARE_EGID_TO_FSGID: + return audit_comparator(cred->egid, f->op, cred->fsgid); + /* sgid comparison */ + case AUDIT_COMPARE_SGID_TO_FSGID: + return audit_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. 
Report as a bug\n"); return 0; -- cgit v1.2.3-70-g09d2 From 5afb8a3f96573f7ea018abb768f5b6ebe1a6c1a4 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 20 Dec 2011 18:39:41 -0500 Subject: audit: fix signedness bug in audit_log_execve_info() In the loop, a size_t "len" is used to hold the return value of audit_log_single_execve_arg(), which returns -1 on error. In that case the error handling (len <= 0) will be bypassed since "len" is unsigned, and the loop continues with (p += len) being wrapped. Change the type of "len" to signed int to fix the error handling. size_t len; ... for (...) { len = audit_log_single_execve_arg(...); if (len <= 0) break; p += len; } Signed-off-by: Xi Wang Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 593237e3654..86584ecb103 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1362,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i; - size_t len, len_sent = 0; + int i, len; + size_t len_sent = 0; const char __user *p; char *buf; -- cgit v1.2.3-70-g09d2 From c158a35c8a681cf68d36f22f058f9f5466386c71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jan 2012 14:07:10 -0800 Subject: audit: no leading space in audit_log_d_path prefix audit_log_d_path() injects an additional space before the prefix, which serves no purpose and doesn't mix well with other audit_log*() functions that do not sneak extra characters into the log. Signed-off-by: Kees Cook Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- kernel/auditsc.c | 6 +++--- security/lsm_audit.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 00efe4758c8..705c25a70bf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, " %s", prefix); + audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 86584ecb103..caaea6e944f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1171,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, "exe=", + audit_log_d_path(ab, " exe=", &vma->vm_file->f_path); break; } @@ -1540,7 +1540,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1725,7 +1725,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_d_path(ab, " cwd=", &context->pwd); audit_log_end(ab); } } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index fc41b7cccb5..293b8c45b1d 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -232,7 +232,7 @@ static void dump_common_audit_data(struct audit_buffer 
*ab, case LSM_AUDIT_DATA_PATH: { struct inode *inode; - audit_log_d_path(ab, "path=", &a->u.path); + audit_log_d_path(ab, " path=", &a->u.path); inode = a->u.path.dentry->d_inode; if (inode) { @@ -318,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, .dentry = u->dentry, .mnt = u->mnt }; - audit_log_d_path(ab, "path=", &path); + audit_log_d_path(ab, " path=", &path); break; } if (!u->addr) -- cgit v1.2.3-70-g09d2 From 160cb5a97daef0cb894685d84c9d4700bb7cccb4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 19 Jan 2012 23:23:10 +0100 Subject: PM / Hibernate: Correct additional pages number calculation The struct bm_block is allocated by chain_alloc(), so it should be counted in LINKED_PAGE_DATA_SIZE units rather than PAGE_SIZE. Signed-off-by: Namhyung Kim Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1cf88900ec4..6a768e53700 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); return 2 * res; } -- cgit v1.2.3-70-g09d2 From fd45c15f13e754f3c106427e857310f3e0813951 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2012 10:12:45 +0900 Subject: perf: Don't call release_callchain_buffers() if allocation fails When alloc_callchain_buffers() fails, it frees all of the entries before returning. In addition, calling release_callchain_buffers() afterwards would cause a NULL pointer dereference, since callchain_cpu_entries is not set. Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1327021966-27688-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 057e24b665c..6581a040f39 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -115,8 +115,6 @@ int get_callchain_buffers(void) } err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); exit: mutex_unlock(&callchain_mutex); -- cgit v1.2.3-70-g09d2 From 46cd6a7f680d14f6f80ede9f04aeb70fa83bd266 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2012 10:12:46 +0900 Subject: perf: Call perf_cgroup_event_time() directly perf_event_time() will call perf_cgroup_event_time() if @event is a cgroup event. Just call it directly and avoid the extra check. Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1327021966-27688-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a0..32b48c88971 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event) * here.
*/ if (is_cgroup_event(event)) - run_end = perf_event_time(event); + run_end = perf_cgroup_event_time(event); else if (ctx->is_active) run_end = ctx->time; else -- cgit v1.2.3-70-g09d2 From 0e90b31f4ba77027a7c21cbfc66404df0851ca21 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 20 Jan 2012 04:57:16 +0000 Subject: net: introduce res_counter_charge_nofail() for socket allocations There is a case in __sk_mem_schedule() where an allocation is beyond the maximum, and yet we are allowed to proceed. It happens under the following condition: sk->sk_wmem_queued + size >= sk->sk_sndbuf The network code won't revert the allocation in this case, meaning that at some point later it'll try to do it. Since this is never communicated to the underlying res_counter code, there is an imbalance in the res_counter uncharge operation. I see two ways of fixing this: 1) storing the information about those allocations somewhere in memcg, and then deducting from that first, before we start draining the res_counter, 2) providing a slightly different allocation function for the res_counter, that matches the original behavior of the network code more closely. I decided to go for #2 here, believing it to be more elegant, since #1 would require us to do basically that, but in a more obscure way. Signed-off-by: Glauber Costa Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michal Hocko CC: Tejun Heo CC: Li Zefan CC: Laurent Chavey Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/res_counter.h | 6 ++++++ include/net/sock.h | 10 ++++------ kernel/res_counter.c | 25 +++++++++++++++++++++++++ net/core/sock.c | 4 ++-- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index d06d014afda..da81af086ea 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -109,12 +109,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); * * returns 0 on success and <0 if the counter->usage will exceed the * counter->limit _locked call expects the counter->lock to be taken + * + * charge_nofail works the same, except that it charges the resource + * counter unconditionally, and returns < 0 if the after the current + * charge we are over limit.
*/ int __must_check res_counter_charge_locked(struct res_counter *counter, unsigned long val); int __must_check res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at); +int __must_check res_counter_charge_nofail(struct res_counter *counter, + unsigned long val, struct res_counter **limit_fail_at); /* * uncharge - tell that some portion of the resource is released diff --git a/include/net/sock.h b/include/net/sock.h index 0e7a9b05f92..4c69ac165e6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1008,9 +1008,8 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot, struct res_counter *fail; int ret; - ret = res_counter_charge(prot->memory_allocated, - amt << PAGE_SHIFT, &fail); - + ret = res_counter_charge_nofail(prot->memory_allocated, + amt << PAGE_SHIFT, &fail); if (ret < 0) *parent_status = OVER_LIMIT; } @@ -1054,12 +1053,11 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) } static inline void -sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status) +sk_memory_allocated_sub(struct sock *sk, int amt) { struct proto *prot = sk->sk_prot; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - parent_status != OVER_LIMIT) /* Otherwise was uncharged already */ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) memcg_memory_allocated_sub(sk->sk_cgrp, amt); atomic_long_sub(amt, prot->memory_allocated); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 6d269cce7aa..d508363858b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -66,6 +66,31 @@ done: return ret; } +int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + int ret, r; + unsigned long flags; + struct res_counter *c; + + r = ret = 0; + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + r = res_counter_charge_locked(c, val); + if (r) + c->usage += val; + spin_unlock(&c->lock); + if (r < 0 && ret == 0) { + *limit_fail_at = c; + ret = r; + } + } + local_irq_restore(flags); + + return ret; +} void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f9..3e81fd2e3c7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1827,7 +1827,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt, parent_status); + sk_memory_allocated_sub(sk, amt); return 0; } @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); void __sk_mem_reclaim(struct sock *sk) { sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (sk_under_memory_pressure(sk) && -- cgit v1.2.3-70-g09d2 From d496aab567e7e52b3e974c9192a5de6e77dce32c Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 20 Jan 2012 14:34:04 -0800 Subject: kprobes: initialize before using a hlist Commit ef53d9c5e ("kprobes: improve kretprobe scalability with hashed locking") introduced a bug where we can potentially leak kretprobe_instances since we initialize a hlist head after having used it. Initialize the hlist head before using it. 
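The ordering bug is easy to reproduce outside the kernel. Below is a hedged sketch using a hand-rolled list head (not the kernel's hlist API) showing why moving the initialization above the first use matters: until the head is initialized, it is stack garbage, and anything linked onto it is lost or corrupted.

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int v; };
struct list_head_sketch { struct node *first; };

static void init_head(struct list_head_sketch *h) { h->first = NULL; }

static void recycle(struct list_head_sketch *h, struct node *n)
{
    n->next = h->first;          /* reads h->first: it must be valid here */
    h->first = n;
}

int main(void)
{
    struct list_head_sketch empty_rp;   /* uninitialized, like the old code path */
    struct node a = { .next = NULL, .v = 1 };

    init_head(&empty_rp);        /* the fix: initialize BEFORE the first recycle() */
    recycle(&empty_rp, &a);

    for (struct node *n = empty_rp.first; n; n = n->next)
        printf("recycled instance %d\n", n->v);
    return 0;
}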
Reported by: Jim Keniston Acked-by: Jim Keniston Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Cc: Srinivasa D S Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 95dd7212e61..29f5b65bee2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); @@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); -- cgit v1.2.3-70-g09d2 From 42ae610c1a820ddecb80943d4ccfc936f7772535 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:02:24 -0800 Subject: kernel-doc: fix new warnings in auditsc.c Fix new kernel-doc warnings in auditsc.c: Warning(kernel/auditsc.c:1875): No description found for parameter 'success' Warning(kernel/auditsc.c:1875): No description found for parameter 'return_code' Warning(kernel/auditsc.c:1875): Excess function parameter 'pt_regs' description in '__audit_syscall_exit' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index caaea6e944f..af1de0f34ea 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1863,11 +1863,12 @@ void __audit_syscall_entry(int arch, int major, /** * audit_syscall_exit - deallocate audit context after a system call - * @pt_regs: syscall registers + * @success: success value of the syscall + * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel write an audit + * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). 
*/ -- cgit v1.2.3-70-g09d2 From fa757281a08799fd6c0f7ec6f111d1cd66afc97b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:03:13 -0800 Subject: kernel-doc: fix kernel-doc warnings in sched Fix new kernel-doc notation warnings: Warning(include/linux/sched.h:2094): No description found for parameter 'p' Warning(include/linux/sched.h:2094): Excess function parameter 'tsk' description in 'is_idle_task' Warning(kernel/sched/cpupri.c:139): No description found for parameter 'newpri' Warning(kernel/sched/cpupri.c:139): Excess function parameter 'pri' description in 'cpupri_set' Warning(kernel/sched/cpupri.c:208): Excess function parameter 'bootmem' description in 'cpupri_init' Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/sched/cpupri.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4032ec1cf83..513f5245987 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2088,7 +2088,7 @@ extern int sched_setscheduler_nocheck(struct task_struct *, int, extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? - * @tsk: the task in question. + * @p: the task in question. */ static inline bool is_idle_task(struct task_struct *p) { diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b0d798eaf13..d72586fdf66 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the cpu priority setting * @cp: The cpupri context * @cpu: The target cpu - * @pri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * * @@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context - * @bootmem: true if allocations need to use bootmem * * Returns: -ENOMEM if memory fails. */ -- cgit v1.2.3-70-g09d2 From 4ca9b72b71f10147bd21969c1805f5b2c4ca7b7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Jan 2012 11:50:51 +0100 Subject: sched: Fix rq->nr_uninterruptible update race KOSAKI Motohiro noticed the following race: > CPU0 CPU1 > -------------------------------------------------------- > deactivate_task() > task->state = TASK_UNINTERRUPTIBLE; > activate_task() > rq->nr_uninterruptible--; > > schedule() > deactivate_task() > rq->nr_uninterruptible++; > Kosaki-San's scenario is possible when CPU0 runs __sched_setscheduler() against CPU1's current @task. __sched_setscheduler() does a dequeue/enqueue in order to move the task to its new queue (position) to reflect the newly provided scheduling parameters. However, it should be completely invariant to nr_uninterruptible accounting: sched_setscheduler() doesn't affect readiness to run, merely the policy on when to run. So convert the inappropriate activate/deactivate_task usage to enqueue/dequeue_task, which avoids the nr_uninterruptible accounting. Also convert the two other sites, __migrate_task() and normalize_task(), that still use activate/deactivate_task. These sites aren't really a problem since __migrate_task() will only be called on a non-running task (and is therefore immune to the described problem) and normalize_task() isn't ever used on regular systems.
Also remove the comments from activate/deactivate_task since they're misleading at best. Reported-by: KOSAKI Motohiro Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327486224.2614.45.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263..e067df1fd01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -723,9 +723,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) p->sched_class->dequeue_task(rq, p, flags); } -/* - * activate_task - move a task to the runqueue. - */ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -734,9 +731,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); } -/* - * deactivate_task - remove a task from the runqueue. - */ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -4134,7 +4128,7 @@ recheck: on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4147,7 +4141,7 @@ recheck: if (running) p->sched_class->set_curr_task(rq); if (on_rq) - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4998,9 +4992,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (p->on_rq) { - deactivate_task(rq_src, p, 0); + dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - activate_task(rq_dest, p, 0); + enqueue_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } done: @@ -7032,10 +7026,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p) on_rq = p->on_rq; if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); resched_task(rq->curr); } -- cgit v1.2.3-70-g09d2 From db7e527da41560f597ccdc4417cefa6b7657c0c0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 08:58:16 +0100 Subject: sched/s390: Fix compile error in sched/core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 029632fbb7b7c9d85063cc9eb470de6c54873df3 ("sched: Make separate sched*.c translation units") removed the include of asm/mutex.h from sched.c. 
This breaks the combination of: CONFIG_MUTEX_SPIN_ON_OWNER=yes CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX=yes like s390 without mutex debugging: CC kernel/sched/core.o kernel/sched/core.c: In function ‘mutex_spin_on_owner’: kernel/sched/core.c:3287: error: implicit declaration of function ‘arch_mutex_cpu_relax’ Let's re-add the include to kernel/sched/core.c. Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1326268696-30904-1-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e067df1fd01..5255c9d2e05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif -- cgit v1.2.3-70-g09d2 From 71325960d16cd68ea0e22a8da15b2495b0f363f7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jan 2012 18:28:57 -0800 Subject: sched/nohz: Fix nohz cpu idle load balancing state with cpu hotplug With the recent nohz scheduler changes, rq's nohz flag 'NOHZ_TICK_STOPPED' and its associated state doesn't get cleared immediately after the cpu exits idle. This gets cleared as part of the next tick seen on that cpu. For the cpu offline support, we need to clear this state manually. Fix it by registering a cpu notifier, which clears the nohz idle load balance state for this rq explicitly during the CPU_DYING notification. There won't be any nohz updates for that cpu after the CPU_DYING notification. But let's be extra paranoid and skip updating the nohz state in select_nohz_load_balancer() if the cpu is not in the active state anymore. Reported-by: Srivatsa S. Bhat Reviewed-and-tested-by: Srivatsa S. Bhat Tested-by: Sergey Senozhatsky Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327026538.16150.40.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84adb2d66cb..7c6414fc669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4866,6 +4866,15 @@ static void nohz_balancer_kick(int cpu) return; } +static inline void clear_nohz_tick_stopped(int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; @@ -4904,6 +4913,12 @@ void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); + /* + * If this cpu is going down, then nothing needs to be done. + */ + if (!cpu_active(cpu)) + return; + if (stop_tick) { if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -4914,6 +4929,18 @@ void select_nohz_load_balancer(int stop_tick) } return; } + +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DYING: + clear_nohz_tick_stopped(smp_processor_id()); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} #endif static DEFINE_SPINLOCK(balancing); @@ -5070,11 +5097,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats.
*/ set_cpu_sd_state_busy(); - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + clear_nohz_tick_stopped(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -5590,6 +5613,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ -- cgit v1.2.3-70-g09d2 From b0f4c4b32c8e3aa0d44fc4dd6c40a9a9a8d66b63 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 26 Jan 2012 08:55:34 -0500 Subject: bugs, x86: Fix printk levels for panic, softlockups and stack dumps rsyslog will display KERN_EMERG messages on a connected terminal. However, these messages are useless/undecipherable for a general user. For example, after a softlockup we get: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Stack: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Call Trace: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Code: ff ff a8 08 75 25 31 d2 48 8d 86 38 e0 ff ff 48 89 d1 0f 01 c8 0f ae f0 48 8b 86 38 e0 ff ff a8 08 75 08 b1 01 4c 89 e0 0f 01 c9 ea 69 dd ff 4c 29 e8 48 89 c7 e8 0f bc da ff 49 89 c4 49 89 This happens because the printk levels for these messages are incorrect. Only an informational message should be displayed on a terminal. I modified the printk levels for various messages in the kernel and tested the output by using the drivers/misc/lkdtm.c kernel modules (ie, softlockups, panics, hard lockups, etc.) and confirmed that the console output was still the same and that the output to the terminals was correct. For example, in the case of a softlockup we now see the much more informative: Message from syslogd@intel-s3e37-04 at Jan 25 10:18:06 ... BUG: soft lockup - CPU4 stuck for 60s! instead of the above confusing messages. AFAICT, the messages no longer have to be KERN_EMERG. In the most important case of a panic we set console_verbose(). As for the other less severe cases the correct data is output to the console and /var/log/messages. Successfully tested by me using the drivers/misc/lkdtm.c module. 
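A user-space analogue of the level policy, using syslog(3): most syslog daemons broadcast LOG_EMERG to logged-in terminals, so only the one actionable headline should go out at that level, while the supporting detail is logged at a lower priority. This is a sketch of the convention, not the kernel patch itself; it uses the LOG_USER facility (LOG_KERN is reserved for the kernel), and note that running it as root may actually write to terminals on many systems.

#include <syslog.h>

int main(void)
{
    openlog("lockup-demo", LOG_PID, LOG_USER);

    /* headline: terse, meaningful to an operator at a terminal */
    syslog(LOG_EMERG, "BUG: soft lockup - CPU#4 stuck for 60s! [demo:1234]");

    /* detail: stack/code dumps belong at a non-broadcast priority */
    syslog(LOG_INFO, "Stack: ... Call Trace: ... Code: ...");

    closelog();
    return 0;
}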
Signed-off-by: Prarit Bhargava Cc: dzickus@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1327586134-11926-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 3 ++- arch/x86/kernel/dumpstack_64.c | 6 +++--- arch/x86/mm/fault.c | 4 ++-- kernel/watchdog.c | 2 +- lib/bug.c | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775f..4025fe4f928 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) unsigned short ss; unsigned long sp; #endif - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284b..42b2bca0b72 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + printk(KERN_DEFAULT "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_EMERG); + 0, KERN_DEFAULT); - printk(KERN_EMERG "Code: "); + printk(KERN_DEFAULT "Code: "); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d74824a708..f0b4caf85c1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -673,7 +673,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, stackend = end_of_stack(tsk); if (tsk != &init_task && *stackend != STACK_END_MAGIC) - printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -684,7 +684,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, sig = 0; /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); + printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f5..d117262deba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); diff --git a/lib/bug.c b/lib/bug.c index 19552096d16..a28c1415357 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -169,7 +169,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_EMERG "------------[ cut here ]------------\n"); + printk(KERN_DEFAULT "------------[ cut here ]------------\n"); if (file) printk(KERN_CRIT "kernel BUG at %s:%u!\n", -- cgit v1.2.3-70-g09d2 From b5740f4b2cb3503b436925eb2242bc3d75cd3dfe Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 17 Jan 2012 17:40:31 +0900 Subject: sched: Fix ancient race in do_exit() try_to_wake_up() has a problem which may change status from TASK_DEAD to TASK_RUNNING in race condition with SMI or guest environment of virtual machine. As a result, exited task is scheduled() again and panic occurs. Here is the sequence how it occurs: ----------------------------------+----------------------------- | CPU A | CPU B ----------------------------------+----------------------------- TASK A calls exit().... do_exit() exit_mm() down_read(mm->mmap_sem); rwsem_down_failed_common() set TASK_UNINTERRUPTIBLE set waiter.task <= task A list_add to sem->wait_list : raw_spin_unlock_irq() (I/O interruption occured) __rwsem_do_wake(mmap_sem) list_del(&waiter->list); waiter->task = NULL wake_up_process(task A) try_to_wake_up() (task is still TASK_UNINTERRUPTIBLE) p->on_rq is still 1.) ttwu_do_wakeup() (*A) : (I/O interruption handler finished) if (!waiter.task) schedule() is not called due to waiter.task is NULL. tsk->state = TASK_RUNNING : check_preempt_curr(); : task->state = TASK_DEAD (*B) <--- set TASK_RUNNING (*C) schedule() (exit task is running again) BUG_ON() is called! -------------------------------------------------------- The execution time between (*A) and (*B) is usually very short, because the interruption is disabled, and setting TASK_RUNNING at (*C) must be executed before setting TASK_DEAD. HOWEVER, if SMI is interrupted between (*A) and (*B), (*C) is able to execute AFTER setting TASK_DEAD! Then, exited task is scheduled again, and BUG_ON() is called.... If the system works on guest system of virtual machine, the time between (*A) and (*B) may be also long due to scheduling of hypervisor, and same phenomenon can occur. By this patch, do_exit() waits for releasing task->pi_lock which is used in try_to_wake_up(). It guarantees the task becomes TASK_DEAD after waking up. Signed-off-by: Yasunori Goto Acked-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20120117174031.3118.E1E9C6FF@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170..4b4042f9bc6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1038,6 +1038,22 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. 
+ * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ -- cgit v1.2.3-70-g09d2 From e050e3f0a71bf7dc2c148b35caff0234decc8198 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 26 Jan 2012 17:03:19 +0100 Subject: perf: Fix broken interrupt rate throttling This patch fixes the sampling interrupt throttling mechanism. It was broken in v3.2. Events were not being unthrottled. The unthrottling mechanism required that events be checked at each timer tick. This patch solves this problem and also separates: - unthrottling - multiplexing - frequency-mode period adjustments Not all of them need to be executed at each timer tick. This third version of the patch is based on my original patch + PeterZ proposal (https://lkml.org/lkml/2012/1/7/87). At each timer tick, for each context: - if the current CPU has throttled events, we unthrottle events - if context has frequency-based events, we adjust sampling periods - if we have reached the jiffies interval, we multiplex (rotate) We decoupled rotation (multiplexing) from frequency-mode sampling period adjustments. They should not necessarily happen at the same rate. Multiplexing is subject to jiffies_interval (currently at 1 but could be higher once the tunable is exposed via sysfs). We have grouped frequency-mode adjustment and unthrottling into the same routine to minimize code duplication. When throttled while in frequency mode, we scan the events only once. We have fixed the threshold enforcement code in __perf_event_overflow(). There was a bug whereby it would allow more than the authorized rate because an increment of hwc->interrupts was not executed at the right place. The patch was tested with low sampling limit (2000) and fixed periods, frequency mode, overcommitted PMU. On a 2.1GHz AMD CPU: $ cat /proc/sys/kernel/perf_event_max_sample_rate 2000 We set a rate of 3000 samples/sec (2.1GHz/3000 = 700000): $ perf record -e cycles,cycles -c 700000 noploop 10 $ perf report -D | tail -21 Aggregated stats: TOTAL events: 80086 MMAP events: 88 COMM events: 2 EXIT events: 4 THROTTLE events: 19996 UNTHROTTLE events: 19996 SAMPLE events: 40000 cycles stats: TOTAL events: 40006 MMAP events: 5 COMM events: 1 EXIT events: 4 THROTTLE events: 9998 UNTHROTTLE events: 9998 SAMPLE events: 20000 cycles stats: TOTAL events: 39996 THROTTLE events: 9998 UNTHROTTLE events: 9998 SAMPLE events: 20000 For 10s, the cap is 2x2000x10 = 40000 samples. We get exactly that: 20000 samples/event. 
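The per-cpu sequence trick at the heart of the fix can be modeled in a few lines of plain C. This is a toy single-cpu model under stated assumptions (names and the tick cadence are simplified), not the kernel code: a global sequence number stands in for perf_throttled_seq, and an event's interrupt counter restarts whenever the event first overflows in a new tick period, so the max_samples_per_tick cap is enforced per tick and throttled events are restarted at the next tick.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SAMPLES_PER_TICK 3

static uint64_t throttled_seq = 1;  /* bumped once per timer tick */
static int      throttled_count;    /* events throttled since last tick */

struct event {
    uint64_t interrupts_seq;         /* last tick period this event fired in */
    uint64_t interrupts;
    bool     throttled;
};

static bool event_overflow(struct event *e)
{
    if (e->interrupts_seq != throttled_seq) {
        e->interrupts_seq = throttled_seq;   /* first overflow this tick */
        e->interrupts = 1;
    } else if (++e->interrupts >= MAX_SAMPLES_PER_TICK) {
        throttled_count++;
        e->throttled = true;                 /* pmu->stop() in the kernel */
        return true;
    }
    return false;
}

static void timer_tick(struct event *e)
{
    throttled_seq++;
    if (throttled_count) {                   /* xchg(&perf_throttled_count, 0) */
        throttled_count = 0;
        e->throttled = false;                /* pmu->start() in the kernel */
    }
}

int main(void)
{
    struct event ev = { 0 };

    for (int i = 0; i < 5; i++) {
        if (ev.throttled)
            printf("overflow %d: dropped, event throttled\n", i);
        else
            printf("overflow %d: throttle now = %d\n", i, event_overflow(&ev));
    }
    timer_tick(&ev);
    printf("after tick: throttled = %d\n", ev.throttled);
    return 0;
}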
Signed-off-by: Stephane Eranian Cc: # v3.2+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120126160319.GA5655@quad Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 104 ++++++++++++++++++++++++++++----------------- 2 files changed, 67 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb..abb2776be1b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -587,6 +587,7 @@ struct hw_perf_event { u64 sample_period; u64 last_period; local64_t period_left; + u64 interrupts_seq; u64 interrupts; u64 freq_time_stamp; diff --git a/kernel/events/core.c b/kernel/events/core.c index 32b48c88971..ba36013cfb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2300,6 +2300,9 @@ do { \ return div64_u64(dividend, divisor); } +static DEFINE_PER_CPU(int, perf_throttled_count); +static DEFINE_PER_CPU(u64, perf_throttled_seq); + static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -2325,16 +2328,29 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. + */ +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, + int needs_unthr) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, now; + u64 now, period = TICK_NSEC; s64 delta; - if (!ctx->nr_freq) + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || needs_unthr)) return; + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2344,13 +2360,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) hwc = &event->hw; - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { + if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } @@ -2358,14 +2369,26 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (!event->attr.freq || !event->attr.sample_freq) continue; - event->pmu->read(event); + /* + * stop the event and update event->count + */ + event->pmu->stop(event, PERF_EF_UPDATE); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; + /* + * restart the event + * reload only if value has changed + */ if (delta > 0) perf_adjust_period(event, period, delta); + + event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0); } + + raw_spin_unlock(&ctx->lock); } /* @@ -2388,16 +2411,13 @@ static void rotate_ctx(struct perf_event_context *ctx) */ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1, freq = 0; + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; - if (cpuctx->ctx.nr_freq) - freq = 1; } ctx = cpuctx->task_ctx; @@ -2405,37 +2425,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; - if (ctx->nr_freq) - freq = 1; } - if (!rotate && !freq) + if (!rotate) goto done; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (freq) { - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - } - - if (rotate) { - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); - } + perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - done: if (remove) list_del_init(&cpuctx->rotation_list); @@ -2445,10 +2454,22 @@ void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); struct perf_cpu_context *cpuctx, *tmp; + struct perf_event_context *ctx; + int throttled; WARN_ON(!irqs_disabled()); + __this_cpu_inc(perf_throttled_seq); + throttled = __this_cpu_xchg(perf_throttled_count, 0); + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + ctx = &cpuctx->ctx; + perf_adjust_freq_unthr_context(ctx, throttled); + + ctx = cpuctx->task_ctx; + if (ctx) + perf_adjust_freq_unthr_context(ctx, throttled); + if (cpuctx->jiffies_interval == 1 || !(jiffies % cpuctx->jiffies_interval)) perf_rotate_context(cpuctx); @@ -4509,6 +4530,7 @@ static int __perf_event_overflow(struct perf_event *event, { int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; + u64 seq; int ret = 0; /* @@ -4518,14 +4540,20 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(!is_sampling_event(event))) return 0; - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { + seq = __this_cpu_read(perf_throttled_seq); + if (seq != hwc->interrupts_seq) { + hwc->interrupts_seq = seq; + hwc->interrupts = 1; + } else { + hwc->interrupts++; + if (unlikely(throttle + && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } - } else - hwc->interrupts++; + } if (event->attr.freq) { u64 now = perf_clock(); -- cgit v1.2.3-70-g09d2 From cb297a3e433dbdcf7ad81e0564e7b804c941ff0d Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 5 Jan 2012 20:00:19 +0900 Subject: sched/rt: Fix task stack corruption under __ARCH_WANT_INTERRUPTS_ON_CTXSW This issue happens under the following conditions: 1. preemption is off 2. __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined 3. RT scheduling class 4. SMP system Sequence is as follows: 1.suppose current task is A. 
start schedule() 2. task A is enqueued as a pushable task at the entry of schedule() __schedule prev = rq->curr; ... put_prev_task put_prev_task_rt enqueue_pushable_task 3. pick the task B as the next task. next = pick_next_task(rq); 4. rq->curr is set to task B and context_switch is started. rq->curr = next; 5. At the entry of context_switch, release this cpu's rq->lock. context_switch prepare_task_switch prepare_lock_switch raw_spin_unlock_irq(&rq->lock); 6. Shortly after rq->lock is released, an interrupt occurs and IRQ context starts 7. try_to_wake_up(), which is called by the ISR, acquires rq->lock try_to_wake_up ttwu_remote rq = __task_rq_lock(p) ttwu_do_wakeup(rq, p, wake_flags); task_woken_rt 8. push_rt_task picks the task A which was enqueued before. task_woken_rt push_rt_tasks(rq) next_task = pick_next_pushable_task(rq) 9. At find_lock_lowest_rq(), if double_lock_balance() returns 0, lowest_rq can be the remote rq. (But if preemption is on, double_lock_balance() always returns 1 and this doesn't happen.) push_rt_task find_lock_lowest_rq if (double_lock_balance(rq, lowest_rq)).. 10. find_lock_lowest_rq() returns the available rq. task A is migrated to the remote cpu/rq. push_rt_task ... deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); 11. But task A is still in IRQ context on this cpu. So task A is scheduled by two cpus at the same time until it returns from IRQ. Task A's stack is corrupted. To fix it, don't migrate an RT task if it's still running. Signed-off-by: Chanho Min Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/CAOAMb1BHA=5fm7KTewYyke6u-8DP0iUuJMpgQw54vNeXFsGpoQ@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3640ebbb466..f42ae7fb5ec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,6 +1587,11 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + if (unlikely(task_running(rq, next_task))) + return 0; +#endif + retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); -- cgit v1.2.3-70-g09d2 From 181e9bdef37bfcaa41f3ab6c948a2a0d60a268b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jan 2012 20:35:52 +0100 Subject: PM / Hibernate: Fix s2disk regression related to freezing workqueues Commit 2aede851ddf08666f68ffc17be446420e9d2a056 (PM / Hibernate: Freeze kernel threads after preallocating memory) introduced a mechanism by which kernel threads were frozen after the preallocation of hibernate image memory to avoid problems with frozen kernel threads not responding to memory freeing requests. However, it overlooked the s2disk code path in which the SNAPSHOT_CREATE_IMAGE ioctl was run directly after SNAPSHOT_FREE, which caused freeze_workqueues_begin() to BUG(), because it saw that workqueues had already been frozen. Although in principle this issue might be addressed by removing the relevant BUG_ON() from freeze_workqueues_begin(), that would reintroduce the very problem that commit 2aede851ddf08666f68ffc17be4 attempted to avoid into that particular code path. For this reason, to fix the issue at hand, introduce thaw_kernel_threads() and make the SNAPSHOT_FREE ioctl execute it. Special thanks to Srivatsa S. Bhat for detailed analysis of the problem. Reported-and-tested-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S.
Bhat Cc: stable@kernel.org --- include/linux/freezer.h | 2 ++ kernel/power/process.c | 19 +++++++++++++++++++ kernel/power/user.c | 9 +++++++++ 3 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0ab54e16a91..d09af4b67cf 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -39,6 +39,7 @@ extern bool __refrigerator(bool check_kthr_stop); extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); +extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { @@ -174,6 +175,7 @@ static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} +static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 77274c9ba2f..eeca00311f3 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -188,3 +188,22 @@ void thaw_processes(void) printk("done.\n"); } +void thaw_kernel_threads(void) +{ + struct task_struct *g, *p; + + pm_nosig_freezing = false; + printk("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + __thaw_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + schedule(); + printk("done.\n"); +} diff --git a/kernel/power/user.c b/kernel/power/user.c index 6b1ab7a8852..e5a21a85730 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -274,6 +274,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = 0; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). + */ + thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: -- cgit v1.2.3-70-g09d2 From fe9161db2e6053da21e4649d77bbefaf3030b11d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 1 Feb 2012 22:16:36 +0100 Subject: PM / Hibernate: Thaw kernel threads in SNAPSHOT_CREATE_IMAGE ioctl path In the SNAPSHOT_CREATE_IMAGE ioctl, if the call to hibernation_snapshot() fails, the frozen tasks are not thawed. And in the case of success, if we happen to exit due to a successful freezer test, all tasks (including those of userspace) are thawed, whereas actually we should have thawed only the kernel threads at that point. Fix both these issues. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki Cc: stable@vger.kernel.org --- kernel/power/user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index e5a21a85730..3e100075b13 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; -- cgit v1.2.3-70-g09d2 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 -------------------- include/linux/sched.h | 6 ++++++ kernel/fork.c | 20 ++++++++++++++++++++ mm/process_vm_access.c | 23 +++++++++-------------- 4 files changed, 35 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d9512bd03e6..d4548dd49b0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { return mm_access(task, PTRACE_MODE_READ); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2234985a5e6..7d379a6bfd8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2259,6 +2259,12 @@ static inline void mmdrop(struct mm_struct * mm) extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); +/* + * Grab a reference to a task's mm, if it is not already going away + * and ptrace_may_access with the mode parameter passed to it + * succeeds. 
+ */ +extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c..1b2ef3c23ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -647,6 +647,26 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, mode)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce10..c20ff48994c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto free_proc_pages; } - task_lock(task); - if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { - task_unlock(task); - rc = -EPERM; - goto put_task_struct; - } - mm = task->mm; - - if (!mm || (task->flags & PF_KTHREAD)) { - task_unlock(task); - rc = -EINVAL; + mm = mm_access(task, PTRACE_MODE_ATTACH); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more a + * appropriate error code for process_vw_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; goto put_task_struct; } - atomic_inc(&mm->mm_users); - task_unlock(task); - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, -- cgit v1.2.3-70-g09d2 From 55ca6140e9bb307efc97a9301a4f501de02a6fd6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 3 Feb 2012 15:37:16 -0800 Subject: kprobes: fix a memory leak in function pre_handler_kretprobe() In function pre_handler_kretprobe(), the allocated kretprobe_instance object will get leaked if the entry_handler callback returns non-zero. This may cause all the preallocated kretprobe_instance objects exhausted. This issue can be reproduced by changing samples/kprobes/kretprobe_example.c to probe "mutex_unlock". And the fix is straightforward: just put the allocated kretprobe_instance object back onto the free_instances list. [akpm@linux-foundation.org: use raw_spin_lock/unlock] Signed-off-by: Jiang Liu Acked-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: "David S. 
Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29f5b65bee2..9788c0ec6f4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3-70-g09d2 From 379e0be812ab8a2a351e784b0c987788f5123090 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 3 Feb 2012 22:22:41 +0100 Subject: PM / Freezer: Thaw only kernel threads if freezing of kernel threads fails If freezing of kernel threads fails, we are expected to automatically thaw tasks in the error recovery path. However, at times, we encounter situations in which we would like the automatic error recovery path to thaw only the kernel threads, because we want to be able to do some more cleanup before we thaw userspace. Something like: error = freeze_kernel_threads(); if (error) { /* Do some cleanup */ /* Only then thaw userspace tasks*/ thaw_processes(); } An example of such a situation is where we freeze/thaw filesystems during suspend/hibernation. There, if freezing of kernel threads fails, we would like to thaw the frozen filesystems before thawing the userspace tasks. So, modify freeze_kernel_threads() to thaw only kernel threads in case of freezing failure. And change suspend_freeze_processes() accordingly. (At the same time, let us also get rid of the rather cryptic usage of the conditional operator (:?) in that function.) [rjw: In fact, this patch fixes a regression introduced during the 3.3 merge window, because without it thaw_processes() may be called before swsusp_free() in some situations and that may lead to massive memory allocation failures.] Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Acked-by: Nigel Cunningham Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 24 ++++++++++++++++++++++-- kernel/power/process.c | 7 +++++-- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b..21724eee520 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. 
+ */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index eeca00311f3..7e426459e60 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } -- cgit v1.2.3-70-g09d2
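
[ed: illustrative sketch, not part of the commit. It fleshes out the
error path the changelog describes for a hypothetical caller; the
example_* names stand in for the filesystem freeze/thaw code the
changelog alludes to and are assumptions, as is the exact header that
declares the freezer entry points.]

#include <linux/freezer.h>

static void example_thaw_filesystems(void)
{
	/* placeholder: undo whatever the caller froze beforehand */
}

static int example_freeze_for_hibernation(void)
{
	int error;

	error = freeze_processes();
	if (error)
		return error;	/* everything was thawed automatically */

	error = freeze_kernel_threads();
	if (error) {
		/*
		 * After this patch, only kernel threads have been thawed
		 * here; userspace is still frozen, so caller-side cleanup
		 * can run before user tasks see the failure.
		 */
		example_thaw_filesystems();

		/* thaw userspace last, once cleanup is done */
		thaw_processes();
	}
	return error;
}

[ed: compare suspend_freeze_processes() in the power.h hunk above, which
follows exactly this pattern.]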