From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 18 Mar 2013 12:22:34 -0700
Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s

try_to_wake_up_local() should only be invoked to wake up another
task in the same runqueue and BUG_ON()s are used to enforce the
rule. Missing try_to_wake_up_local() can stall workqueue
execution but such stalls are likely to be finite either by
another work item being queued or the one blocked getting
unblocked.  There's no reason to trigger BUG while holding rq
lock crashing the whole system.

Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7b03cd2d4c..306943f531a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 
-	BUG_ON(rq != this_rq());
-	BUG_ON(p == current);
+	if (WARN_ON_ONCE(rq != this_rq()) ||
+	    WARN_ON_ONCE(p == current))
+		return;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
-- 
cgit v1.2.3-70-g09d2


From dd9c086d9f507d02d5ba4d7c5eef4bb9518088b8 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 18 Mar 2013 14:33:28 +0100
Subject: perf: Fix ring_buffer perf_output_space() boundary calculation

This patch fixes a flaw in perf_output_space(). In case the size
of the space needed is bigger than the actual buffer size, there
may be situations where the function would return true (i.e.,
there is space) when it should not. head > offset due to
rounding of the masking logic.

The problem can be tested by activating BTS on Intel processors.
A BTS record can be as big as 16 pages. The following command
fails:

  $ perf record -m 4 -c 1 -e branches:u my_test_program

You will get a buffer corruption with this. Perf report won't be
able to parse the perf.data.

The fix is to first check that the requested space is smaller
than the buffer size. If so, then the masking logic will work
fine. If not, then there is no chance the record can be saved
and it will be gracefully handled by upper code layers.

[ In v2, we also make the logic for the writable more explicit by
  renaming it to rb->overwrite because it tells whether or not the
  buffer can overwrite its tail (suggested by PeterZ). ]

Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: peterz@infradead.org
Cc: jolsa@redhat.com
Cc: fweisbec@gmail.com
Link: http://lkml.kernel.org/r/20130318133327.GA3056@quad
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/internal.h    |  2 +-
 kernel/events/ring_buffer.c | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8..eb675c4d59d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
 	int				page_order;	/* allocation order  */
 #endif
 	int				nr_pages;	/* nr of data pages  */
-	int				writable;	/* are we writable   */
+	int				overwrite;	/* can overwrite itself */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff397..97fddb09762 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
 static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
-	unsigned long mask;
+	unsigned long sz = perf_data_size(rb);
+	unsigned long mask = sz - 1;
 
-	if (!rb->writable)
+	/*
+	 * check if user-writable
+	 * overwrite : over-write its own tail
+	 * !overwrite: buffer possibly drops events.
+	 */
+	if (rb->overwrite)
 		return true;
 
-	mask = perf_data_size(rb) - 1;
+	/*
+	 * verify that payload is not bigger than buffer
+	 * otherwise masking logic may fail to detect
+	 * the "not enough space" condition
+	 */
+	if ((head - offset) > sz)
+		return false;
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 		rb->watermark = max_size / 2;
 
 	if (flags & RING_BUFFER_WRITABLE)
-		rb->writable = 1;
+		rb->overwrite = 0;
+	else
+		rb->overwrite = 1;
 
 	atomic_set(&rb->refcount, 1);
 
-- 
cgit v1.2.3-70-g09d2


From 84cc8fd2fe65866e49d70b38b3fdf7219dd92fe0 Mon Sep 17 00:00:00 2001
From: Michael Bohan <mbohan@codeaurora.org>
Date: Tue, 19 Mar 2013 19:19:25 -0700
Subject: hrtimer: Don't reinitialize a cpu_base lock on CPU_UP

The current code makes the assumption that a cpu_base lock won't be
held if the CPU corresponding to that cpu_base is offline, which isn't
always true.

If a hrtimer is not queued, then it will not be migrated by
migrate_hrtimers() when a CPU is offlined. Therefore, the hrtimer's
cpu_base may still point to a CPU which has subsequently gone offline
if the timer wasn't enqueued at the time the CPU went down.

Normally this wouldn't be a problem, but a cpu_base's lock is blindly
reinitialized each time a CPU is brought up. If a CPU is brought
online during the period that another thread is performing a hrtimer
operation on a stale hrtimer, then the lock will be reinitialized
under its feet, and a SPIN_BUG() like the following will be observed:

<0>[   28.082085] BUG: spinlock already unlocked on CPU#0, swapper/0/0
<0>[   28.087078]  lock: 0xc4780b40, value 0x0 .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1
<4>[   42.451150] [<c0014398>] (unwind_backtrace+0x0/0x120) from [<c0269220>] (do_raw_spin_unlock+0x44/0xdc)
<4>[   42.460430] [<c0269220>] (do_raw_spin_unlock+0x44/0xdc) from [<c071b5bc>] (_raw_spin_unlock+0x8/0x30)
<4>[   42.469632] [<c071b5bc>] (_raw_spin_unlock+0x8/0x30) from [<c00a9ce0>] (__hrtimer_start_range_ns+0x1e4/0x4f8)
<4>[   42.479521] [<c00a9ce0>] (__hrtimer_start_range_ns+0x1e4/0x4f8) from [<c00aa014>] (hrtimer_start+0x20/0x28)
<4>[   42.489247] [<c00aa014>] (hrtimer_start+0x20/0x28) from [<c00e6190>] (rcu_idle_enter_common+0x1ac/0x320)
<4>[   42.498709] [<c00e6190>] (rcu_idle_enter_common+0x1ac/0x320) from [<c00e6440>] (rcu_idle_enter+0xa0/0xb8)
<4>[   42.508259] [<c00e6440>] (rcu_idle_enter+0xa0/0xb8) from [<c000f268>] (cpu_idle+0x24/0xf0)
<4>[   42.516503] [<c000f268>] (cpu_idle+0x24/0xf0) from [<c06ed3c0>] (rest_init+0x88/0xa0)
<4>[   42.524319] [<c06ed3c0>] (rest_init+0x88/0xa0) from [<c0c00978>] (start_kernel+0x3d0/0x434)

As an example, this particular crash occurred when hrtimer_start() was
executed on CPU #0. The code locked the hrtimer's current cpu_base
corresponding to CPU #1. CPU #0 then tried to switch the hrtimer's
cpu_base to an optimal CPU which was online. In this case, it selected
the cpu_base corresponding to CPU #3.

Before it could proceed, CPU #1 came online and reinitialized the
spinlock corresponding to its cpu_base. Thus now CPU #0 held a lock
which was reinitialized. When CPU #0 finally ended up unlocking the
old cpu_base corresponding to CPU #1 so that it could switch to CPU
#3, we hit this SPIN_BUG() above while in switch_hrtimer_base().

CPU #0                            CPU #1
----                              ----
...                               <offline>
hrtimer_start()
lock_hrtimer_base(base #1)
...                               init_hrtimers_cpu()
switch_hrtimer_base()             ...
...                               raw_spin_lock_init(&cpu_base->lock)
raw_spin_unlock(&cpu_base->lock)  ...
<spin_bug>

Solve this by statically initializing the lock.

Signed-off-by: Michael Bohan <mbohan@codeaurora.org>
Link: http://lkml.kernel.org/r/1363745965-23475-1-git-send-email-mbohan@codeaurora.org
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3fe..14be27feda4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
 	.clock_base =
 	{
 		{
@@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
 	int i;
 
-	raw_spin_lock_init(&cpu_base->lock);
-
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 		timerqueue_init_head(&cpu_base->clock_base[i].active);
-- 
cgit v1.2.3-70-g09d2


From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Apr 2013 10:10:27 +0200
Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems

The sched_clock_remote() implementation has the following inatomicity
problem on 32bit systems when accessing the remote scd->clock, which
is a 64bit value.

CPU0			CPU1

sched_clock_local()	sched_clock_remote(CPU0)
...
			remote_clock = scd[CPU0]->clock
			    read_low32bit(scd[CPU0]->clock)
cmpxchg64(scd->clock,...)
			    read_high32bit(scd[CPU0]->clock)

While the update of scd->clock is using an atomic64 mechanism, the
readout on the remote cpu is not, which can cause completely bogus
readouts.

It is a quite rare problem, because it requires the update to hit the
narrow race window between the low/high readout and the update must go
across the 32bit boundary.

The resulting misbehaviour is, that CPU1 will see the sched_clock on
CPU1 ~4 seconds ahead of it's own and update CPU1s sched_clock value
to this bogus timestamp. This stays that way due to the clamping
implementation for about 4 seconds until the synchronization with
CLOCK_MONOTONIC undoes the problem.

The issue is hard to observe, because it might only result in a less
accurate SCHED_OTHER timeslicing behaviour. To create observable
damage on realtime scheduling classes, it is necessary that the bogus
update of CPU1 sched_clock happens in the context of an realtime
thread, which then gets charged 4 seconds of RT runtime, which results
in the RT throttler mechanism to trigger and prevent scheduling of RT
tasks for a little less than 4 seconds. So this is quite unlikely as
well.

The issue was quite hard to decode as the reproduction time is between
2 days and 3 weeks and intrusive tracing makes it less likely, but the
following trace recorded with trace_clock=global, which uses
sched_clock_local(), gave the final hint:

  <idle>-0   0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80
  <idle>-0   0d..30 400269.477151: hrtimer_start:  hrtimer=0xf7061e80 ...
irq/20-S-587 1d..32 400273.772118: sched_wakeup:   comm= ... target_cpu=0
  <idle>-0   0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80

What happens is that CPU0 goes idle and invokes
sched_clock_idle_sleep_event() which invokes sched_clock_local() and
CPU1 runs a remote wakeup for CPU0 at the same time, which invokes
sched_remote_clock(). The time jump gets propagated to CPU0 via
sched_remote_clock() and stays stale on both cores for ~4 seconds.

There are only two other possibilities, which could cause a stale
sched clock:

1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic
   wrong value.

2) sched_clock() which reads the TSC returns a sporadic wrong value.

#1 can be excluded because sched_clock would continue to increase for
   one jiffy and then go stale.

#2 can be excluded because it would not make the clock jump
   forward. It would just result in a stale sched_clock for one jiffy.

After quite some brain twisting and finding the same pattern on other
traces, sched_clock_remote() remained the only place which could cause
such a problem and as explained above it's indeed racy on 32bit
systems.

So while on 64bit systems the readout is atomic, we need to verify the
remote readout on 32bit machines. We need to protect the local->clock
readout in sched_clock_remote() on 32bit as well because an NMI could
hit between the low and the high readout, call sched_clock_local() and
modify local->clock.

Thanks to Siegfried Wulsch for bearing with my debug requests and
going through the tedious tasks of running a bunch of reproducer
systems to generate the debug information which let me decode the
issue.

Reported-by: Siegfried Wulsch <Siegfried.Wulsch@rovema.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
---
 kernel/sched/clock.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492d..c3ae1446461 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
 	u64 this_clock, remote_clock;
 	u64 *ptr, old_val, val;
 
+#if BITS_PER_LONG != 64
+again:
+	/*
+	 * Careful here: The local and the remote clock values need to
+	 * be read out atomic as we need to compare the values and
+	 * then update either the local or the remote side. So the
+	 * cmpxchg64 below only protects one readout.
+	 *
+	 * We must reread via sched_clock_local() in the retry case on
+	 * 32bit as an NMI could use sched_clock_local() via the
+	 * tracer and hit between the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	this_clock = sched_clock_local(my_scd);
+	/*
+	 * We must enforce atomic readout on 32bit, otherwise the
+	 * update on the remote cpu can hit inbetween the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+	/*
+	 * On 64bit the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32bit dance.
+	 */
 	sched_clock_local(my_scd);
 again:
 	this_clock = my_scd->clock;
 	remote_clock = scd->clock;
+#endif
 
 	/*
 	 * Use the opportunity that we have both locks
-- 
cgit v1.2.3-70-g09d2


From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001
From: libin <huawei.libin@huawei.com>
Date: Mon, 8 Apr 2013 14:39:12 +0800
Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow

Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on
sysctl") was an incomplete bug fix.

This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1]
avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX
on sysctl.

Signed-off-by: Libin <huawei.libin@huawei.com>
Cc: <jiang.liu@huawei.com>
Cc: <guohanjun@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 306943f531a..fa077929e31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 }
 
 static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 
 static void
 set_table_entry(struct ctl_table *entry,
-- 
cgit v1.2.3-70-g09d2


From c97847d2f0eb77c806e650e04d9bbcf79fa05730 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 11:48:27 +0800
Subject: perf: Fix strncpy() use, always make sure it's NUL terminated

For NUL terminated string, always make sure that there's '\0' at the end.

In our case we need a return value, so still use strncpy() and
fix up the tail explicitly.

(strlcpy() returns the size, not the pointer)

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: a.p.zijlstra@chello.nl <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org <paulus@samba.org>
Cc: acme@ghostprotocols.net <acme@ghostprotocols.net>
Link: http://lkml.kernel.org/r/51623E0B.7070101@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 59412d037ee..7f0d67ea3f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4737,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	} else {
 		if (arch_vma_name(mmap_event->vma)) {
 			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp));
+				       sizeof(tmp) - 1);
+			tmp[sizeof(tmp) - 1] = '\0';
 			goto got_name;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 67012ab1d2ce871afea4ee55408f233f97d09d07 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:06:44 +0800
Subject: perf: Fix strncpy() use, use strlcpy() instead of strncpy()

For NUL terminated string we always need to set '\0' at the end.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: rostedt@goodmis.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/51624254.30301@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade5698..3f5046a4d81 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -132,7 +132,7 @@ static char *default_bootup_tracer;
 
 static int __init set_cmdline_ftrace(char *str)
 {
-	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
 	ring_buffer_expanded = 1;
@@ -162,7 +162,7 @@ static char *trace_boot_options __initdata;
 
 static int __init set_trace_boot_options(char *str)
 {
-	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
 	trace_boot_options = trace_boot_options_buf;
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 75761cc15877c155b3849b4e0e0cb3f897faf471 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:12:39 +0800
Subject: ftrace: Fix strncpy() use, use strlcpy() instead of strncpy()

For NUL terminated string we always need to set '\0' at the end.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: rostedt@goodmis.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/516243B7.9020405@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf0..db143742704 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3441,14 +3441,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
-	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
 
 static int __init set_ftrace_filter(char *str)
 {
-	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
-- 
cgit v1.2.3-70-g09d2


From e614b3332a4f3f264a26da28e5a1f4cc3aea3974 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Thu, 4 Apr 2013 10:57:48 +0200
Subject: sched/cputime: Fix accounting on multi-threaded processes

Recent commit 6fac4829 ("cputime: Use accessors to read task
cputime stats") introduced a bug, where we account many times
the cputime of the first thread, instead of cputimes of all
the different threads.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130404085740.GA2495@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f..e93cca92f38 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		task_cputime(tsk, &utime, &stime);
+		task_cputime(t, &utime, &stime);
 		times->utime += utime;
 		times->stime += stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
-- 
cgit v1.2.3-70-g09d2


From 2930e04d00e113ae24bb2b7c2b58de7b648a62c7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 26 Mar 2013 17:33:00 -0400
Subject: tracing: Fix race with update_max_tr_single and changing tracers

The commit 34600f0e9 "tracing: Fix race with max_tr and changing tracers"
fixed the updating of the main buffers with the race of changing
tracers, but left out the fix to the updating of just a per cpu buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade5698..7ba7fc76f9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
+	if (!current_trace->allocated_snapshot) {
+		/* Only the nop tracer should hit this when disabling */
+		WARN_ON_ONCE(current_trace != &nop_trace);
 		return;
+	}
 
 	arch_spin_lock(&ftrace_max_lock);
 
-- 
cgit v1.2.3-70-g09d2


From 5000c418840b309251c5887f0b56503aae30f84c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 26 Mar 2013 17:53:03 +0100
Subject: ftrace: Consistently restore trace function on sysctl enabling

If we reenable ftrace via syctl, we currently set ftrace_trace_function
based on the previous simplistic algorithm. This is inconsistent with
what update_ftrace_function does. So better call that helper instead.

Link: http://lkml.kernel.org/r/5151D26F.1070702@siemens.com

Cc: stable@vger.kernel.org
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf0..cc4943c7ce6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 		ftrace_startup_sysctl();
 
 		/* we are starting ftrace again */
-		if (ftrace_ops_list != &ftrace_list_end) {
-			if (ftrace_ops_list->next == &ftrace_list_end)
-				ftrace_trace_function = ftrace_ops_list->func;
-			else
-				ftrace_trace_function = ftrace_ops_list_func;
-		}
+		if (ftrace_ops_list != &ftrace_list_end)
+			update_ftrace_function();
 
 	} else {
 		/* stopping ftrace calls (just send to ftrace_stub) */
-- 
cgit v1.2.3-70-g09d2


From 395b97a3aeff0b8d949ee3e67bf8c11c5ffd6861 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Mar 2013 09:31:28 -0400
Subject: ftrace: Do not call stub functions in control loop

The function tracing control loop used by perf spits out a warning
if the called function is not a control function. This is because
the control function references a per cpu allocated data structure
on struct ftrace_ops that is not allocated for other types of
functions.

commit 0a016409e42 "ftrace: Optimize the function tracer list loop"

Had an optimization done to all function tracing loops to optimize
for a single registered ops. Unfortunately, this allows for a slight
race when tracing starts or ends, where the stub function might be
called after the current registered ops is removed. In this case we
get the following dump:

root# perf stat -e ftrace:function sleep 1
[   74.339105] WARNING: at include/linux/ftrace.h:209 ftrace_ops_control_func+0xde/0xf0()
[   74.349522] Hardware name: PRIMERGY RX200 S6
[   74.357149] Modules linked in: sg igb iTCO_wdt ptp pps_core iTCO_vendor_support i7core_edac dca lpc_ich i2c_i801 coretemp edac_core crc32c_intel mfd_core ghash_clmulni_intel dm_multipath acpi_power_meter pcspk
r microcode vhost_net tun macvtap macvlan nfsd kvm_intel kvm auth_rpcgss nfs_acl lockd sunrpc uinput xfs libcrc32c sd_mod crc_t10dif sr_mod cdrom mgag200 i2c_algo_bit drm_kms_helper ttm qla2xxx mptsas ahci drm li
bahci scsi_transport_sas mptscsih libata scsi_transport_fc i2c_core mptbase scsi_tgt dm_mirror dm_region_hash dm_log dm_mod
[   74.446233] Pid: 1377, comm: perf Tainted: G        W    3.9.0-rc1 #1
[   74.453458] Call Trace:
[   74.456233]  [<ffffffff81062e3f>] warn_slowpath_common+0x7f/0xc0
[   74.462997]  [<ffffffff810fbc60>] ? rcu_note_context_switch+0xa0/0xa0
[   74.470272]  [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0
[   74.478117]  [<ffffffff81062e9a>] warn_slowpath_null+0x1a/0x20
[   74.484681]  [<ffffffff81102ede>] ftrace_ops_control_func+0xde/0xf0
[   74.491760]  [<ffffffff8162f400>] ftrace_call+0x5/0x2f
[   74.497511]  [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f
[   74.503486]  [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f
[   74.509500]  [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50
[   74.516088]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.522268]  [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50
[   74.528837]  [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0
[   74.536696]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.542878]  [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50
[   74.548869]  [<ffffffff81105c67>] unregister_ftrace_function+0x27/0x50
[   74.556243]  [<ffffffff8111eadf>] perf_ftrace_event_register+0x9f/0x140
[   74.563709]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.569887]  [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50
[   74.575898]  [<ffffffff8111e94e>] perf_trace_destroy+0x2e/0x50
[   74.582505]  [<ffffffff81127ba9>] tp_perf_event_destroy+0x9/0x10
[   74.589298]  [<ffffffff811295d0>] free_event+0x70/0x1a0
[   74.595208]  [<ffffffff8112a579>] perf_event_release_kernel+0x69/0xa0
[   74.602460]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.608667]  [<ffffffff8112a640>] put_event+0x90/0xc0
[   74.614373]  [<ffffffff8112a740>] perf_release+0x10/0x20
[   74.620367]  [<ffffffff811a3044>] __fput+0xf4/0x280
[   74.625894]  [<ffffffff811a31de>] ____fput+0xe/0x10
[   74.631387]  [<ffffffff81083697>] task_work_run+0xa7/0xe0
[   74.637452]  [<ffffffff81014981>] do_notify_resume+0x71/0xb0
[   74.643843]  [<ffffffff8162fa92>] int_signal+0x12/0x17

To fix this a new ftrace_ops flag is added that denotes the ftrace_list_end
ftrace_ops stub as just that, a stub. This flag is now checked in the
control loop and the function is not called if the flag is set.

Thanks to Jovi for not just reporting the bug, but also pointing out
where the bug was in the code.

Link: http://lkml.kernel.org/r/514A8855.7090402@redhat.com
Link: http://lkml.kernel.org/r/1364377499-1900-15-git-send-email-jovi.zhangwei@huawei.com

Tested-by: WANG Chao <chaowang@redhat.com>
Reported-by: WANG Chao <chaowang@redhat.com>
Reported-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 2 ++
 kernel/trace/ftrace.c  | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e5ca8ef50e9..167abf90780 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -89,6 +89,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
  *            that the call back has its own recursion protection. If it does
  *            not set this, then the ftrace infrastructure will add recursion
  *            protection for the caller.
+ * STUB   - The ftrace_ops is just a place holder.
  */
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
@@ -98,6 +99,7 @@ enum {
 	FTRACE_OPS_FL_SAVE_REGS			= 1 << 4,
 	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 5,
 	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 6,
+	FTRACE_OPS_FL_STUB			= 1 << 7,
 };
 
 struct ftrace_ops {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc4943c7ce6..7e897106b7e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
 
 static struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
-	.flags		= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
 };
 
 /* ftrace_enabled is a method to turn ftrace on or off */
@@ -4131,7 +4131,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
 	preempt_disable_notrace();
 	trace_recursion_set(TRACE_CONTROL_BIT);
 	do_for_each_ftrace_op(op, ftrace_control_list) {
-		if (!ftrace_function_local_disabled(op) &&
+		if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+		    !ftrace_function_local_disabled(op) &&
 		    ftrace_ops_test(op, ip))
 			op->func(ip, parent_ip, op, regs);
 	} while_for_each_ftrace_op(op);
-- 
cgit v1.2.3-70-g09d2


From 6f389a8f1dd22a24f3d9afc2812b30d639e94625 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhc@lemote.com>
Date: Sun, 7 Apr 2013 02:14:14 +0000
Subject: PM / reboot: call syscore_shutdown() after disable_nonboot_cpus()

As commit 40dc166c (PM / Core: Introduce struct syscore_ops for core
subsystems PM) say, syscore_ops operations should be carried with one
CPU on-line and interrupts disabled. However, after commit f96972f2d
(kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()),
syscore_shutdown() is called before disable_nonboot_cpus(), so break
the rules. We have a MIPS machine with a 8259A PIC, and there is an
external timer (HPET) linked at 8259A. Since 8259A has been shutdown
too early (by syscore_shutdown()), disable_nonboot_cpus() runs without
timer interrupt, so it hangs and reboot fails. This patch call
syscore_shutdown() a little later (after disable_nonboot_cpus()) to
avoid reboot failure, this is the same way as poweroff does.

For consistency, add disable_nonboot_cpus() to kernel_halt().

Signed-off-by: Huacai Chen <chenhc@lemote.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 39c9c4a2949..0da73cf73e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
 	system_state = SYSTEM_RESTART;
 	usermodehelper_disable();
 	device_shutdown();
-	syscore_shutdown();
 }
 
 /**
@@ -370,6 +369,7 @@ void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
 	disable_nonboot_cpus();
+	syscore_shutdown();
 	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
 	else
@@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 void kernel_halt(void)
 {
 	kernel_shutdown_prepare(SYSTEM_HALT);
+	disable_nonboot_cpus();
 	syscore_shutdown();
 	printk(KERN_EMERG "System halted.\n");
 	kmsg_dump(KMSG_DUMP_HALT);
-- 
cgit v1.2.3-70-g09d2


From 83e03b3fe4daffdebbb42151d5410d730ae50bd1 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Mon, 1 Apr 2013 21:46:23 +0900
Subject: tracing: Fix double free when function profile init failed

On the failure path, stat->start and stat->pages will refer same page.
So it'll attempt to free the same page again and get kernel panic.

Link: http://lkml.kernel.org/r/1364820385-32027-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e897106b7e..926ebfb7493 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 		free_page(tmp);
 	}
 
-	free_page((unsigned long)stat->pages);
 	stat->pages = NULL;
 	stat->start = NULL;
 
-- 
cgit v1.2.3-70-g09d2


From c481420248c6730246d2a1b1773d5d7007ae0835 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Fri, 12 Apr 2013 11:05:54 +0800
Subject: perf: Fix error return code

Fix to return -ENOMEM in the allocation error case instead of 0
(if pmu_bus_running == 1), as done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Cc: a.p.zijlstra@chello.nl
Cc: paulus@samba.org
Cc: acme@ghostprotocols.net
Link: http://lkml.kernel.org/r/CAPgLHd8j_fWcgqe%3DKLWjpBj%2B%3Do0Pw6Z-SEq%3DNTPU08c2w1tngQ@mail.gmail.com
[ Tweaked the error code setting placement and the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f0d67ea3f4..7e0962ed7f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5987,6 +5987,7 @@ skip_type:
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
 
+	ret = -ENOMEM;
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
 		goto free_dev;
-- 
cgit v1.2.3-70-g09d2


From f2530dc71cf0822f90bb63ea4600caaef33a66bb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Apr 2013 09:33:34 +0200
Subject: kthread: Prevent unpark race which puts threads on the wrong cpu

The smpboot threads rely on the park/unpark mechanism which binds per
cpu threads on a particular core. Though the functionality is racy:

CPU0	       	 	CPU1  	     	    CPU2
unpark(T)				    wake_up_process(T)
  clear(SHOULD_PARK)	T runs
			leave parkme() due to !SHOULD_PARK
  bind_to(CPU2)		BUG_ON(wrong CPU)

We cannot let the tasks move themself to the target CPU as one of
those tasks is actually the migration thread itself, which requires
that it starts running on the target cpu right away.

The solution to this problem is to prevent wakeups in park mode which
are not from unpark(). That way we can guarantee that the association
of the task to the target cpu is working correctly.

Add a new task state (TASK_PARKED) which prevents other wakeups and
use this state explicitly for the unpark wakeup.

Peter noticed: Also, since the task state is visible to userspace and
all the parked tasks are still in the PID space, its a good hint in ps
and friends that these tasks aren't really there for the moment.

The migration thread has another related issue.

CPU0	      	     	 CPU1
Bring up CPU2
create_thread(T)
park(T)
 wait_for_completion()
			 parkme()
			 complete()
sched_set_stop_task()
			 schedule(TASK_PARKED)

The sched_set_stop_task() call is issued while the task is on the
runqueue of CPU1 and that confuses the hell out of the stop_task class
on that cpu. So we need the same synchronizaion before
sched_set_stop_task().

Reported-by: Dave Jones <davej@redhat.com>
Reported-and-tested-by: Dave Hansen <dave@sr71.net>
Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
Acked-by: Peter Ziljstra <peterz@infradead.org>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: dhillf@gmail.com
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/array.c              |  1 +
 include/linux/sched.h        |  5 +++--
 include/trace/events/sched.h |  2 +-
 kernel/kthread.c             | 52 ++++++++++++++++++++++++--------------------
 kernel/smpboot.c             | 14 ++++++++++--
 5 files changed, 45 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index f7ed9ee46eb..cbd0f1b324b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -143,6 +143,7 @@ static const char * const task_state_array[] = {
 	"x (dead)",		/*  64 */
 	"K (wakekill)",		/* 128 */
 	"W (waking)",		/* 256 */
+	"P (parked)",		/* 512 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbf..e692a022527 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -163,9 +163,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #define TASK_DEAD		64
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
-#define TASK_STATE_MAX		512
+#define TASK_PARKED		512
+#define TASK_STATE_MAX		1024
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 5a8671e8a67..e5586caff67 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -147,7 +147,7 @@ TRACE_EVENT(sched_switch,
 		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
 				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
-				{ 128, "W" }) : "R",
+				{ 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
 		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9ba..9eb7fed0bba 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task)
 
 static void __kthread_parkme(struct kthread *self)
 {
-	__set_current_state(TASK_INTERRUPTIBLE);
+	__set_current_state(TASK_PARKED);
 	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
 		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
 			complete(&self->parked);
 		schedule();
-		__set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_PARKED);
 	}
 	clear_bit(KTHREAD_IS_PARKED, &self->flags);
 	__set_current_state(TASK_RUNNING);
@@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
 {
+	/* Must have done schedule() in kthread() before we set_task_cpu */
+	if (!wait_task_inactive(p, state)) {
+		WARN_ON(1);
+		return;
+	}
 	/* It's safe because the task is inactive. */
 	do_set_cpus_allowed(p, cpumask_of(cpu));
 	p->flags |= PF_THREAD_BOUND;
@@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
  */
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
-	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
-		WARN_ON(1);
-		return;
-	}
-	__kthread_bind(p, cpu);
+	__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(kthread_bind);
 
@@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
 	return NULL;
 }
 
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+{
+	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+	/*
+	 * We clear the IS_PARKED bit here as we don't wait
+	 * until the task has left the park code. So if we'd
+	 * park before that happens we'd see the IS_PARKED bit
+	 * which might be about to be cleared.
+	 */
+	if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+		if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+			__kthread_bind(k, kthread->cpu, TASK_PARKED);
+		wake_up_state(k, TASK_PARKED);
+	}
+}
+
 /**
  * kthread_unpark - unpark a thread created by kthread_create().
  * @k:		thread created by kthread_create().
@@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k)
 {
 	struct kthread *kthread = task_get_live_kthread(k);
 
-	if (kthread) {
-		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
-		/*
-		 * We clear the IS_PARKED bit here as we don't wait
-		 * until the task has left the park code. So if we'd
-		 * park before that happens we'd see the IS_PARKED bit
-		 * which might be about to be cleared.
-		 */
-		if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-			if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
-				__kthread_bind(k, kthread->cpu);
-			wake_up_process(k);
-		}
-	}
+	if (kthread)
+		__kthread_unpark(k, kthread);
 	put_task_struct(k);
 }
 
@@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k)
 	trace_sched_kthread_stop(k);
 	if (kthread) {
 		set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
-		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+		__kthread_unpark(k, kthread);
 		wake_up_process(k);
 		wait_for_completion(&kthread->exited);
 	}
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 8eaed9aa9cf..02fc5c93367 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 	}
 	get_task_struct(tsk);
 	*per_cpu_ptr(ht->store, cpu) = tsk;
-	if (ht->create)
-		ht->create(cpu);
+	if (ht->create) {
+		/*
+		 * Make sure that the task has actually scheduled out
+		 * into park position, before calling the create
+		 * callback. At least the migration thread callback
+		 * requires that the task is off the runqueue.
+		 */
+		if (!wait_task_inactive(tsk, TASK_PARKED))
+			WARN_ON(1);
+		else
+			ht->create(cpu);
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6a76f8c0ab19f215af2a3442870eeb5f0e81998d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Thu, 11 Apr 2013 15:55:01 +0900
Subject: tracing: Fix possible NULL pointer dereferences

Currently set_ftrace_pid and set_graph_function files use seq_lseek
for their fops.  However seq_open() is called only for FMODE_READ in
the fops->open() so that if an user tries to seek one of those file
when she open it for writing, it sees NULL seq_file and then panic.

It can be easily reproduced with following command:

  $ cd /sys/kernel/debug/tracing
  $ echo 1234 | sudo tee -a set_ftrace_pid

In this example, GNU coreutils' tee opens the file with fopen(, "a")
and then the fopen() internally calls lseek().

Link: http://lkml.kernel.org/r/1365663302-2170-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h     |  2 +-
 kernel/trace/ftrace.c      | 10 +++++-----
 kernel/trace/trace_stack.c |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 167abf90780..eb3ce327b97 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -396,7 +396,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
 			    size_t cnt, loff_t *ppos);
 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
 			     size_t cnt, loff_t *ppos);
-loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence);
+loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
 int ftrace_regex_release(struct inode *inode, struct file *file);
 
 void __init
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 926ebfb7493..affc35d829c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2697,7 +2697,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 }
 
 loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t ret;
 
@@ -3570,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3578,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = seq_read,
 	.write = ftrace_notrace_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3783,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = {
 	.open		= ftrace_graph_open,
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
+	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_graph_release,
-	.llseek		= seq_lseek,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
@@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = {
 	.open		= ftrace_pid_open,
 	.write		= ftrace_pid_write,
 	.read		= seq_read,
-	.llseek		= seq_lseek,
+	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_pid_release,
 };
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc70..83a8b5b7bd3 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = {
 	.open = stack_trace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 12 Apr 2013 16:40:13 -0400
Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section

As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to
be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the
ftrace_pid_fops is defined when DYNAMIC_FTRACE is not.

Cc: stable@vger.kernel.org
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  3 ++-
 kernel/trace/ftrace.c  | 28 ++++++++++++++--------------
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index eb3ce327b97..52da2a25079 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -396,7 +396,6 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
 			    size_t cnt, loff_t *ppos);
 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
 			     size_t cnt, loff_t *ppos);
-loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
 int ftrace_regex_release(struct inode *inode, struct file *file);
 
 void __init
@@ -569,6 +568,8 @@ static inline int
 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
+
 /* totally disable ftrace - can not re-enable after this */
 void ftrace_kill(void);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index affc35d829c..2461ede45a8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1052,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, whence);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -2612,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
  * routine, you can use ftrace_filter_write() for the write
  * routine if @flag has FTRACE_ITER_FILTER set, or
  * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
  * release must call ftrace_regex_release().
  */
 int
@@ -2696,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 				 inode, file);
 }
 
-loff_t
-ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
-{
-	loff_t ret;
-
-	if (file->f_mode & FMODE_READ)
-		ret = seq_lseek(file, offset, whence);
-	else
-		file->f_pos = ret = 1;
-
-	return ret;
-}
-
 static int ftrace_match(char *str, char *regex, int len, int type)
 {
 	int matched = 0;
-- 
cgit v1.2.3-70-g09d2


From 935d8aabd4331f47a89c3e1daa5779d23cf244ee Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 14 Apr 2013 10:06:31 -0700
Subject: Add file_ns_capable() helper function for open-time capability
 checking

Nothing is using it yet, but this will allow us to delay the open-time
checks to use time, without breaking the normal UNIX permission
semantics where permissions are determined by the opener (and the file
descriptor can then be passed to a different process, or the process can
drop capabilities).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/capability.h |  2 ++
 kernel/capability.c        | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 98503b79236..d9a4f7f40f3 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -35,6 +35,7 @@ struct cpu_vfs_cap_data {
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
 
+struct file;
 struct inode;
 struct dentry;
 struct user_namespace;
@@ -211,6 +212,7 @@ extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
 extern bool nsown_capable(int cap);
 extern bool inode_capable(const struct inode *inode, int cap);
+extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d9725948..f6c2ce5701e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -392,6 +392,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file:  The file we want to check
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+	if (WARN_ON_ONCE(!cap_valid(cap)))
+		return false;
+
+	if (security_capable(file->f_cred, ns, cap) == 0)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
 /**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
-- 
cgit v1.2.3-70-g09d2


From 6708075f104c3c9b04b23336bb0366ca30c3931b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 14 Apr 2013 13:47:02 -0700
Subject: userns: Don't let unprivileged users trick privileged users into
 setting the id_map

When we require privilege for setting /proc/<pid>/uid_map or
/proc/<pid>/gid_map no longer allow an unprivileged user to
open the file and pass it to a privileged program to write
to the file.

Instead when privilege is required require both the opener and the
writer to have the necessary capabilities.

I have tested this code and verified that setting /proc/<pid>/uid_map
fails when an unprivileged user opens the file and a privielged user
attempts to set the mapping, that unprivileged users can still map
their own id, and that a privileged users can still setup an arbitrary
mapping.

Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/user_namespace.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a54f26f82eb..e2d4ace4481 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -25,7 +25,8 @@
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);
 
 static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -700,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 
 	ret = -EPERM;
 	/* Validate the user is allowed to use user id's mapped to. */
-	if (!new_idmap_permitted(ns, cap_setid, &new_map))
+	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
 		goto out;
 
 	/* Map the lower ids from the parent user namespace to the
@@ -787,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
 			 &ns->projid_map, &ns->parent->projid_map);
 }
 
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file, 
+				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *new_map)
 {
 	/* Allow mapping to your own filesystem ids */
@@ -811,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 
 	/* Allow the specified ids if we have the appropriate capability
 	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+	 * And the opener of the id file also had the approprpiate capability.
 	 */
-	if (ns_capable(ns->parent, cap_setid))
+	if (ns_capable(ns->parent, cap_setid) &&
+	    file_ns_capable(file, ns->parent, cap_setid))
 		return true;
 
 	return false;
-- 
cgit v1.2.3-70-g09d2


From e3211c120a85b792978bcb4be7b2886df18d27f0 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Sun, 14 Apr 2013 16:28:19 -0700
Subject: userns: Check uid_map's opener's fsuid, not the current fsuid

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/user_namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e2d4ace4481..5c16f3aa757 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -797,12 +797,12 @@ static bool new_idmap_permitted(const struct file *file,
 		u32 id = new_map->extent[0].lower_first;
 		if (cap_setid == CAP_SETUID) {
 			kuid_t uid = make_kuid(ns->parent, id);
-			if (uid_eq(uid, current_fsuid()))
+			if (uid_eq(uid, file->f_cred->fsuid))
 				return true;
 		}
 		else if (cap_setid == CAP_SETGID) {
 			kgid_t gid = make_kgid(ns->parent, id);
-			if (gid_eq(gid, current_fsgid()))
+			if (gid_eq(gid, file->f_cred->fsgid))
 				return true;
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From 41c21e351e79004dbb4efa4bc14a53a7e0af38c5 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Sun, 14 Apr 2013 11:44:04 -0700
Subject: userns: Changing any namespace id mappings should require privileges

Changing uid/gid/projid mappings doesn't change your id within the
namespace; it reconfigures the namespace.  Unprivileged programs should
*not* be able to write these files.  (We're also checking the privileges
on the wrong task.)

Given the write-once nature of these files and the other security
checks, this is likely impossible to usefully exploit.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/user_namespace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 5c16f3aa757..e134d8f365d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -613,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	if (map->nr_extents != 0)
 		goto out;
 
-	/* Require the appropriate privilege CAP_SETUID or CAP_SETGID
-	 * over the user namespace in order to set the id mapping.
+	/*
+	 * Adjusting namespace settings requires capabilities on the target.
 	 */
-	if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
+	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
 		goto out;
 
 	/* Get a buffer */
-- 
cgit v1.2.3-70-g09d2


From 8176cced706b5e5d15887584150764894e94e02f Mon Sep 17 00:00:00 2001
From: Tommi Rantala <tt.rantala@gmail.com>
Date: Sat, 13 Apr 2013 22:49:14 +0300
Subject: perf: Treat attr.config as u64 in perf_swevent_init()

Trinity discovered that we fail to check all 64 bits of
attr.config passed by user space, resulting to out-of-bounds
access of the perf_swevent_enabled array in
sw_perf_event_destroy().

Introduced in commit b0a873ebb ("perf: Register PMU
implementations").

Signed-off-by: Tommi Rantala <tt.rantala@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Link: http://lkml.kernel.org/r/1365882554-30259-1-git-send-email-tt.rantala@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7e0962ed7f8..4d3124b3927 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5331,7 +5331,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 static int perf_swevent_init(struct perf_event *event)
 {
-	int event_id = event->attr.config;
+	u64 event_id = event->attr.config;
 
 	if (event->attr.type != PERF_TYPE_SOFTWARE)
 		return -ENOENT;
-- 
cgit v1.2.3-70-g09d2


From 55a20ee7804ab64ac90bcdd4e2868a42829e2784 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 15 Apr 2013 22:23:47 -0700
Subject: x86, kdump: Retore crashkernel= to allocate under 896M

Vivek found old kexec-tools does not work new kernel anymore.

So change back crashkernel= back to old behavoir, and add crashkernel_high=
to let user decide if buffer could be above 4G, and also new kexec-tools will
be needed.

-v2: let crashkernel=X override crashkernel_high=
    update description about _high will be ignored by crashkernel=X
-v3: update description about kernel-parameters.txt according to Vivek.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1366089828-19692-4-git-send-email-yinghai@kernel.org
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 Documentation/kernel-parameters.txt | 13 +++++++++++--
 arch/x86/kernel/setup.c             | 24 +++++++++++++++++++-----
 include/linux/kexec.h               |  2 ++
 kernel/kexec.c                      |  9 +++++++++
 4 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cff672da248..709eb3edc6b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -603,9 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			a memory unit (amount[KMG]). See also
 			Documentation/kdump/kdump.txt for an example.
 
+	crashkernel_high=size[KMG]
+			[KNL, x86_64] range could be above 4G. Allow kernel
+			to allocate physical memory region from top, so could
+			be above 4G if system have more than 4G ram installed.
+			Otherwise memory region will be allocated below 4G, if
+			available.
+			It will be ignored if crashkernel=X is specified.
 	crashkernel_low=size[KMG]
-			[KNL, x86_64] range under 4G. When crashkernel= is
-			passed, kernel allocate physical memory region
+			[KNL, x86_64] range under 4G. When crashkernel_high= is
+			passed, kernel could allocate physical memory region
 			above 4G, that cause second kernel crash on system
 			that require some amount of low memory, e.g. swiotlb
 			requires at least 64M+32K low memory.  Kernel would
@@ -613,6 +620,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			This one let user to specify own low range under 4G
 			for second kernel instead.
 			0: to disable low allocation.
+			It will be ignored when crashkernel_high=X is not used
+			or memory reserved is below 4G.
 
 	cs89x0_dma=	[HW,NET]
 			Format: <dma>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 12349202cae..a85a144f205 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -507,11 +507,14 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
  * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64bit, old kexec-tools need to under 896MiB.
  */
 #ifdef CONFIG_X86_32
-# define CRASH_KERNEL_ADDR_MAX	(512 << 20)
+# define CRASH_KERNEL_ADDR_LOW_MAX	(512 << 20)
+# define CRASH_KERNEL_ADDR_HIGH_MAX	(512 << 20)
 #else
-# define CRASH_KERNEL_ADDR_MAX	MAXMEM
+# define CRASH_KERNEL_ADDR_LOW_MAX	(896UL<<20)
+# define CRASH_KERNEL_ADDR_HIGH_MAX	MAXMEM
 #endif
 
 static void __init reserve_crashkernel_low(void)
@@ -525,6 +528,7 @@ static void __init reserve_crashkernel_low(void)
 	int ret;
 
 	total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+	/* crashkernel_low=YM */
 	ret = parse_crashkernel_low(boot_command_line, total_low_mem,
 						&low_size, &base);
 	if (ret != 0) {
@@ -569,14 +573,22 @@ static void __init reserve_crashkernel(void)
 	const unsigned long long alignment = 16<<20;	/* 16M */
 	unsigned long long total_mem;
 	unsigned long long crash_size, crash_base;
+	bool high = false;
 	int ret;
 
 	total_mem = memblock_phys_mem_size();
 
+	/* crashkernel=XM */
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
-	if (ret != 0 || crash_size <= 0)
-		return;
+	if (ret != 0 || crash_size <= 0) {
+		/* crashkernel_high=XM */
+		ret = parse_crashkernel_high(boot_command_line, total_mem,
+				&crash_size, &crash_base);
+		if (ret != 0 || crash_size <= 0)
+			return;
+		high = true;
+	}
 
 	/* 0 means: find the address automatically */
 	if (crash_base <= 0) {
@@ -584,7 +596,9 @@ static void __init reserve_crashkernel(void)
 		 *  kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
 		 */
 		crash_base = memblock_find_in_range(alignment,
-			       CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
+					high ? CRASH_KERNEL_ADDR_HIGH_MAX :
+					       CRASH_KERNEL_ADDR_LOW_MAX,
+					crash_size, alignment);
 
 		if (!crash_base) {
 			pr_info("crashkernel reservation failed - No suitable area found.\n");
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d2e6927bbaa..d78d28a733b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -200,6 +200,8 @@ extern size_t vmcoreinfo_max_size;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
+int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
+		unsigned long long *crash_size, unsigned long long *crash_base);
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 int crash_shrink_memory(unsigned long new_size);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b..1b2f73f5f9b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1422,6 +1422,15 @@ int __init parse_crashkernel(char *cmdline,
 					"crashkernel=");
 }
 
+int __init parse_crashkernel_high(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+					"crashkernel_high=");
+}
+
 int __init parse_crashkernel_low(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
-- 
cgit v1.2.3-70-g09d2


From adbc742bf78695bb98c79d18c558b61571748b99 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 15 Apr 2013 22:23:48 -0700
Subject: x86, kdump: Change crashkernel_high/low= to crashkernel=,high/low

Per hpa, use crashkernel=X,high crashkernel=Y,low instead of
crashkernel_hign=X crashkernel_low=Y. As that could be extensible.

-v2: according to Vivek, change delimiter to ;
-v3: let hign and low only handle simple form and it conforms to
	description in kernel-parameters.txt
     still keep crashkernel=X override any crashkernel=X,high
        crashkernel=Y,low
-v4: update get_last_crashkernel returning and add more strict
     checking in parse_crashkernel_simple() found by HATAYAMA.
-v5: Change delimiter back to , according to HPA.
     also separate parse_suffix from parse_simper according to vivek.
	so we can avoid @pos in that path.
-v6: Tight the checking about crashkernel=X,highblahblah,high
     found by HTYAYAMA.

Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1366089828-19692-5-git-send-email-yinghai@kernel.org
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 Documentation/kernel-parameters.txt |  10 ++--
 arch/x86/kernel/setup.c             |   6 +-
 kernel/kexec.c                      | 109 +++++++++++++++++++++++++++++++-----
 3 files changed, 104 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 709eb3edc6b..a1ac1f1d6d3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -603,16 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			a memory unit (amount[KMG]). See also
 			Documentation/kdump/kdump.txt for an example.
 
-	crashkernel_high=size[KMG]
+	crashkernel=size[KMG],high
 			[KNL, x86_64] range could be above 4G. Allow kernel
 			to allocate physical memory region from top, so could
 			be above 4G if system have more than 4G ram installed.
 			Otherwise memory region will be allocated below 4G, if
 			available.
 			It will be ignored if crashkernel=X is specified.
-	crashkernel_low=size[KMG]
-			[KNL, x86_64] range under 4G. When crashkernel_high= is
-			passed, kernel could allocate physical memory region
+	crashkernel=size[KMG],low
+			[KNL, x86_64] range under 4G. When crashkernel=X,high
+			is passed, kernel could allocate physical memory region
 			above 4G, that cause second kernel crash on system
 			that require some amount of low memory, e.g. swiotlb
 			requires at least 64M+32K low memory.  Kernel would
@@ -620,7 +620,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			This one let user to specify own low range under 4G
 			for second kernel instead.
 			0: to disable low allocation.
-			It will be ignored when crashkernel_high=X is not used
+			It will be ignored when crashkernel=X,high is not used
 			or memory reserved is below 4G.
 
 	cs89x0_dma=	[HW,NET]
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a85a144f205..fae9134a2de 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -528,7 +528,7 @@ static void __init reserve_crashkernel_low(void)
 	int ret;
 
 	total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
-	/* crashkernel_low=YM */
+	/* crashkernel=Y,low */
 	ret = parse_crashkernel_low(boot_command_line, total_low_mem,
 						&low_size, &base);
 	if (ret != 0) {
@@ -542,7 +542,7 @@ static void __init reserve_crashkernel_low(void)
 		low_size = swiotlb_size_or_default() + (8UL<<20);
 		auto_set = true;
 	} else {
-		/* passed with crashkernel_low=0 ? */
+		/* passed with crashkernel=0,low ? */
 		if (!low_size)
 			return;
 	}
@@ -582,7 +582,7 @@ static void __init reserve_crashkernel(void)
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
 	if (ret != 0 || crash_size <= 0) {
-		/* crashkernel_high=XM */
+		/* crashkernel=X,high */
 		ret = parse_crashkernel_high(boot_command_line, total_mem,
 				&crash_size, &crash_base);
 		if (ret != 0 || crash_size <= 0)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1b2f73f5f9b..401fdb041f3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1368,35 +1368,114 @@ static int __init parse_crashkernel_simple(char 		*cmdline,
 	return 0;
 }
 
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+	[SUFFIX_HIGH] = ",high",
+	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_NULL] = NULL,
+};
+
 /*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * That function parses "suffix"  crashkernel command lines like
+ *
+ *	crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
  */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					   unsigned long long	*crash_size,
+					   unsigned long long	*crash_base,
+					   const char *suffix)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* check with suffix */
+	if (strncmp(cur, suffix, strlen(suffix))) {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+	cur += strlen(suffix);
+	if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+			     const char *name,
+			     const char *suffix)
+{
+	char *p = cmdline, *ck_cmdline = NULL;
+
+	/* find crashkernel and use the last one if there are more */
+	p = strstr(p, name);
+	while (p) {
+		char *end_p = strchr(p, ' ');
+		char *q;
+
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (!suffix) {
+			int i;
+
+			/* skip the one with any known suffix */
+			for (i = 0; suffix_tbl[i]; i++) {
+				q = end_p - strlen(suffix_tbl[i]);
+				if (!strncmp(q, suffix_tbl[i],
+					     strlen(suffix_tbl[i])))
+					goto next;
+			}
+			ck_cmdline = p;
+		} else {
+			q = end_p - strlen(suffix);
+			if (!strncmp(q, suffix, strlen(suffix)))
+				ck_cmdline = p;
+		}
+next:
+		p = strstr(p+1, name);
+	}
+
+	if (!ck_cmdline)
+		return NULL;
+
+	return ck_cmdline;
+}
+
 static int __init __parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base,
-				const char *name)
+			     const char *name,
+			     const char *suffix)
 {
-	char 	*p = cmdline, *ck_cmdline = NULL;
 	char	*first_colon, *first_space;
+	char	*ck_cmdline;
 
 	BUG_ON(!crash_size || !crash_base);
 	*crash_size = 0;
 	*crash_base = 0;
 
-	/* find crashkernel and use the last one if there are more */
-	p = strstr(p, name);
-	while (p) {
-		ck_cmdline = p;
-		p = strstr(p+1, name);
-	}
+	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
 
 	if (!ck_cmdline)
 		return -EINVAL;
 
 	ck_cmdline += strlen(name);
 
+	if (suffix)
+		return parse_crashkernel_suffix(ck_cmdline, crash_size,
+				crash_base, suffix);
 	/*
 	 * if the commandline contains a ':', then that's the extended
 	 * syntax -- if not, it must be the classic syntax
@@ -1413,13 +1492,17 @@ static int __init __parse_crashkernel(char *cmdline,
 	return 0;
 }
 
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
 int __init parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel=");
+					"crashkernel=", NULL);
 }
 
 int __init parse_crashkernel_high(char *cmdline,
@@ -1428,7 +1511,7 @@ int __init parse_crashkernel_high(char *cmdline,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel_high=");
+				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
 }
 
 int __init parse_crashkernel_low(char *cmdline,
@@ -1437,7 +1520,7 @@ int __init parse_crashkernel_low(char *cmdline,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel_low=");
+				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
 }
 
 static void update_vmcoreinfo_note(void)
-- 
cgit v1.2.3-70-g09d2


From 157752d84f5df47e01577970f9c5f61a0b9f4546 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 15 Apr 2013 22:23:46 -0700
Subject: kexec: use Crash kernel for Crash kernel low

We can extend kexec-tools to support multiple "Crash kernel" in /proc/iomem
instead.

So we can use "Crash kernel" instead of "Crash kernel low" in /proc/iomem.

Suggested-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1366089828-19692-3-git-send-email-yinghai@kernel.org
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 401fdb041f3..ffd4e111fd6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 struct resource crashk_low_res = {
-	.name  = "Crash kernel low",
+	.name  = "Crash kernel",
 	.start = 0,
 	.end   = 0,
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
-- 
cgit v1.2.3-70-g09d2


From b9e146d8eb3b9ecae5086d373b50fa0c1f3e7f0f Mon Sep 17 00:00:00 2001
From: Emese Revfy <re.emese@gmail.com>
Date: Wed, 17 Apr 2013 15:58:36 -0700
Subject: kernel/signal.c: stop info leak via the tkill and the tgkill syscalls

This fixes a kernel memory contents leak via the tkill and tgkill syscalls
for compat processes.

This is visible in the siginfo_t->_sifields._rt.si_sigval.sival_ptr field
when handling signals delivered from tkill.

The place of the infoleak:

int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
{
        ...
        put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
        ...
}

Signed-off-by: Emese Revfy <re.emese@gmail.com>
Reviewed-by: PaX Team <pageexec@freemail.hu>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index dd72567767d..598dc06be42 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2948,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 
 static int do_tkill(pid_t tgid, pid_t pid, int sig)
 {
-	struct siginfo info;
+	struct siginfo info = {};
 
 	info.si_signo = sig;
 	info.si_errno = 0;
-- 
cgit v1.2.3-70-g09d2


From 5c51543b0ae45967cfa99ca16748dc72888cc265 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 18 Apr 2013 18:33:18 +0900
Subject: kprobes: Fix a double lock bug of kprobe_mutex

Fix a double locking bug caused when debug.kprobe-optimization=0.
While the proc_kprobes_optimization_handler locks kprobe_mutex,
wait_for_kprobe_optimizer locks it again and that causes a double lock.
To fix the bug, this introduces different mutex for protecting
sysctl parameter and locks it in proc_kprobes_optimization_handler.
Of course, since we need to lock kprobe_mutex when touching kprobes
resources, that is done in *optimize_all_kprobes().

This bug was introduced by commit ad72b3bea744 ("kprobes: fix
wait_for_kprobe_optimizer()")

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f661..3fed7f0cbcd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
 }
 
 #ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
 static void __kprobes optimize_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct kprobe *p;
 	unsigned int i;
 
+	mutex_lock(&kprobe_mutex);
 	/* If optimization is already allowed, just return */
 	if (kprobes_allow_optimization)
-		return;
+		goto out;
 
 	kprobes_allow_optimization = true;
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
 				optimize_kprobe(p);
 	}
 	printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+	mutex_unlock(&kprobe_mutex);
 }
 
-/* This should be called with kprobe_mutex locked */
 static void __kprobes unoptimize_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct kprobe *p;
 	unsigned int i;
 
+	mutex_lock(&kprobe_mutex);
 	/* If optimization is already prohibited, just return */
-	if (!kprobes_allow_optimization)
+	if (!kprobes_allow_optimization) {
+		mutex_unlock(&kprobe_mutex);
 		return;
+	}
 
 	kprobes_allow_optimization = false;
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
 				unoptimize_kprobe(p, false);
 		}
 	}
+	mutex_unlock(&kprobe_mutex);
+
 	/* Wait for unoptimizing completion */
 	wait_for_kprobe_optimizer();
 	printk(KERN_INFO "Kprobes globally unoptimized\n");
 }
 
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
 int sysctl_kprobes_optimization;
 int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 				      void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 {
 	int ret;
 
-	mutex_lock(&kprobe_mutex);
+	mutex_lock(&kprobe_sysctl_mutex);
 	sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 		optimize_all_kprobes();
 	else
 		unoptimize_all_kprobes();
-	mutex_unlock(&kprobe_mutex);
+	mutex_unlock(&kprobe_sysctl_mutex);
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 0a82a8d132b26d438eb90b3ab35a7016e7227a1d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 18 Apr 2013 09:00:26 -0700
Subject: Revert "block: add missing block_bio_complete() tracepoint"

This reverts commit 3a366e614d0837d9fc23f78cdb1a1186ebc3387f.

Wanlong Gao reports that it causes a kernel panic on his machine several
minutes after boot. Reverting it removes the panic.

Jens says:
 "It's not quite clear why that is yet, so I think we should just revert
  the commit for 3.9 final (which I'm assuming is pretty close).

  The wifi is crap at the LSF hotel, so sending this email instead of
  queueing up a revert and pull request."

Reported-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Requested-by: Jens Axboe <axboe@kernel.dk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-core.c             |  1 +
 drivers/md/dm.c              |  1 +
 drivers/md/raid5.c           | 11 ++++++++++-
 fs/bio.c                     |  2 --
 include/linux/blktrace_api.h |  1 -
 include/trace/events/block.h |  8 ++++----
 kernel/trace/blktrace.c      | 26 +++-----------------------
 7 files changed, 19 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index 074b758efc4..7c288358a74 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
 DEFINE_IDA(blk_queue_ida);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7e469260fe5..9a0bdad9ad8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -611,6 +611,7 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
+			trace_block_bio_complete(md->queue, bio, io_error);
 			bio_endio(bio, io_error);
 		}
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 24909eb13fe..f4e87bfc756 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi)
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_size = 0;
+		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+					 bi, 0);
 		bio_endio(bi, 0);
 		bi = return_bi;
 	}
@@ -3914,6 +3916,8 @@ static void raid5_align_endio(struct bio *bi, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error && uptodate) {
+		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+					 raid_bi, 0);
 		bio_endio(raid_bi, 0);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_stripe);
@@ -4382,6 +4386,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 
+		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+					 bi, 0);
 		bio_endio(bi, 0);
 	}
 }
@@ -4758,8 +4764,11 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		handled++;
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0)
+	if (remaining == 0) {
+		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+					 raid_bio, 0);
 		bio_endio(raid_bio, 0);
+	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;
diff --git a/fs/bio.c b/fs/bio.c
index bb5768f59b3..b96fc6ce485 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1428,8 +1428,6 @@ void bio_endio(struct bio *bio, int error)
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
-	trace_block_bio_complete(bio, error);
-
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 0ea61e07a91..7c2e030e72f 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -12,7 +12,6 @@
 
 struct blk_trace {
 	int trace_state;
-	bool rq_based;
 	struct rchan *rchan;
 	unsigned long __percpu *sequence;
 	unsigned char __percpu *msg_data;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 9961726523d..9c1467357b0 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -257,6 +257,7 @@ TRACE_EVENT(block_bio_bounce,
 
 /**
  * block_bio_complete - completed all work on the block operation
+ * @q: queue holding the block operation
  * @bio: block operation completed
  * @error: io error value
  *
@@ -265,9 +266,9 @@ TRACE_EVENT(block_bio_bounce,
  */
 TRACE_EVENT(block_bio_complete,
 
-	TP_PROTO(struct bio *bio, int error),
+	TP_PROTO(struct request_queue *q, struct bio *bio, int error),
 
-	TP_ARGS(bio, error),
+	TP_ARGS(q, bio, error),
 
 	TP_STRUCT__entry(
 		__field( dev_t,		dev		)
@@ -278,8 +279,7 @@ TRACE_EVENT(block_bio_complete,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= bio->bi_bdev ?
-					  bio->bi_bdev->bd_dev : 0;
+		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		__entry->error		= error;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272ee..5a0f781cd72 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
 				      struct request_queue *q,
 				      struct request *rq)
 {
-	struct blk_trace *bt = q->blk_trace;
-
-	/* if control ever passes through here, it's a request based driver */
-	if (unlikely(bt && !bt->rq_based))
-		bt->rq_based = true;
-
 	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
 }
 
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
 	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
-static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
+static void blk_add_trace_bio_complete(void *ignore,
+				       struct request_queue *q, struct bio *bio,
+				       int error)
 {
-	struct request_queue *q;
-	struct blk_trace *bt;
-
-	if (!bio->bi_bdev)
-		return;
-
-	q = bdev_get_queue(bio->bi_bdev);
-	bt = q->blk_trace;
-
-	/*
-	 * Request based drivers will generate both rq and bio completions.
-	 * Ignore bio ones.
-	 */
-	if (likely(!bt) || bt->rq_based)
-		return;
-
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
-- 
cgit v1.2.3-70-g09d2