From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Mar 2013 12:22:34 -0700 Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s try_to_wake_up_local() should only be invoked to wake up another task in the same runqueue and BUG_ON()s are used to enforce the rule. Missing try_to_wake_up_local() can stall workqueue execution but such stalls are likely to be finite either by another work item being queued or the one blocked getting unblocked. There's no reason to trigger BUG while holding rq lock crashing the whole system. Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s. Signed-off-by: Tejun Heo Acked-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7b03cd2d4c..306943f531a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { -- cgit v1.2.3-70-g09d2 From dd9c086d9f507d02d5ba4d7c5eef4bb9518088b8 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 18 Mar 2013 14:33:28 +0100 Subject: perf: Fix ring_buffer perf_output_space() boundary calculation This patch fixes a flaw in perf_output_space(). In case the size of the space needed is bigger than the actual buffer size, there may be situations where the function would return true (i.e., there is space) when it should not. head > offset due to rounding of the masking logic. The problem can be tested by activating BTS on Intel processors. A BTS record can be as big as 16 pages. The following command fails: $ perf record -m 4 -c 1 -e branches:u my_test_program You will get a buffer corruption with this. Perf report won't be able to parse the perf.data. The fix is to first check that the requested space is smaller than the buffer size. If so, then the masking logic will work fine. If not, then there is no chance the record can be saved and it will be gracefully handled by upper code layers. [ In v2, we also make the logic for the writable more explicit by renaming it to rb->overwrite because it tells whether or not the buffer can overwrite its tail (suggested by PeterZ). ] Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: peterz@infradead.org Cc: jolsa@redhat.com Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/20130318133327.GA3056@quad Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 2 +- kernel/events/ring_buffer.c | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8..eb675c4d59d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer { int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ + int overwrite; /* can overwrite itself */ atomic_t poll; /* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff397..97fddb09762 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@ static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, unsigned long offset, unsigned long head) { - unsigned long mask; + unsigned long sz = perf_data_size(rb); + unsigned long mask = sz - 1; - if (!rb->writable) + /* + * check if user-writable + * overwrite : over-write its own tail + * !overwrite: buffer possibly drops events. + */ + if (rb->overwrite) return true; - mask = perf_data_size(rb) - 1; + /* + * verify that payload is not bigger than buffer + * otherwise masking logic may fail to detect + * the "not enough space" condition + */ + if ((head - offset) > sz) + return false; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) - rb->writable = 1; + rb->overwrite = 0; + else + rb->overwrite = 1; atomic_set(&rb->refcount, 1); -- cgit v1.2.3-70-g09d2 From 84cc8fd2fe65866e49d70b38b3fdf7219dd92fe0 Mon Sep 17 00:00:00 2001 From: Michael Bohan Date: Tue, 19 Mar 2013 19:19:25 -0700 Subject: hrtimer: Don't reinitialize a cpu_base lock on CPU_UP The current code makes the assumption that a cpu_base lock won't be held if the CPU corresponding to that cpu_base is offline, which isn't always true. If a hrtimer is not queued, then it will not be migrated by migrate_hrtimers() when a CPU is offlined. Therefore, the hrtimer's cpu_base may still point to a CPU which has subsequently gone offline if the timer wasn't enqueued at the time the CPU went down. Normally this wouldn't be a problem, but a cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is performing a hrtimer operation on a stale hrtimer, then the lock will be reinitialized under its feet, and a SPIN_BUG() like the following will be observed: <0>[ 28.082085] BUG: spinlock already unlocked on CPU#0, swapper/0/0 <0>[ 28.087078] lock: 0xc4780b40, value 0x0 .magic: dead4ead, .owner: /-1, .owner_cpu: -1 <4>[ 42.451150] [] (unwind_backtrace+0x0/0x120) from [] (do_raw_spin_unlock+0x44/0xdc) <4>[ 42.460430] [] (do_raw_spin_unlock+0x44/0xdc) from [] (_raw_spin_unlock+0x8/0x30) <4>[ 42.469632] [] (_raw_spin_unlock+0x8/0x30) from [] (__hrtimer_start_range_ns+0x1e4/0x4f8) <4>[ 42.479521] [] (__hrtimer_start_range_ns+0x1e4/0x4f8) from [] (hrtimer_start+0x20/0x28) <4>[ 42.489247] [] (hrtimer_start+0x20/0x28) from [] (rcu_idle_enter_common+0x1ac/0x320) <4>[ 42.498709] [] (rcu_idle_enter_common+0x1ac/0x320) from [] (rcu_idle_enter+0xa0/0xb8) <4>[ 42.508259] [] (rcu_idle_enter+0xa0/0xb8) from [] (cpu_idle+0x24/0xf0) <4>[ 42.516503] [] (cpu_idle+0x24/0xf0) from [] (rest_init+0x88/0xa0) <4>[ 42.524319] [] (rest_init+0x88/0xa0) from [] (start_kernel+0x3d0/0x434) As an example, this particular crash occurred when hrtimer_start() was executed on CPU #0. The code locked the hrtimer's current cpu_base corresponding to CPU #1. CPU #0 then tried to switch the hrtimer's cpu_base to an optimal CPU which was online. In this case, it selected the cpu_base corresponding to CPU #3. Before it could proceed, CPU #1 came online and reinitialized the spinlock corresponding to its cpu_base. Thus now CPU #0 held a lock which was reinitialized. When CPU #0 finally ended up unlocking the old cpu_base corresponding to CPU #1 so that it could switch to CPU #3, we hit this SPIN_BUG() above while in switch_hrtimer_base(). CPU #0 CPU #1 ---- ---- ... hrtimer_start() lock_hrtimer_base(base #1) ... init_hrtimers_cpu() switch_hrtimer_base() ... ... raw_spin_lock_init(&cpu_base->lock) raw_spin_unlock(&cpu_base->lock) ... Solve this by statically initializing the lock. Signed-off-by: Michael Bohan Link: http://lkml.kernel.org/r/1363745965-23475-1-git-send-email-mbohan@codeaurora.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3fe..14be27feda4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -63,6 +63,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { @@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); -- cgit v1.2.3-70-g09d2 From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Apr 2013 10:10:27 +0200 Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems The sched_clock_remote() implementation has the following inatomicity problem on 32bit systems when accessing the remote scd->clock, which is a 64bit value. CPU0 CPU1 sched_clock_local() sched_clock_remote(CPU0) ... remote_clock = scd[CPU0]->clock read_low32bit(scd[CPU0]->clock) cmpxchg64(scd->clock,...) read_high32bit(scd[CPU0]->clock) While the update of scd->clock is using an atomic64 mechanism, the readout on the remote cpu is not, which can cause completely bogus readouts. It is a quite rare problem, because it requires the update to hit the narrow race window between the low/high readout and the update must go across the 32bit boundary. The resulting misbehaviour is, that CPU1 will see the sched_clock on CPU1 ~4 seconds ahead of it's own and update CPU1s sched_clock value to this bogus timestamp. This stays that way due to the clamping implementation for about 4 seconds until the synchronization with CLOCK_MONOTONIC undoes the problem. The issue is hard to observe, because it might only result in a less accurate SCHED_OTHER timeslicing behaviour. To create observable damage on realtime scheduling classes, it is necessary that the bogus update of CPU1 sched_clock happens in the context of an realtime thread, which then gets charged 4 seconds of RT runtime, which results in the RT throttler mechanism to trigger and prevent scheduling of RT tasks for a little less than 4 seconds. So this is quite unlikely as well. The issue was quite hard to decode as the reproduction time is between 2 days and 3 weeks and intrusive tracing makes it less likely, but the following trace recorded with trace_clock=global, which uses sched_clock_local(), gave the final hint: -0 0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80 -0 0d..30 400269.477151: hrtimer_start: hrtimer=0xf7061e80 ... irq/20-S-587 1d..32 400273.772118: sched_wakeup: comm= ... target_cpu=0 -0 0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80 What happens is that CPU0 goes idle and invokes sched_clock_idle_sleep_event() which invokes sched_clock_local() and CPU1 runs a remote wakeup for CPU0 at the same time, which invokes sched_remote_clock(). The time jump gets propagated to CPU0 via sched_remote_clock() and stays stale on both cores for ~4 seconds. There are only two other possibilities, which could cause a stale sched clock: 1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic wrong value. 2) sched_clock() which reads the TSC returns a sporadic wrong value. #1 can be excluded because sched_clock would continue to increase for one jiffy and then go stale. #2 can be excluded because it would not make the clock jump forward. It would just result in a stale sched_clock for one jiffy. After quite some brain twisting and finding the same pattern on other traces, sched_clock_remote() remained the only place which could cause such a problem and as explained above it's indeed racy on 32bit systems. So while on 64bit systems the readout is atomic, we need to verify the remote readout on 32bit machines. We need to protect the local->clock readout in sched_clock_remote() on 32bit as well because an NMI could hit between the low and the high readout, call sched_clock_local() and modify local->clock. Thanks to Siegfried Wulsch for bearing with my debug requests and going through the tedious tasks of running a bunch of reproducer systems to generate the debug information which let me decode the issue. Reported-by: Siegfried Wulsch Acked-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/sched/clock.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492d..c3ae1446461 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks -- cgit v1.2.3-70-g09d2 From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001 From: libin Date: Mon, 8 Apr 2013 14:39:12 +0800 Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on sysctl") was an incomplete bug fix. This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1] avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX on sysctl. Signed-off-by: Libin Cc: Cc: Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 306943f531a..fa077929e31 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, -- cgit v1.2.3-70-g09d2 From c97847d2f0eb77c806e650e04d9bbcf79fa05730 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Apr 2013 11:48:27 +0800 Subject: perf: Fix strncpy() use, always make sure it's NUL terminated For NUL terminated string, always make sure that there's '\0' at the end. In our case we need a return value, so still use strncpy() and fix up the tail explicitly. (strlcpy() returns the size, not the pointer) Signed-off-by: Chen Gang Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/51623E0B.7070101@asianux.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 59412d037ee..7f0d67ea3f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4737,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); + sizeof(tmp) - 1); + tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } -- cgit v1.2.3-70-g09d2 From 67012ab1d2ce871afea4ee55408f233f97d09d07 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Apr 2013 12:06:44 +0800 Subject: perf: Fix strncpy() use, use strlcpy() instead of strncpy() For NUL terminated string we always need to set '\0' at the end. Signed-off-by: Chen Gang Cc: rostedt@goodmis.org Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/51624254.30301@asianux.com Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade5698..3f5046a4d81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ static char *default_bootup_tracer; static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; @@ -162,7 +162,7 @@ static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { - strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); trace_boot_options = trace_boot_options_buf; return 0; } -- cgit v1.2.3-70-g09d2 From 75761cc15877c155b3849b4e0e0cb3f897faf471 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Apr 2013 12:12:39 +0800 Subject: ftrace: Fix strncpy() use, use strlcpy() instead of strncpy() For NUL terminated string we always need to set '\0' at the end. Signed-off-by: Chen Gang Cc: rostedt@goodmis.org Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/516243B7.9020405@asianux.com Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf0..db143742704 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3441,14 +3441,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); -- cgit v1.2.3-70-g09d2 From e614b3332a4f3f264a26da28e5a1f4cc3aea3974 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 4 Apr 2013 10:57:48 +0200 Subject: sched/cputime: Fix accounting on multi-threaded processes Recent commit 6fac4829 ("cputime: Use accessors to read task cputime stats") introduced a bug, where we account many times the cputime of the first thread, instead of cputimes of all the different threads. Signed-off-by: Stanislaw Gruszka Acked-by: Frederic Weisbecker Cc: Oleg Nesterov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130404085740.GA2495@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f..e93cca92f38 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - task_cputime(tsk, &utime, &stime); + task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); -- cgit v1.2.3-70-g09d2 From 2930e04d00e113ae24bb2b7c2b58de7b648a62c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 26 Mar 2013 17:33:00 -0400 Subject: tracing: Fix race with update_max_tr_single and changing tracers The commit 34600f0e9 "tracing: Fix race with max_tr and changing tracers" fixed the updating of the main buffers with the race of changing tracers, but left out the fix to the updating of just a per cpu buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade5698..7ba7fc76f9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); -- cgit v1.2.3-70-g09d2 From 5000c418840b309251c5887f0b56503aae30f84c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 26 Mar 2013 17:53:03 +0100 Subject: ftrace: Consistently restore trace function on sysctl enabling If we reenable ftrace via syctl, we currently set ftrace_trace_function based on the previous simplistic algorithm. This is inconsistent with what update_ftrace_function does. So better call that helper instead. Link: http://lkml.kernel.org/r/5151D26F.1070702@siemens.com Cc: stable@vger.kernel.org Signed-off-by: Jan Kiszka Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf0..cc4943c7ce6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) { - if (ftrace_ops_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_ops_list->func; - else - ftrace_trace_function = ftrace_ops_list_func; - } + if (ftrace_ops_list != &ftrace_list_end) + update_ftrace_function(); } else { /* stopping ftrace calls (just send to ftrace_stub) */ -- cgit v1.2.3-70-g09d2 From 395b97a3aeff0b8d949ee3e67bf8c11c5ffd6861 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Mar 2013 09:31:28 -0400 Subject: ftrace: Do not call stub functions in control loop The function tracing control loop used by perf spits out a warning if the called function is not a control function. This is because the control function references a per cpu allocated data structure on struct ftrace_ops that is not allocated for other types of functions. commit 0a016409e42 "ftrace: Optimize the function tracer list loop" Had an optimization done to all function tracing loops to optimize for a single registered ops. Unfortunately, this allows for a slight race when tracing starts or ends, where the stub function might be called after the current registered ops is removed. In this case we get the following dump: root# perf stat -e ftrace:function sleep 1 [ 74.339105] WARNING: at include/linux/ftrace.h:209 ftrace_ops_control_func+0xde/0xf0() [ 74.349522] Hardware name: PRIMERGY RX200 S6 [ 74.357149] Modules linked in: sg igb iTCO_wdt ptp pps_core iTCO_vendor_support i7core_edac dca lpc_ich i2c_i801 coretemp edac_core crc32c_intel mfd_core ghash_clmulni_intel dm_multipath acpi_power_meter pcspk r microcode vhost_net tun macvtap macvlan nfsd kvm_intel kvm auth_rpcgss nfs_acl lockd sunrpc uinput xfs libcrc32c sd_mod crc_t10dif sr_mod cdrom mgag200 i2c_algo_bit drm_kms_helper ttm qla2xxx mptsas ahci drm li bahci scsi_transport_sas mptscsih libata scsi_transport_fc i2c_core mptbase scsi_tgt dm_mirror dm_region_hash dm_log dm_mod [ 74.446233] Pid: 1377, comm: perf Tainted: G W 3.9.0-rc1 #1 [ 74.453458] Call Trace: [ 74.456233] [] warn_slowpath_common+0x7f/0xc0 [ 74.462997] [] ? rcu_note_context_switch+0xa0/0xa0 [ 74.470272] [] ? __unregister_ftrace_function+0xa2/0x1a0 [ 74.478117] [] warn_slowpath_null+0x1a/0x20 [ 74.484681] [] ftrace_ops_control_func+0xde/0xf0 [ 74.491760] [] ftrace_call+0x5/0x2f [ 74.497511] [] ? ftrace_call+0x5/0x2f [ 74.503486] [] ? ftrace_call+0x5/0x2f [ 74.509500] [] ? synchronize_sched+0x5/0x50 [ 74.516088] [] ? _cond_resched+0x5/0x40 [ 74.522268] [] ? synchronize_sched+0x5/0x50 [ 74.528837] [] ? __unregister_ftrace_function+0xa2/0x1a0 [ 74.536696] [] ? _cond_resched+0x5/0x40 [ 74.542878] [] ? mutex_lock+0x1d/0x50 [ 74.548869] [] unregister_ftrace_function+0x27/0x50 [ 74.556243] [] perf_ftrace_event_register+0x9f/0x140 [ 74.563709] [] ? _cond_resched+0x5/0x40 [ 74.569887] [] ? mutex_lock+0x1d/0x50 [ 74.575898] [] perf_trace_destroy+0x2e/0x50 [ 74.582505] [] tp_perf_event_destroy+0x9/0x10 [ 74.589298] [] free_event+0x70/0x1a0 [ 74.595208] [] perf_event_release_kernel+0x69/0xa0 [ 74.602460] [] ? _cond_resched+0x5/0x40 [ 74.608667] [] put_event+0x90/0xc0 [ 74.614373] [] perf_release+0x10/0x20 [ 74.620367] [] __fput+0xf4/0x280 [ 74.625894] [] ____fput+0xe/0x10 [ 74.631387] [] task_work_run+0xa7/0xe0 [ 74.637452] [] do_notify_resume+0x71/0xb0 [ 74.643843] [] int_signal+0x12/0x17 To fix this a new ftrace_ops flag is added that denotes the ftrace_list_end ftrace_ops stub as just that, a stub. This flag is now checked in the control loop and the function is not called if the flag is set. Thanks to Jovi for not just reporting the bug, but also pointing out where the bug was in the code. Link: http://lkml.kernel.org/r/514A8855.7090402@redhat.com Link: http://lkml.kernel.org/r/1364377499-1900-15-git-send-email-jovi.zhangwei@huawei.com Tested-by: WANG Chao Reported-by: WANG Chao Reported-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e5ca8ef50e9..167abf90780 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -89,6 +89,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * that the call back has its own recursion protection. If it does * not set this, then the ftrace infrastructure will add recursion * protection for the caller. + * STUB - The ftrace_ops is just a place holder. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -98,6 +99,7 @@ enum { FTRACE_OPS_FL_SAVE_REGS = 1 << 4, FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, + FTRACE_OPS_FL_STUB = 1 << 7, }; struct ftrace_ops { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc4943c7ce6..7e897106b7e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,7 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -4131,7 +4131,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); do_for_each_ftrace_op(op, ftrace_control_list) { - if (!ftrace_function_local_disabled(op) && + if (!(op->flags & FTRACE_OPS_FL_STUB) && + !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); -- cgit v1.2.3-70-g09d2 From 6f389a8f1dd22a24f3d9afc2812b30d639e94625 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 7 Apr 2013 02:14:14 +0000 Subject: PM / reboot: call syscore_shutdown() after disable_nonboot_cpus() As commit 40dc166c (PM / Core: Introduce struct syscore_ops for core subsystems PM) say, syscore_ops operations should be carried with one CPU on-line and interrupts disabled. However, after commit f96972f2d (kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()), syscore_shutdown() is called before disable_nonboot_cpus(), so break the rules. We have a MIPS machine with a 8259A PIC, and there is an external timer (HPET) linked at 8259A. Since 8259A has been shutdown too early (by syscore_shutdown()), disable_nonboot_cpus() runs without timer interrupt, so it hangs and reboot fails. This patch call syscore_shutdown() a little later (after disable_nonboot_cpus()) to avoid reboot failure, this is the same way as poweroff does. For consistency, add disable_nonboot_cpus() to kernel_halt(). Signed-off-by: Huacai Chen Cc: Signed-off-by: Rafael J. Wysocki --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 39c9c4a2949..0da73cf73e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; usermodehelper_disable(); device_shutdown(); - syscore_shutdown(); } /** @@ -370,6 +369,7 @@ void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); disable_nonboot_cpus(); + syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); + disable_nonboot_cpus(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); -- cgit v1.2.3-70-g09d2 From 83e03b3fe4daffdebbb42151d5410d730ae50bd1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 1 Apr 2013 21:46:23 +0900 Subject: tracing: Fix double free when function profile init failed On the failure path, stat->start and stat->pages will refer same page. So it'll attempt to free the same page again and get kernel panic. Link: http://lkml.kernel.org/r/1364820385-32027-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e897106b7e..926ebfb7493 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) free_page(tmp); } - free_page((unsigned long)stat->pages); stat->pages = NULL; stat->start = NULL; -- cgit v1.2.3-70-g09d2 From c481420248c6730246d2a1b1773d5d7007ae0835 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 12 Apr 2013 11:05:54 +0800 Subject: perf: Fix error return code Fix to return -ENOMEM in the allocation error case instead of 0 (if pmu_bus_running == 1), as done elsewhere in this function. Signed-off-by: Wei Yongjun Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/CAPgLHd8j_fWcgqe%3DKLWjpBj%2B%3Do0Pw6Z-SEq%3DNTPU08c2w1tngQ@mail.gmail.com [ Tweaked the error code setting placement and the changelog. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f0d67ea3f4..7e0962ed7f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5987,6 +5987,7 @@ skip_type: if (pmu->pmu_cpu_context) goto got_cpu_context; + ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; -- cgit v1.2.3-70-g09d2 From f2530dc71cf0822f90bb63ea4600caaef33a66bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Apr 2013 09:33:34 +0200 Subject: kthread: Prevent unpark race which puts threads on the wrong cpu The smpboot threads rely on the park/unpark mechanism which binds per cpu threads on a particular core. Though the functionality is racy: CPU0 CPU1 CPU2 unpark(T) wake_up_process(T) clear(SHOULD_PARK) T runs leave parkme() due to !SHOULD_PARK bind_to(CPU2) BUG_ON(wrong CPU) We cannot let the tasks move themself to the target CPU as one of those tasks is actually the migration thread itself, which requires that it starts running on the target cpu right away. The solution to this problem is to prevent wakeups in park mode which are not from unpark(). That way we can guarantee that the association of the task to the target cpu is working correctly. Add a new task state (TASK_PARKED) which prevents other wakeups and use this state explicitly for the unpark wakeup. Peter noticed: Also, since the task state is visible to userspace and all the parked tasks are still in the PID space, its a good hint in ps and friends that these tasks aren't really there for the moment. The migration thread has another related issue. CPU0 CPU1 Bring up CPU2 create_thread(T) park(T) wait_for_completion() parkme() complete() sched_set_stop_task() schedule(TASK_PARKED) The sched_set_stop_task() call is issued while the task is on the runqueue of CPU1 and that confuses the hell out of the stop_task class on that cpu. So we need the same synchronizaion before sched_set_stop_task(). Reported-by: Dave Jones Reported-and-tested-by: Dave Hansen Reported-and-tested-by: Borislav Petkov Acked-by: Peter Ziljstra Cc: Srivatsa S. Bhat Cc: dhillf@gmail.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos Signed-off-by: Thomas Gleixner --- fs/proc/array.c | 1 + include/linux/sched.h | 5 +++-- include/trace/events/sched.h | 2 +- kernel/kthread.c | 52 ++++++++++++++++++++++++-------------------- kernel/smpboot.c | 14 ++++++++++-- 5 files changed, 45 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index f7ed9ee46eb..cbd0f1b324b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,7 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ + "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbf..e692a022527 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -163,9 +163,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 -#define TASK_STATE_MAX 512 +#define TASK_PARKED 512 +#define TASK_STATE_MAX 1024 -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5a8671e8a67..e5586caff67 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -147,7 +147,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "W" }) : "R", + { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9ba..9eb7fed0bba 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) complete(&self->parked); schedule(); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); } clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); @@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, state)) { + WARN_ON(1); + return; + } /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); p->flags |= PF_THREAD_BOUND; @@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - __kthread_bind(p, cpu); + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); @@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) return NULL; } +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +{ + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + wake_up_state(k, TASK_PARKED); + } +} + /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). @@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = task_get_live_kthread(k); - if (kthread) { - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. - */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu); - wake_up_process(k); - } - } + if (kthread) + __kthread_unpark(k, kthread); put_task_struct(k); } @@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 8eaed9aa9cf..02fc5c93367 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) } get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; - if (ht->create) - ht->create(cpu); + if (ht->create) { + /* + * Make sure that the task has actually scheduled out + * into park position, before calling the create + * callback. At least the migration thread callback + * requires that the task is off the runqueue. + */ + if (!wait_task_inactive(tsk, TASK_PARKED)) + WARN_ON(1); + else + ht->create(cpu); + } return 0; } -- cgit v1.2.3-70-g09d2 From 6a76f8c0ab19f215af2a3442870eeb5f0e81998d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Apr 2013 15:55:01 +0900 Subject: tracing: Fix possible NULL pointer dereferences Currently set_ftrace_pid and set_graph_function files use seq_lseek for their fops. However seq_open() is called only for FMODE_READ in the fops->open() so that if an user tries to seek one of those file when she open it for writing, it sees NULL seq_file and then panic. It can be easily reproduced with following command: $ cd /sys/kernel/debug/tracing $ echo 1234 | sudo tee -a set_ftrace_pid In this example, GNU coreutils' tee opens the file with fopen(, "a") and then the fopen() internally calls lseek(). Link: http://lkml.kernel.org/r/1365663302-2170-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 10 +++++----- kernel/trace/trace_stack.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 167abf90780..eb3ce327b97 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -396,7 +396,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence); +loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 926ebfb7493..affc35d829c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2697,7 +2697,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; @@ -3570,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3578,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3783,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, + .llseek = ftrace_filter_lseek, .release = ftrace_graph_release, - .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_pid_release, }; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc70..83a8b5b7bd3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; -- cgit v1.2.3-70-g09d2 From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Apr 2013 16:40:13 -0400 Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the ftrace_pid_fops is defined when DYNAMIC_FTRACE is not. Cc: stable@vger.kernel.org Cc: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 ++- kernel/trace/ftrace.c | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index eb3ce327b97..52da2a25079 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -396,7 +396,6 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init @@ -569,6 +568,8 @@ static inline int ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } #endif /* CONFIG_DYNAMIC_FTRACE */ +loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); + /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index affc35d829c..2461ede45a8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1052,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, whence); + else + file->f_pos = ret = 1; + + return ret; +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -2612,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -2696,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; -- cgit v1.2.3-70-g09d2 From 935d8aabd4331f47a89c3e1daa5779d23cf244ee Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Apr 2013 10:06:31 -0700 Subject: Add file_ns_capable() helper function for open-time capability checking Nothing is using it yet, but this will allow us to delay the open-time checks to use time, without breaking the normal UNIX permission semantics where permissions are determined by the opener (and the file descriptor can then be passed to a different process, or the process can drop capabilities). Signed-off-by: Linus Torvalds --- include/linux/capability.h | 2 ++ kernel/capability.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 98503b79236..d9a4f7f40f3 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -35,6 +35,7 @@ struct cpu_vfs_cap_data { #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) +struct file; struct inode; struct dentry; struct user_namespace; @@ -211,6 +212,7 @@ extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool nsown_capable(int cap); extern bool inode_capable(const struct inode *inode, int cap); +extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/kernel/capability.c b/kernel/capability.c index 493d9725948..f6c2ce5701e 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -392,6 +392,30 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +/** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file: The file we want to check + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ + if (WARN_ON_ONCE(!cap_valid(cap))) + return false; + + if (security_capable(file->f_cred, ns, cap) == 0) + return true; + + return false; +} +EXPORT_SYMBOL(file_ns_capable); + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3-70-g09d2 From 6708075f104c3c9b04b23336bb0366ca30c3931b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 14 Apr 2013 13:47:02 -0700 Subject: userns: Don't let unprivileged users trick privileged users into setting the id_map When we require privilege for setting /proc//uid_map or /proc//gid_map no longer allow an unprivileged user to open the file and pass it to a privileged program to write to the file. Instead when privilege is required require both the opener and the writer to have the necessary capabilities. I have tested this code and verified that setting /proc//uid_map fails when an unprivileged user opens the file and a privielged user attempts to set the mapping, that unprivileged users can still map their own id, and that a privileged users can still setup an arbitrary mapping. Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Andy Lutomirski --- kernel/user_namespace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a54f26f82eb..e2d4ace4481 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -25,7 +25,8 @@ static struct kmem_cache *user_ns_cachep __read_mostly; -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) @@ -700,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; /* Map the lower ids from the parent user namespace to the @@ -787,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { /* Allow mapping to your own filesystem ids */ @@ -811,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. + * And the opener of the id file also had the approprpiate capability. */ - if (ns_capable(ns->parent, cap_setid)) + if (ns_capable(ns->parent, cap_setid) && + file_ns_capable(file, ns->parent, cap_setid)) return true; return false; -- cgit v1.2.3-70-g09d2 From e3211c120a85b792978bcb4be7b2886df18d27f0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2013 16:28:19 -0700 Subject: userns: Check uid_map's opener's fsuid, not the current fsuid Signed-off-by: Andy Lutomirski --- kernel/user_namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e2d4ace4481..5c16f3aa757 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -797,12 +797,12 @@ static bool new_idmap_permitted(const struct file *file, u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, current_fsuid())) + if (uid_eq(uid, file->f_cred->fsuid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, current_fsgid())) + if (gid_eq(gid, file->f_cred->fsgid)) return true; } } -- cgit v1.2.3-70-g09d2 From 41c21e351e79004dbb4efa4bc14a53a7e0af38c5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2013 11:44:04 -0700 Subject: userns: Changing any namespace id mappings should require privileges Changing uid/gid/projid mappings doesn't change your id within the namespace; it reconfigures the namespace. Unprivileged programs should *not* be able to write these files. (We're also checking the privileges on the wrong task.) Given the write-once nature of these files and the other security checks, this is likely impossible to usefully exploit. Signed-off-by: Andy Lutomirski --- kernel/user_namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5c16f3aa757..e134d8f365d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -613,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (map->nr_extents != 0) goto out; - /* Require the appropriate privilege CAP_SETUID or CAP_SETGID - * over the user namespace in order to set the id mapping. + /* + * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; /* Get a buffer */ -- cgit v1.2.3-70-g09d2 From 8176cced706b5e5d15887584150764894e94e02f Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Sat, 13 Apr 2013 22:49:14 +0300 Subject: perf: Treat attr.config as u64 in perf_swevent_init() Trinity discovered that we fail to check all 64 bits of attr.config passed by user space, resulting to out-of-bounds access of the perf_swevent_enabled array in sw_perf_event_destroy(). Introduced in commit b0a873ebb ("perf: Register PMU implementations"). Signed-off-by: Tommi Rantala Cc: Peter Zijlstra Cc: davej@redhat.com Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1365882554-30259-1-git-send-email-tt.rantala@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7e0962ed7f8..4d3124b3927 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5331,7 +5331,7 @@ static void sw_perf_event_destroy(struct perf_event *event) static int perf_swevent_init(struct perf_event *event) { - int event_id = event->attr.config; + u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; -- cgit v1.2.3-70-g09d2 From 55a20ee7804ab64ac90bcdd4e2868a42829e2784 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:47 -0700 Subject: x86, kdump: Retore crashkernel= to allocate under 896M Vivek found old kexec-tools does not work new kernel anymore. So change back crashkernel= back to old behavoir, and add crashkernel_high= to let user decide if buffer could be above 4G, and also new kexec-tools will be needed. -v2: let crashkernel=X override crashkernel_high= update description about _high will be ignored by crashkernel=X -v3: update description about kernel-parameters.txt according to Vivek. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-4-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 13 +++++++++++-- arch/x86/kernel/setup.c | 24 +++++++++++++++++++----- include/linux/kexec.h | 2 ++ kernel/kexec.c | 9 +++++++++ 4 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cff672da248..709eb3edc6b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -603,9 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. + crashkernel_high=size[KMG] + [KNL, x86_64] range could be above 4G. Allow kernel + to allocate physical memory region from top, so could + be above 4G if system have more than 4G ram installed. + Otherwise memory region will be allocated below 4G, if + available. + It will be ignored if crashkernel=X is specified. crashkernel_low=size[KMG] - [KNL, x86_64] range under 4G. When crashkernel= is - passed, kernel allocate physical memory region + [KNL, x86_64] range under 4G. When crashkernel_high= is + passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory. Kernel would @@ -613,6 +620,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This one let user to specify own low range under 4G for second kernel instead. 0: to disable low allocation. + It will be ignored when crashkernel_high=X is not used + or memory reserved is below 4G. cs89x0_dma= [HW,NET] Format: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 12349202cae..a85a144f205 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -507,11 +507,14 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX MAXMEM +# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) +# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM #endif static void __init reserve_crashkernel_low(void) @@ -525,6 +528,7 @@ static void __init reserve_crashkernel_low(void) int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + /* crashkernel_low=YM */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { @@ -569,14 +573,22 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; + bool high = false; int ret; total_mem = memblock_phys_mem_size(); + /* crashkernel=XM */ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; + if (ret != 0 || crash_size <= 0) { + /* crashkernel_high=XM */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } /* 0 means: find the address automatically */ if (crash_base <= 0) { @@ -584,7 +596,9 @@ static void __init reserve_crashkernel(void) * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - CRASH_KERNEL_ADDR_MAX, crash_size, alignment); + high ? CRASH_KERNEL_ADDR_HIGH_MAX : + CRASH_KERNEL_ADDR_LOW_MAX, + crash_size, alignment); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d2e6927bbaa..d78d28a733b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -200,6 +200,8 @@ extern size_t vmcoreinfo_max_size; int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b..1b2f73f5f9b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1422,6 +1422,15 @@ int __init parse_crashkernel(char *cmdline, "crashkernel="); } +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_high="); +} + int __init parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, -- cgit v1.2.3-70-g09d2 From adbc742bf78695bb98c79d18c558b61571748b99 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:48 -0700 Subject: x86, kdump: Change crashkernel_high/low= to crashkernel=,high/low Per hpa, use crashkernel=X,high crashkernel=Y,low instead of crashkernel_hign=X crashkernel_low=Y. As that could be extensible. -v2: according to Vivek, change delimiter to ; -v3: let hign and low only handle simple form and it conforms to description in kernel-parameters.txt still keep crashkernel=X override any crashkernel=X,high crashkernel=Y,low -v4: update get_last_crashkernel returning and add more strict checking in parse_crashkernel_simple() found by HATAYAMA. -v5: Change delimiter back to , according to HPA. also separate parse_suffix from parse_simper according to vivek. so we can avoid @pos in that path. -v6: Tight the checking about crashkernel=X,highblahblah,high found by HTYAYAMA. Cc: HATAYAMA Daisuke Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-5-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 10 ++-- arch/x86/kernel/setup.c | 6 +- kernel/kexec.c | 109 +++++++++++++++++++++++++++++++----- 3 files changed, 104 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 709eb3edc6b..a1ac1f1d6d3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -603,16 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. - crashkernel_high=size[KMG] + crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel to allocate physical memory region from top, so could be above 4G if system have more than 4G ram installed. Otherwise memory region will be allocated below 4G, if available. It will be ignored if crashkernel=X is specified. - crashkernel_low=size[KMG] - [KNL, x86_64] range under 4G. When crashkernel_high= is - passed, kernel could allocate physical memory region + crashkernel=size[KMG],low + [KNL, x86_64] range under 4G. When crashkernel=X,high + is passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory. Kernel would @@ -620,7 +620,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This one let user to specify own low range under 4G for second kernel instead. 0: to disable low allocation. - It will be ignored when crashkernel_high=X is not used + It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. cs89x0_dma= [HW,NET] diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a85a144f205..fae9134a2de 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -528,7 +528,7 @@ static void __init reserve_crashkernel_low(void) int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); - /* crashkernel_low=YM */ + /* crashkernel=Y,low */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { @@ -542,7 +542,7 @@ static void __init reserve_crashkernel_low(void) low_size = swiotlb_size_or_default() + (8UL<<20); auto_set = true; } else { - /* passed with crashkernel_low=0 ? */ + /* passed with crashkernel=0,low ? */ if (!low_size) return; } @@ -582,7 +582,7 @@ static void __init reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) { - /* crashkernel_high=XM */ + /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b2f73f5f9b..401fdb041f3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1368,35 +1368,114 @@ static int __init parse_crashkernel_simple(char *cmdline, return 0; } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - const char *name) + const char *name, + const char *suffix) { - char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; + char *ck_cmdline; BUG_ON(!crash_size || !crash_base); *crash_size = 0; *crash_base = 0; - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - ck_cmdline = p; - p = strstr(p+1, name); - } + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); if (!ck_cmdline) return -EINVAL; ck_cmdline += strlen(name); + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + crash_base, suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -1413,13 +1492,17 @@ static int __init __parse_crashkernel(char *cmdline, return 0; } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel="); + "crashkernel=", NULL); } int __init parse_crashkernel_high(char *cmdline, @@ -1428,7 +1511,7 @@ int __init parse_crashkernel_high(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_high="); + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); } int __init parse_crashkernel_low(char *cmdline, @@ -1437,7 +1520,7 @@ int __init parse_crashkernel_low(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_low="); + "crashkernel=", suffix_tbl[SUFFIX_LOW]); } static void update_vmcoreinfo_note(void) -- cgit v1.2.3-70-g09d2 From 157752d84f5df47e01577970f9c5f61a0b9f4546 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:46 -0700 Subject: kexec: use Crash kernel for Crash kernel low We can extend kexec-tools to support multiple "Crash kernel" in /proc/iomem instead. So we can use "Crash kernel" instead of "Crash kernel low" in /proc/iomem. Suggested-by: Vivek Goyal Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-3-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 401fdb041f3..ffd4e111fd6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -55,7 +55,7 @@ struct resource crashk_res = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; struct resource crashk_low_res = { - .name = "Crash kernel low", + .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM -- cgit v1.2.3-70-g09d2 From b9e146d8eb3b9ecae5086d373b50fa0c1f3e7f0f Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Wed, 17 Apr 2013 15:58:36 -0700 Subject: kernel/signal.c: stop info leak via the tkill and the tgkill syscalls This fixes a kernel memory contents leak via the tkill and tgkill syscalls for compat processes. This is visible in the siginfo_t->_sifields._rt.si_sigval.sival_ptr field when handling signals delivered from tkill. The place of the infoleak: int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { ... put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); ... } Signed-off-by: Emese Revfy Reviewed-by: PaX Team Signed-off-by: Kees Cook Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index dd72567767d..598dc06be42 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2948,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info; + struct siginfo info = {}; info.si_signo = sig; info.si_errno = 0; -- cgit v1.2.3-70-g09d2 From 5c51543b0ae45967cfa99ca16748dc72888cc265 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 18 Apr 2013 18:33:18 +0900 Subject: kprobes: Fix a double lock bug of kprobe_mutex Fix a double locking bug caused when debug.kprobe-optimization=0. While the proc_kprobes_optimization_handler locks kprobe_mutex, wait_for_kprobe_optimizer locks it again and that causes a double lock. To fix the bug, this introduces different mutex for protecting sysctl parameter and locks it in proc_kprobes_optimization_handler. Of course, since we need to lock kprobe_mutex when touching kprobes resources, that is done in *optimize_all_kprobes(). This bug was introduced by commit ad72b3bea744 ("kprobes: fix wait_for_kprobe_optimizer()") Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e35be53f661..3fed7f0cbcd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -794,16 +794,16 @@ out: } #ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already allowed, just return */ if (kprobes_allow_optimization) - return; + goto out; kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void) optimize_kprobe(p); } printk(KERN_INFO "Kprobes globally optimized\n"); +out: + mutex_unlock(&kprobe_mutex); } -/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already prohibited, just return */ - if (!kprobes_allow_optimization) + if (!kprobes_allow_optimization) { + mutex_unlock(&kprobe_mutex); return; + } kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + mutex_unlock(&kprobe_mutex); + /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); printk(KERN_INFO "Kprobes globally unoptimized\n"); } +static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, @@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, { int ret; - mutex_lock(&kprobe_mutex); + mutex_lock(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, optimize_all_kprobes(); else unoptimize_all_kprobes(); - mutex_unlock(&kprobe_mutex); + mutex_unlock(&kprobe_sysctl_mutex); return ret; } -- cgit v1.2.3-70-g09d2 From 0a82a8d132b26d438eb90b3ab35a7016e7227a1d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 18 Apr 2013 09:00:26 -0700 Subject: Revert "block: add missing block_bio_complete() tracepoint" This reverts commit 3a366e614d0837d9fc23f78cdb1a1186ebc3387f. Wanlong Gao reports that it causes a kernel panic on his machine several minutes after boot. Reverting it removes the panic. Jens says: "It's not quite clear why that is yet, so I think we should just revert the commit for 3.9 final (which I'm assuming is pretty close). The wifi is crap at the LSF hotel, so sending this email instead of queueing up a revert and pull request." Reported-by: Wanlong Gao Requested-by: Jens Axboe Cc: Tejun Heo Cc: Steven Rostedt Signed-off-by: Linus Torvalds --- block/blk-core.c | 1 + drivers/md/dm.c | 1 + drivers/md/raid5.c | 11 ++++++++++- fs/bio.c | 2 -- include/linux/blktrace_api.h | 1 - include/trace/events/block.h | 8 ++++---- kernel/trace/blktrace.c | 26 +++----------------------- 7 files changed, 19 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 074b758efc4..7c288358a74 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7e469260fe5..9a0bdad9ad8 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -611,6 +611,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 24909eb13fe..f4e87bfc756 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -3914,6 +3916,8 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4382,6 +4386,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } } @@ -4758,8 +4764,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; diff --git a/fs/bio.c b/fs/bio.c index bb5768f59b3..b96fc6ce485 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1428,8 +1428,6 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - trace_block_bio_complete(bio, error); - if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 0ea61e07a91..7c2e030e72f 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -12,7 +12,6 @@ struct blk_trace { int trace_state; - bool rq_based; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9961726523d..9c1467357b0 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -257,6 +257,7 @@ TRACE_EVENT(block_bio_bounce, /** * block_bio_complete - completed all work on the block operation + * @q: queue holding the block operation * @bio: block operation completed * @error: io error value * @@ -265,9 +266,9 @@ TRACE_EVENT(block_bio_bounce, */ TRACE_EVENT(block_bio_complete, - TP_PROTO(struct bio *bio, int error), + TP_PROTO(struct request_queue *q, struct bio *bio, int error), - TP_ARGS(bio, error), + TP_ARGS(q, bio, error), TP_STRUCT__entry( __field( dev_t, dev ) @@ -278,8 +279,7 @@ TRACE_EVENT(block_bio_complete, ), TP_fast_assign( - __entry->dev = bio->bi_bdev ? - bio->bi_bdev->bd_dev : 0; + __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; __entry->error = error; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272ee..5a0f781cd72 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { - struct blk_trace *bt = q->blk_trace; - - /* if control ever passes through here, it's a request based driver */ - if (unlikely(bt && !bt->rq_based)) - bt->rq_based = true; - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio, + int error) { - struct request_queue *q; - struct blk_trace *bt; - - if (!bio->bi_bdev) - return; - - q = bdev_get_queue(bio->bi_bdev); - bt = q->blk_trace; - - /* - * Request based drivers will generate both rq and bio completions. - * Ignore bio ones. - */ - if (likely(!bt) || bt->rq_based) - return; - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } -- cgit v1.2.3-70-g09d2