From 60e69eed85bb7b5198ef70643b5895c26ad76ef7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 7 Apr 2014 10:55:15 +0200 Subject: sched/numa: Fix task_numa_free() lockdep splat Sasha reported that lockdep claims that the following commit: made numa_group.lock interrupt unsafe: 156654f491dd ("sched/numa: Move task_numa_free() to __put_task_struct()") While I don't see how that could be, given the commit in question moved task_numa_free() from one irq enabled region to another, the below does make both gripes and lockups upon gripe with numa=fake=4 go away. Reported-by: Sasha Levin Fixes: 156654f491dd ("sched/numa: Move task_numa_free() to __put_task_struct()") Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: mgorman@suse.com Cc: akpm@linux-foundation.org Cc: Dave Jones Link: http://lkml.kernel.org/r/1396860915.5170.5.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ kernel/sched/sched.h | 9 +++++++++ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e9bd0b1fa9..4f14a656a72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1497,7 +1497,7 @@ static void task_numa_placement(struct task_struct *p) /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; - spin_lock(group_lock); + spin_lock_irq(group_lock); } /* Find the node with the highest number of faults */ @@ -1572,7 +1572,7 @@ static void task_numa_placement(struct task_struct *p) } } - spin_unlock(group_lock); + spin_unlock_irq(group_lock); } /* Preferred node as the node with the most faults */ @@ -1677,7 +1677,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - double_lock(&my_grp->lock, &grp->lock); + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; @@ -1691,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->nr_tasks++; spin_unlock(&my_grp->lock); - spin_unlock(&grp->lock); + spin_unlock_irq(&grp->lock); rcu_assign_pointer(p->numa_group, grp); @@ -1710,14 +1711,14 @@ void task_numa_free(struct task_struct *p) void *numa_faults = p->numa_faults_memory; if (grp) { - spin_lock(&grp->lock); + spin_lock_irq(&grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock(&grp->lock); + spin_unlock_irq(&grp->lock); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c9007f28d3a..456e492a3dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1385,6 +1385,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2) spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) { if (l1 > l2) -- cgit v1.2.3-70-g09d2 From a227960fe0cafcc229a8d6bb8b454a3a0b33719d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Apr 2014 16:15:59 +0200 Subject: locking/mutex: Fix debug_mutexes debug_mutex_unlock() would bail when !debug_locks and forgets to actually unlock. Reported-by: "Michael L. Semon" Reported-by: "Kirill A. Shutemov" Reported-by: Valdis Kletnieks Fixes: 6f008e72cd11 ("locking/mutex: Fix debug checks") Tested-by: Dave Jones Cc: Jason Low Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140410141559.GE13658@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index e1191c996c5..5cf6731b98e 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -71,18 +71,17 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { - if (unlikely(!debug_locks)) - return; + if (likely(debug_locks)) { + DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->magic != lock); + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); - - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + mutex_clear_owner(lock); + } /* * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug -- cgit v1.2.3-70-g09d2 From 2eac7648321f4a08aa4078504d7727af0af7173b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:02:59 +0200 Subject: seccomp: fix populating a0-a5 syscall args in 32-bit x86 BPF Linus reports that on 32-bit x86 Chromium throws the following seccomp resp. audit log messages: audit: type=1326 audit(1397359304.356:28108): auid=500 uid=500 gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023 pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=172 compat=0 ip=0xb2dd9852 code=0x30000 audit: type=1326 audit(1397359304.356:28109): auid=500 uid=500 gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023 pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=5 compat=0 ip=0xb2dd9852 code=0x50000 These audit messages are being triggered via audit_seccomp() through __secure_computing() in seccomp mode (BPF) filter with seccomp return codes 0x30000 (== SECCOMP_RET_TRAP) and 0x50000 (== SECCOMP_RET_ERRNO) during filter runtime. Moreover, Linus reports that x86_64 Chromium seems fine. The underlying issue that explains this is that the implementation of populate_seccomp_data() is wrong. Our seccomp data structure sd that is being shared with user ABI is: struct seccomp_data { int nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; }; Therefore, a simple cast to 'unsigned long *' for storing the value of the syscall argument via syscall_get_arguments() is just wrong as on 32-bit x86 (or any other 32bit arch), it would result in storing a0-a5 at wrong offsets in args[] member, and thus i) could leak stack memory to user space and ii) tampers with the logic of seccomp BPF programs that read out and check for syscall arguments: syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); Tested on 32-bit x86 with Google Chrome, unfortunately only via remote test machine through slow ssh X forwarding, but it fixes the issue on my side. So fix it up by storing args in type correct variables, gcc is clever and optimizes the copy away in other cases, e.g. x86_64. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Reported-and-bisected-by: Linus Torvalds Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Linus Torvalds Cc: Eric Paris Cc: James Morris Cc: Kees Cook Signed-off-by: David S. Miller --- kernel/seccomp.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8d046c0726..590c3792508 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -69,18 +69,17 @@ static void populate_seccomp_data(struct seccomp_data *sd) { struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); + unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); sd->arch = syscall_get_arch(); - - /* Unroll syscall_get_args to help gcc on arm. */ - syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); - syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); - syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); - syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); - syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); - syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); - + syscall_get_arguments(task, regs, 0, 6, args); + sd->args[0] = args[0]; + sd->args[1] = args[1]; + sd->args[2] = args[2]; + sd->args[3] = args[3]; + sd->args[4] = args[4]; + sd->args[5] = args[5]; sd->instruction_pointer = KSTK_EIP(task); } -- cgit v1.2.3-70-g09d2 From e79323bd87808fdfbc68ce6c5371bd224d9672ee Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 14 Apr 2014 16:58:55 -0400 Subject: user namespace: fix incorrect memory barriers smp_read_barrier_depends() can be used if there is data dependency between the readers - i.e. if the read operation after the barrier uses address that was obtained from the read operation before the barrier. In this file, there is only control dependency, no data dependecy, so the use of smp_read_barrier_depends() is incorrect. The code could fail in the following way: * the cpu predicts that idx < entries is true and starts executing the body of the for loop * the cpu fetches map->extent[0].first and map->extent[0].count * the cpu fetches map->nr_extents * the cpu verifies that idx < extents is true, so it commits the instructions in the body of the for loop The problem is that in this scenario, the cpu read map->extent[0].first and map->nr_extents in the wrong order. We need a full read memory barrier to prevent it. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d8f6023fd8..bf71b4b2d63 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; @@ -615,9 +615,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write - * order and smp_read_barrier_depends() is guaranteed that we - * don't have crazy architectures returning stale data. - * + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. */ mutex_lock(&id_map_mutex); -- cgit v1.2.3-70-g09d2 From 521c42990e9d561ed5ed9f501f07639d0512b3c9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Apr 2014 10:54:37 +0530 Subject: tick-common: Fix wrong check in tick_check_replacement() tick_check_replacement() returns if a replacement of clock_event_device is possible or not. It does this as the first check: if (tick_check_percpu(curdev, newdev, smp_processor_id())) return false; Thats wrong. tick_check_percpu() returns true when the device is useable. Check for false instead. [ tglx: Massaged changelog ] Signed-off-by: Viresh Kumar Cc: # v3.11+ Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: Arvind.Chauhan@arm.com Cc: linaro-networking@linaro.org Link: http://lkml.kernel.org/r/486a02efe0246635aaba786e24b42d316438bf3b.1397537987.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 015661279b6..0a0608edeb2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev) { - if (tick_check_percpu(curdev, newdev, smp_processor_id())) + if (!tick_check_percpu(curdev, newdev, smp_processor_id())) return false; return tick_check_preferred(curdev, newdev); -- cgit v1.2.3-70-g09d2 From 03e6bdc5c4d0fc166bfd5d3cf749a5a0c1b5b1bd Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Apr 2014 10:54:40 +0530 Subject: tick-sched: Don't call update_wall_time() when delta is lesser than tick_period In tick_do_update_jiffies64() we are processing ticks only if delta is greater than tick_period. This is what we are supposed to do here and it broke a bit with this patch: commit 47a1b796 (tick/timekeeping: Call update_wall_time outside the jiffies lock) With above patch, we might end up calling update_wall_time() even if delta is found to be smaller that tick_period. Fix this by returning when the delta is less than tick period. [ tglx: Made it a 3 liner and massaged changelog ] Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: Arvind.Chauhan@arm.com Cc: linaro-networking@linaro.org Cc: John Stultz Cc: # v3.14+ Link: http://lkml.kernel.org/r/80afb18a494b0bd9710975bcc4de134ae323c74f.1397537987.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9f8af69c67e..e947d968c5b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); + } else { + write_sequnlock(&jiffies_lock); + return; } write_sequnlock(&jiffies_lock); update_wall_time(); -- cgit v1.2.3-70-g09d2 From 27630532ef5ead28b98cfe28d8f95222ef91c2b7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Apr 2014 10:54:41 +0530 Subject: tick-sched: Check tick_nohz_enabled in tick_nohz_switch_to_nohz() Since commit d689fe222 (NOHZ: Check for nohz active instead of nohz enabled) the tick_nohz_switch_to_nohz() function returns because it checks for the tick_nohz_active flag. This can't be set, because the function itself sets it. Undo the change in tick_nohz_switch_to_nohz(). Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: Arvind.Chauhan@arm.com Cc: linaro-networking@linaro.org Cc: # 3.13+ Link: http://lkml.kernel.org/r/40939c05f2d65d781b92b20302b02243d0654224.1397537987.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e947d968c5b..6558b7ac112 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -970,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_active) + if (!tick_nohz_enabled) return; local_irq_disable(); -- cgit v1.2.3-70-g09d2 From 0acf07d240a84069c4a6651e6030cf35d30c7159 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Apr 2014 10:54:34 -0700 Subject: seccomp: fix memory leak on filter attach This sets the correct error code when final filter memory is unavailable, and frees the raw filter no matter what. unreferenced object 0xffff8800d6ea4000 (size 512): comm "sshd", pid 278, jiffies 4294898315 (age 46.653s) hex dump (first 32 bytes): 21 00 00 00 04 00 00 00 15 00 01 00 3e 00 00 c0 !...........>... 06 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 ........!....... backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0x280/0x320 [] prctl_set_seccomp+0x11e/0x3b0 [] SyS_prctl+0x3bb/0x4a0 [] system_call_fastpath+0x1a/0x1f [] 0xffffffffffffffff Reported-by: Masami Ichikawa Signed-off-by: Kees Cook Tested-by: Masami Ichikawa Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/seccomp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 590c3792508..b35c21503a3 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -255,6 +255,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Allocate a new seccomp_filter */ + ret = -ENOMEM; filter = kzalloc(sizeof(struct seccomp_filter) + sizeof(struct sock_filter_int) * new_len, GFP_KERNEL|__GFP_NOWARN); @@ -264,6 +265,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); if (ret) goto free_filter; + kfree(fp); atomic_set(&filter->usage, 1); filter->len = new_len; -- cgit v1.2.3-70-g09d2 From 5d6c97c55984b3b991400692f9e8568a702b93c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 16 Apr 2014 19:21:53 -0400 Subject: tracing: Do not try to recreated toplevel set_ftrace_* files With the restructing of the function tracer working with instances, the "top level" buffer is a bit special, as the function tracing is mapped to the same set of filters. This is done by using a "global_ops" descriptor and having the "set_ftrace_filter" and "set_ftrace_notrace" map to it. When an instance is created, it creates the same files but its for the local instance and not the global_ops. The issues is that the local instance creation shares some code with the global instance one and we end up trying to create th top level "set_ftrace_*" files twice, and on boot up, we get an error like this: Could not create debugfs 'set_ftrace_filter' entry Could not create debugfs 'set_ftrace_notrace' entry The reason they failed to be created was because they were created twice, and the second time gives this error as you can not create the same file twice. Reported-by: Borislav Petkov Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b781d2be38..ffd56351b52 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -58,12 +58,16 @@ int ftrace_create_function_files(struct trace_array *tr, { int ret; - /* The top level array uses the "global_ops". */ - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { - ret = allocate_ftrace_ops(tr); - if (ret) - return ret; - } + /* + * The top level array uses the "global_ops", and the files are + * created on boot up. + */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; ftrace_create_filter_files(tr->ops, parent); -- cgit v1.2.3-70-g09d2 From a1d9a3231eac4117cadaf4b6bba5b2902c15a33e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 10 Apr 2014 17:38:36 +0400 Subject: sched: Check for stop task appearance when balancing happens We need to do it like we do for the other higher priority classes.. Signed-off-by: Kirill Tkhai Cc: Michael wang Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/336561397137116@web27h.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 11 ++++++++++- kernel/sched/fair.c | 3 ++- kernel/sched/rt.c | 7 ++++--- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 27ef4092552..b08095786cb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1021,8 +1021,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; - if (need_pull_dl_task(rq, prev)) + if (need_pull_dl_task(rq, prev)) { pull_dl_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a stop task can slip in, in which case we need to + * re-start task selection. + */ + if (rq->stop && rq->stop->on_rq) + return RETRY_TASK; + } + /* * When prev is DL, we may throttle it in put_prev_task(). * So, we update time before we check for dl_nr_running. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f14a656a72..7570dd969c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6728,7 +6728,8 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running && - (this_rq->dl.dl_nr_running || + ((this_rq->stop && this_rq->stop->on_rq) || + this_rq->dl.dl_nr_running || (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) pulled_task = -1; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d8cdf161855..bd2267ad404 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1362,10 +1362,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this - * means a dl task can slip in, in which case we need to - * re-start task selection. + * means a dl or stop task can slip in, in which case we need + * to re-start task selection. */ - if (unlikely(rq->dl.dl_nr_running)) + if (unlikely((rq->stop && rq->stop->on_rq) || + rq->dl.dl_nr_running)) return RETRY_TASK; } -- cgit v1.2.3-70-g09d2 From 6ea6215fe394e320468589d9bba464a48f6d823a Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Thu, 17 Apr 2014 16:05:19 +0800 Subject: tracing/uprobes: Fix uprobe_cpu_buffer memory leak Forgot to free uprobe_cpu_buffer percpu page in uprobe_buffer_disable(). Link: http://lkml.kernel.org/p/534F8B3F.1090407@huawei.com Cc: stable@vger.kernel.org # v3.14+ Acked-by: Namhyung Kim Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 930e51462dc..c082a744134 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -732,9 +732,15 @@ static int uprobe_buffer_enable(void) static void uprobe_buffer_disable(void) { + int cpu; + BUG_ON(!mutex_is_locked(&event_mutex)); if (--uprobe_buffer_refcnt == 0) { + for_each_possible_cpu(cpu) + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, + cpu)->buf); + free_percpu(uprobe_cpu_buffer); uprobe_cpu_buffer = NULL; } -- cgit v1.2.3-70-g09d2 From 01f8fa4f01d8362358eb90e412bd7ae18a3ec1ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Apr 2014 14:36:44 +0000 Subject: genirq: Allow forcing cpu affinity of interrupts The current implementation of irq_set_affinity() refuses rightfully to route an interrupt to an offline cpu. But there is a special case, where this is actually desired. Some of the ARM SoCs have per cpu timers which require setting the affinity during cpu startup where the cpu is not yet in the online mask. If we can't do that, then the local timer interrupt for the about to become online cpu is routed to some random online cpu. The developers of the affected machines tried to work around that issue, but that results in a massive mess in that timer code. We have a yet unused argument in the set_affinity callbacks of the irq chips, which I added back then for a similar reason. It was never required so it got not used. But I'm happy that I never removed it. That allows us to implement a sane handling of the above scenario. So the affected SoC drivers can add the required force handling to their interrupt chip, switch the timer code to irq_force_affinity() and things just work. This does not affect any existing user of irq_set_affinity(). Tagged for stable to allow a simple fix of the affected SoC clock event drivers. Reported-and-tested-by: Krzysztof Kozlowski Signed-off-by: Thomas Gleixner Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Bartlomiej Zolnierkiewicz Cc: Tomasz Figa , Cc: Daniel Lezcano , Cc: Kukjin Kim Cc: linux-arm-kernel@lists.infradead.org, Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20140416143315.717251504@linutronix.de Signed-off-by: Thomas Gleixner --- arch/mips/cavium-octeon/octeon-irq.c | 2 +- include/linux/interrupt.h | 35 ++++++++++++++++++++++++++++++++++- include/linux/irq.h | 3 ++- kernel/irq/manage.c | 17 ++++++----------- 4 files changed, 43 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index c2bb4f896ce..3aa5b46b2d4 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -635,7 +635,7 @@ static void octeon_irq_cpu_offline_ciu(struct irq_data *data) cpumask_clear(&new_affinity); cpumask_set_cpu(cpumask_first(cpu_online_mask), &new_affinity); } - __irq_set_affinity_locked(data, &new_affinity); + irq_set_affinity_locked(data, &new_affinity, false); } static int octeon_irq_ciu_set_affinity(struct irq_data *data, diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c7bfac1c4a7..8834a7e5b94 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -203,7 +203,40 @@ static inline int check_wakeup_irqs(void) { return 0; } extern cpumask_var_t irq_default_affinity; -extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); +/* Internal implementation. Use the helpers below */ +extern int __irq_set_affinity(unsigned int irq, const struct cpumask *cpumask, + bool force); + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @mask: cpumask + * + * Fails if cpumask does not contain an online CPU + */ +static inline int +irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, false); +} + +/** + * irq_force_affinity - Force the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @mask: cpumask + * + * Same as irq_set_affinity, but without checking the mask against + * online cpus. + * + * Solely for low level cpu hotplug code, where we need to make per + * cpu interrupts affine before the cpu becomes online. + */ +static inline int +irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, true); +} + extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); diff --git a/include/linux/irq.h b/include/linux/irq.h index d278838908c..10a0b1ac4ea 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -394,7 +394,8 @@ extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); extern void irq_cpu_online(void); extern void irq_cpu_offline(void); -extern int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask); +extern int irq_set_affinity_locked(struct irq_data *data, + const struct cpumask *cpumask, bool force); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2486a4c1a71..d34131ca372 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_chip *chip = irq_data_get_irq_chip(data); int ret; - ret = chip->irq_set_affinity(data, mask, false); + ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: cpumask_copy(data->affinity, mask); @@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) +int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, + bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); @@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, false); + ret = irq_do_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return ret; } -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @mask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); + ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.3-70-g09d2 From 7861144b8cb217634d738e94a748deeae139a1e2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 18 Apr 2014 15:07:12 -0700 Subject: kernel/watchdog.c:touch_softlockup_watchdog(): use raw_cpu_write() Fix: BUG: using __this_cpu_write() in preemptible [00000000] code: systemd-udevd/497 caller is __this_cpu_preempt_check+0x13/0x20 CPU: 3 PID: 497 Comm: systemd-udevd Tainted: G W 3.15.0-rc1 #9 Hardware name: Hewlett-Packard HP EliteBook 8470p/179B, BIOS 68ICF Ver. F.02 04/27/2012 Call Trace: check_preemption_disabled+0xe1/0xf0 __this_cpu_preempt_check+0x13/0x20 touch_nmi_watchdog+0x28/0x40 Reported-by: Luis Henriques Tested-by: Luis Henriques Cc: Eric Piel Cc: Robert Moore Cc: Lv Zheng Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e90089fd78e..516203e665f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -138,7 +138,11 @@ static void __touch_watchdog(void) void touch_softlockup_watchdog(void) { - __this_cpu_write(watchdog_touch_ts, 0); + /* + * Preemption can be enabled. It doesn't matter which CPU's timestamp + * gets zeroed here, so use the raw_ operation. + */ + raw_cpu_write(watchdog_touch_ts, 0); } EXPORT_SYMBOL(touch_softlockup_watchdog); -- cgit v1.2.3-70-g09d2 From f3f125324fc1b8500cd20a2907628f7e5d88a708 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 20 Apr 2014 23:43:01 +0200 Subject: PM / suspend: Make cpuidle work in the "freeze" state The "freeze" system sleep state introduced by commit 7e73c5ae6e79 (PM: Introduce suspend state PM_SUSPEND_FREEZE) requires cpuidle to be functional when freeze_enter() is executed to work correctly (that is, to be able to save any more energy than runtime idle), but that is impossible after commit 8651f97bd951d (PM / cpuidle: System resume hang fix with cpuidle) which caused cpuidle to be paused in dpm_suspend_noirq() and resumed in dpm_resume_noirq(). To avoid that problem, add cpuidle_resume() and cpuidle_pause() to the beginning and the end of freeze_enter(), respectively. Reported-by: Zhang Rui Signed-off-by: Rafael J. Wysocki Reviewed-by: Preeti U Murthy --- kernel/power/suspend.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c3ad9cafe93..8233cd4047d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,9 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + cpuidle_pause(); } void freeze_wake(void) -- cgit v1.2.3-70-g09d2 From 79465d2fd48e68940c2bdecddbdecd45bbba06fe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Apr 2014 11:05:43 +0930 Subject: module: remove warning about waiting module removal. We remove the waiting module removal in commit 3f2b9c9cdf38 (September 2013), but it turns out that modprobe in kmod (< version 16) was asking for waiting module removal. No one noticed since modprobe would check for 0 usage immediately before trying to remove the module, and the race is unlikely. However, it means that anyone running old (but not ancient) kmod versions is hitting the printk designed to see if anyone was running "rmmod -w". All reports so far have been false positives, so remove the warning. Fixes: 3f2b9c9cdf389e303b2273679af08aab5f153517 Reported-by: Valerio Vanni Cc: Elliott, Robert (Server Storage) Cc: stable@kernel.org Acked-by: Lucas De Marchi Signed-off-by: Rusty Russell --- kernel/module.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 11869408f79..ae7821898bf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (!(flags & O_NONBLOCK)) - pr_warn("waiting module removal not supported: please upgrade\n"); - if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; -- cgit v1.2.3-70-g09d2 From 62a08ae2a5763aabeee98264605236b001503e0c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Apr 2014 09:50:53 +0200 Subject: genirq: x86: Ensure that dynamic irq allocation does not conflict On x86 the allocation of irq descriptors may allocate interrupts which are in the range of the GSI interrupts. That's wrong as those interrupts are hardwired and we don't have the irq domain translation like PPC. So one of these interrupts can be hooked up later to one of the devices which are hard wired to it and the io_apic init code for that particular interrupt line happily reuses that descriptor with a completely different configuration so hell breaks lose. Inside x86 we allocate dynamic interrupts from above nr_gsi_irqs, except for a few usage sites which have not yet blown up in our face for whatever reason. But for drivers which need an irq range, like the GPIO drivers, we have no limit in place and we don't want to expose such a detail to a driver. To cure this introduce a function which an architecture can implement to impose a lower bound on the dynamic interrupt allocations. Implement it for x86 and set the lower bound to nr_gsi_irqs, which is the end of the hardwired interrupt space, so all dynamic allocations happen above. That not only allows the GPIO driver to work sanely, it also protects the bogus callsites of create_irq_nr() in hpet, uv, irq_remapping and htirq code. They need to be cleaned up as well, but that's a separate issue. Reported-by: Jin Yao Signed-off-by: Thomas Gleixner Tested-by: Mika Westerberg Cc: Mathias Nyman Cc: Linus Torvalds Cc: Grant Likely Cc: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Andy Shevchenko Cc: Krogerus Heikki Cc: Linus Walleij Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1404241617360.28206@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 5 +++++ include/linux/irq.h | 2 ++ kernel/irq/irqdesc.c | 7 +++++++ kernel/softirq.c | 5 +++++ 4 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6ad4658de70..d23aa82e7a7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3425,6 +3425,11 @@ int get_nr_irqs_gsi(void) return nr_irqs_gsi; } +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < nr_irqs_gsi ? nr_irqs_gsi : from; +} + int __init arch_probe_nr_irqs(void) { int nr; diff --git a/include/linux/irq.h b/include/linux/irq.h index 10a0b1ac4ea..5c57efb863d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -603,6 +603,8 @@ static inline u32 irq_get_trigger_type(unsigned int irq) return d ? irqd_get_trigger_type(d) : 0; } +unsigned int arch_dynirq_lower_bound(unsigned int from); + int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a7174617616..bb07f2928f4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -363,6 +363,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, if (from > irq) return -EINVAL; from = irq; + } else { + /* + * For interrupts which are freely allocated the + * architecture can force a lower bound to the @from + * argument. x86 uses this to exclude the GSI space. + */ + from = arch_dynirq_lower_bound(from); } mutex_lock(&sparse_irq_lock); diff --git a/kernel/softirq.c b/kernel/softirq.c index b50990a5bea..33e4648ae0e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -779,3 +779,8 @@ int __init __weak arch_early_irq_init(void) { return 0; } + +unsigned int __weak arch_dynirq_lower_bound(unsigned int from) +{ + return from; +} -- cgit v1.2.3-70-g09d2 From a949ae560a511fe4e3adf48fa44fefded93e5c2b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 24 Apr 2014 10:40:12 -0400 Subject: ftrace/module: Hardcode ftrace_module_init() call into load_module() A race exists between module loading and enabling of function tracer. CPU 1 CPU 2 ----- ----- load_module() module->state = MODULE_STATE_COMING register_ftrace_function() mutex_lock(&ftrace_lock); ftrace_startup() update_ftrace_function(); ftrace_arch_code_modify_prepare() set_all_module_text_rw(); ftrace_arch_code_modify_post_process() set_all_module_text_ro(); [ here all module text is set to RO, including the module that is loading!! ] blocking_notifier_call_chain(MODULE_STATE_COMING); ftrace_init_module() [ tries to modify code, but it's RO, and fails! ftrace_bug() is called] When this race happens, ftrace_bug() will produces a nasty warning and all of the function tracing features will be disabled until reboot. The simple solution is to treate module load the same way the core kernel is treated at boot. To hardcode the ftrace function modification of converting calls to mcount into nops. This is done in init/main.c there's no reason it could not be done in load_module(). This gives a better control of the changes and doesn't tie the state of the module to its notifiers as much. Ftrace is special, it needs to be treated as such. The reason this would work, is that the ftrace_module_init() would be called while the module is in MODULE_STATE_UNFORMED, which is ignored by the set_all_module_text_ro() call. Link: http://lkml.kernel.org/r/1395637826-3312-1-git-send-email-indou.takao@jp.fujitsu.com Reported-by: Takao Indoh Acked-by: Rusty Russell Cc: stable@vger.kernel.org # 2.6.38+ Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/module.c | 3 +++ kernel/trace/ftrace.c | 27 ++++----------------------- 3 files changed, 9 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9212b017bc7..ae9504b4b67 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -535,6 +535,7 @@ static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_a extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); +extern void ftrace_module_init(struct module *mod); extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); @@ -544,6 +545,7 @@ static inline int ftrace_force_update(void) { return 0; } static inline void ftrace_disable_daemon(void) { } static inline void ftrace_enable_daemon(void) { } static inline void ftrace_release_mod(struct module *mod) {} +static inline void ftrace_module_init(struct module *mod) {} static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) { return -EINVAL; diff --git a/kernel/module.c b/kernel/module.c index 11869408f79..5f14fec9f82 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3271,6 +3271,9 @@ static int load_module(struct load_info *info, const char __user *uargs, dynamic_debug_setup(info->debug, info->num_debug); + /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ + ftrace_module_init(mod); + /* Finally it's fully formed, ready to start executing. */ err = complete_formation(mod, info); if (err) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b947921..4a54a25afa2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4330,16 +4330,11 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify_enter(struct notifier_block *self, - unsigned long val, void *data) +void ftrace_module_init(struct module *mod) { - struct module *mod = data; - - if (val == MODULE_STATE_COMING) - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); - return 0; + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); } static int ftrace_module_notify_exit(struct notifier_block *self, @@ -4353,11 +4348,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, return 0; } #else -static int ftrace_module_notify_enter(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} static int ftrace_module_notify_exit(struct notifier_block *self, unsigned long val, void *data) { @@ -4365,11 +4355,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_enter_nb = { - .notifier_call = ftrace_module_notify_enter, - .priority = INT_MAX, /* Run before anything that can use kprobes */ -}; - struct notifier_block ftrace_module_exit_nb = { .notifier_call = ftrace_module_notify_exit, .priority = INT_MIN, /* Run after anything that can remove kprobes */ @@ -4403,10 +4388,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_enter_nb); - if (ret) - pr_warning("Failed to register trace ftrace module enter notifier\n"); - ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) pr_warning("Failed to register trace ftrace module exit notifier\n"); -- cgit v1.2.3-70-g09d2 From 6c6c0d5a1c949d2e084706f9e5fb1fccc175b265 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Tue, 29 Apr 2014 17:55:02 -0500 Subject: hrtimer: Prevent all reprogramming if hang detected If the last hrtimer interrupt detected a hang it sets hang_detected=1 and programs the clock event device with a delay to let the system make progress. If hang_detected == 1, we prevent reprogramming of the clock event device in hrtimer_reprogram() but not in hrtimer_force_reprogram(). This can lead to the following situation: hrtimer_interrupt() hang_detected = 1; program ce device to Xms from now (hang delay) We have two timers pending: T1 expires 50ms from now T2 expires 5s from now Now T1 gets canceled, which causes hrtimer_force_reprogram() to be invoked, which in turn programs the clock event device to T2 (5 seconds from now). Any hrtimer_start after that will not reprogram the hardware due to hang_detected still being set. So we effectivly block all timers until the T2 event fires and cleans up the hang situation. Add a check for hang_detected to hrtimer_force_reprogram() which prevents the reprogramming of the hang delay in the hardware timer. The subsequent hrtimer_interrupt will resolve all outstanding issues. [ tglx: Rewrote subject and changelog and fixed up the comment in hrtimer_force_reprogram() ] Signed-off-by: Stuart Hayes Link: http://lkml.kernel.org/r/53602DC6.2060101@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d55092ceee2..e3724fdac2d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -569,6 +569,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) cpu_base->expires_next.tv64 = expires_next.tv64; + /* + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectivly block all timers until the T2 event + * fires. + */ + if (cpu_base->hang_detected) + return; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } -- cgit v1.2.3-70-g09d2 From 012a45e3f4af68e86d85cce060c6c2fed56498b2 Mon Sep 17 00:00:00 2001 From: Leon Ma Date: Wed, 30 Apr 2014 16:43:10 +0800 Subject: hrtimer: Prevent remote enqueue of leftmost timers If a cpu is idle and starts an hrtimer which is not pinned on that same cpu, the nohz code might target the timer to a different cpu. In the case that we switch the cpu base of the timer we already have a sanity check in place, which determines whether the timer is earlier than the current leftmost timer on the target cpu. In that case we enqueue the timer on the current cpu because we cannot reprogram the clock event device on the target. If the timers base is already the target CPU we do not have this sanity check in place so we enqueue the timer as the leftmost timer in the target cpus rb tree, but we cannot reprogram the clock event device on the target cpu. So the timer expires late and subsequently prevents the reprogramming of the target cpu clock event device until the previously programmed event fires or a timer with an earlier expiry time gets enqueued on the target cpu itself. Add the same target check as we have for the switch base case and start the timer on the current cpu if it would become the leftmost timer on the target. [ tglx: Rewrote subject and changelog ] Signed-off-by: Leon Ma Link: http://lkml.kernel.org/r/1398847391-5994-1-git-send-email-xindong.ma@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e3724fdac2d..6b715c0af1b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -234,6 +234,11 @@ again: goto again; } timer->base = new_base; + } else { + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + goto again; + } } return new_base; } -- cgit v1.2.3-70-g09d2 From 98a01e779f3c66b0b11cd7e64d531c0e41c95762 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Fri, 18 Apr 2014 17:23:11 +0200 Subject: timer: Prevent overflow in apply_slack On architectures with sizeof(int) < sizeof (long), the computation of mask inside apply_slack() can be undefined if the computed bit is > 32. E.g. with: expires = 0xffffe6f5 and slack = 25, we get: expires_limit = 0x20000000e bit = 33 mask = (1 << 33) - 1 /* undefined */ On x86, mask becomes 1 and and the slack is not applied properly. On s390, mask is -1, expires is set to 0 and the timer fires immediately. Use 1UL << bit to solve that issue. Suggested-by: Deborah Townsend Signed-off-by: Jiri Bohac Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20140418152310.GA13654@midget.suse.cz Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 87bd529879c..3bb01a323b2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) bit = find_last_bit(&mask, BITS_PER_LONG); - mask = (1 << bit) - 1; + mask = (1UL << bit) - 1; expires_limit = expires_limit & ~(mask); -- cgit v1.2.3-70-g09d2 From 561a4fe851ccab9dd0d14989ab566f9392d9f8b5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 2 May 2014 13:30:04 -0400 Subject: tracing: Use rcu_dereference_sched() for trace event triggers As trace event triggers are now part of the mainline kernel, I added my trace event trigger tests to my test suite I run on all my kernels. Now these tests get run under different config options, and one of those options is CONFIG_PROVE_RCU, which checks under lockdep that the rcu locking primitives are being used correctly. This triggered the following splat: =============================== [ INFO: suspicious RCU usage. ] 3.15.0-rc2-test+ #11 Not tainted ------------------------------- kernel/trace/trace_events_trigger.c:80 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by swapper/1/0: #0: ((&(&j_cdbs->work)->timer)){..-...}, at: [] call_timer_fn+0x5/0x1be #1: (&(&pool->lock)->rlock){-.-...}, at: [] __queue_work+0x140/0x283 #2: (&p->pi_lock){-.-.-.}, at: [] try_to_wake_up+0x2e/0x1e8 #3: (&rq->lock){-.-.-.}, at: [] try_to_wake_up+0x1a0/0x1e8 stack backtrace: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.15.0-rc2-test+ #11 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 0000000000000001 ffff88007e083b98 ffffffff819f53a5 0000000000000006 ffff88007b0942c0 ffff88007e083bc8 ffffffff81081307 ffff88007ad96d20 0000000000000000 ffff88007af2d840 ffff88007b2e701c ffff88007e083c18 Call Trace: [] dump_stack+0x4f/0x7c [] lockdep_rcu_suspicious+0x107/0x110 [] event_triggers_call+0x99/0x108 [] ftrace_event_buffer_commit+0x42/0xa4 [] ftrace_raw_event_sched_wakeup_template+0x71/0x7c [] ttwu_do_wakeup+0x7f/0xff [] ttwu_do_activate.constprop.126+0x5c/0x61 [] try_to_wake_up+0x1ac/0x1e8 [] wake_up_process+0x36/0x3b [] wake_up_worker+0x24/0x26 [] insert_work+0x5c/0x65 [] __queue_work+0x26c/0x283 [] ? __queue_work+0x283/0x283 [] delayed_work_timer_fn+0x1e/0x20 [] call_timer_fn+0xdf/0x1be^M [] ? call_timer_fn+0x5/0x1be [] ? __queue_work+0x283/0x283 [] run_timer_softirq+0x1a4/0x22f^M [] __do_softirq+0x17b/0x31b^M [] irq_exit+0x42/0x97 [] smp_apic_timer_interrupt+0x37/0x44 [] apic_timer_interrupt+0x6f/0x80 [] ? default_idle+0x21/0x32 [] ? default_idle+0x1f/0x32 [] arch_cpu_idle+0xf/0x11 [] cpu_startup_entry+0x1a3/0x213 [] start_secondary+0x212/0x219 The cause is that the triggers are protected by rcu_read_lock_sched() but the data is dereferenced with rcu_dereference() which expects it to be protected with rcu_read_lock(). The proper reference should be rcu_dereference_sched(). Cc: Tom Zanussi Cc: stable@vger.kernel.org # 3.14+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_trigger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 925f537f07d..4747b476a03 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec) data->ops->func(data); continue; } - filter = rcu_dereference(data->filter); + filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; if (data->cmd_ops->post_trigger) { -- cgit v1.2.3-70-g09d2