From d8e8ed95cda1dfd6813588333d36552935eba4a1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 15 Dec 2011 13:43:24 +1030 Subject: rcu: Make rcutorture bool parameters really bool (core code) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. This commit makes this change to rcutorture. Signed-off-by: Rusty Russell Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88f17b8a3b1..e29edc374ce 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool verbose; /* Print more debug info. */ +static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ -- cgit v1.2.3-70-g09d2 From 4410030646be072b82ec1892ad5cc7d91af384d8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 27 Dec 2011 15:04:26 +0100 Subject: rcu: Add missing __cpuinit annotation in rcutorture code "rcu: Add rcutorture CPU-hotplug capability" adds cpu hotplug operations to the rcutorture code but produces a false positive warning about section mismatches: WARNING: vmlinux.o(.text+0x1e420c): Section mismatch in reference from the function rcu_torture_onoff() to the function .cpuinit.text:cpu_up() The function rcu_torture_onoff() references the function __cpuinit cpu_up(). This is often because rcu_torture_onoff lacks a __cpuinit annotation or the annotation of cpu_up is wrong. This commit therefore adds a __cpuinit annotation so the warning goes away. Signed-off-by: Heiko Carstens Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e29edc374ce..a58ac285fc6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1399,7 +1399,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int +static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; @@ -1447,7 +1447,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int +static int __cpuinit rcu_torture_onoff_init(void) { if (onoff_interval <= 0) -- cgit v1.2.3-70-g09d2 From 160cb5a97daef0cb894685d84c9d4700bb7cccb4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 19 Jan 2012 23:23:10 +0100 Subject: PM / Hibernate: Correct additional pages number calculation The struct bm_block is allocated by chain_alloc(), so it'd better counting it in LINKED_PAGE_DATA_SIZE. Signed-off-by: Namhyung Kim Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1cf88900ec4..6a768e53700 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); return 2 * res; } -- cgit v1.2.3-70-g09d2 From fd45c15f13e754f3c106427e857310f3e0813951 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2012 10:12:45 +0900 Subject: perf: Don't call release_callchain_buffers() if allocation fails When alloc_callchain_buffers() fails, it frees all of entries before return. In addition, calling the release_callchain_buffers() will cause a NULL pointer dereference since callchain_cpu_entries is not set. Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1327021966-27688-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 057e24b665c..6581a040f39 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -115,8 +115,6 @@ int get_callchain_buffers(void) } err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); exit: mutex_unlock(&callchain_mutex); -- cgit v1.2.3-70-g09d2 From 46cd6a7f680d14f6f80ede9f04aeb70fa83bd266 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2012 10:12:46 +0900 Subject: perf: Call perf_cgroup_event_time() directly The perf_event_time() will call perf_cgroup_event_time() if @event is a cgroup event. Just do it directly and avoid the extra check.. Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1327021966-27688-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a0..32b48c88971 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event) * here. */ if (is_cgroup_event(event)) - run_end = perf_event_time(event); + run_end = perf_cgroup_event_time(event); else if (ctx->is_active) run_end = ctx->time; else -- cgit v1.2.3-70-g09d2 From 0e90b31f4ba77027a7c21cbfc66404df0851ca21 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 20 Jan 2012 04:57:16 +0000 Subject: net: introduce res_counter_charge_nofail() for socket allocations There is a case in __sk_mem_schedule(), where an allocation is beyond the maximum, but yet we are allowed to proceed. It happens under the following condition: sk->sk_wmem_queued + size >= sk->sk_sndbuf The network code won't revert the allocation in this case, meaning that at some point later it'll try to do it. Since this is never communicated to the underlying res_counter code, there is an inbalance in res_counter uncharge operation. I see two ways of fixing this: 1) storing the information about those allocations somewhere in memcg, and then deducting from that first, before we start draining the res_counter, 2) providing a slightly different allocation function for the res_counter, that matches the original behavior of the network code more closely. I decided to go for #2 here, believing it to be more elegant, since #1 would require us to do basically that, but in a more obscure way. Signed-off-by: Glauber Costa Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michal Hocko CC: Tejun Heo CC: Li Zefan CC: Laurent Chavey Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/res_counter.h | 6 ++++++ include/net/sock.h | 10 ++++------ kernel/res_counter.c | 25 +++++++++++++++++++++++++ net/core/sock.c | 4 ++-- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index d06d014afda..da81af086ea 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -109,12 +109,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); * * returns 0 on success and <0 if the counter->usage will exceed the * counter->limit _locked call expects the counter->lock to be taken + * + * charge_nofail works the same, except that it charges the resource + * counter unconditionally, and returns < 0 if the after the current + * charge we are over limit. */ int __must_check res_counter_charge_locked(struct res_counter *counter, unsigned long val); int __must_check res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at); +int __must_check res_counter_charge_nofail(struct res_counter *counter, + unsigned long val, struct res_counter **limit_fail_at); /* * uncharge - tell that some portion of the resource is released diff --git a/include/net/sock.h b/include/net/sock.h index 0e7a9b05f92..4c69ac165e6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1008,9 +1008,8 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot, struct res_counter *fail; int ret; - ret = res_counter_charge(prot->memory_allocated, - amt << PAGE_SHIFT, &fail); - + ret = res_counter_charge_nofail(prot->memory_allocated, + amt << PAGE_SHIFT, &fail); if (ret < 0) *parent_status = OVER_LIMIT; } @@ -1054,12 +1053,11 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) } static inline void -sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status) +sk_memory_allocated_sub(struct sock *sk, int amt) { struct proto *prot = sk->sk_prot; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - parent_status != OVER_LIMIT) /* Otherwise was uncharged already */ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) memcg_memory_allocated_sub(sk->sk_cgrp, amt); atomic_long_sub(amt, prot->memory_allocated); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 6d269cce7aa..d508363858b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -66,6 +66,31 @@ done: return ret; } +int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + int ret, r; + unsigned long flags; + struct res_counter *c; + + r = ret = 0; + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + r = res_counter_charge_locked(c, val); + if (r) + c->usage += val; + spin_unlock(&c->lock); + if (r < 0 && ret == 0) { + *limit_fail_at = c; + ret = r; + } + } + local_irq_restore(flags); + + return ret; +} void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f9..3e81fd2e3c7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1827,7 +1827,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt, parent_status); + sk_memory_allocated_sub(sk, amt); return 0; } @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); void __sk_mem_reclaim(struct sock *sk) { sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (sk_under_memory_pressure(sk) && -- cgit v1.2.3-70-g09d2 From d496aab567e7e52b3e974c9192a5de6e77dce32c Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 20 Jan 2012 14:34:04 -0800 Subject: kprobes: initialize before using a hlist Commit ef53d9c5e ("kprobes: improve kretprobe scalability with hashed locking") introduced a bug where we can potentially leak kretprobe_instances since we initialize a hlist head after having used it. Initialize the hlist head before using it. Reported by: Jim Keniston Acked-by: Jim Keniston Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Cc: Srinivasa D S Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 95dd7212e61..29f5b65bee2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); @@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); -- cgit v1.2.3-70-g09d2 From 42ae610c1a820ddecb80943d4ccfc936f7772535 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:02:24 -0800 Subject: kernel-doc: fix new warnings in auditsc.c Fix new kernel-doc warnings in auditsc.c: Warning(kernel/auditsc.c:1875): No description found for parameter 'success' Warning(kernel/auditsc.c:1875): No description found for parameter 'return_code' Warning(kernel/auditsc.c:1875): Excess function parameter 'pt_regs' description in '__audit_syscall_exit' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index caaea6e944f..af1de0f34ea 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1863,11 +1863,12 @@ void __audit_syscall_entry(int arch, int major, /** * audit_syscall_exit - deallocate audit context after a system call - * @pt_regs: syscall registers + * @success: success value of the syscall + * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel write an audit + * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -- cgit v1.2.3-70-g09d2 From fa757281a08799fd6c0f7ec6f111d1cd66afc97b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:03:13 -0800 Subject: kernel-doc: fix kernel-doc warnings in sched Fix new kernel-doc notation warnings: Warning(include/linux/sched.h:2094): No description found for parameter 'p' Warning(include/linux/sched.h:2094): Excess function parameter 'tsk' description in 'is_idle_task' Warning(kernel/sched/cpupri.c:139): No description found for parameter 'newpri' Warning(kernel/sched/cpupri.c:139): Excess function parameter 'pri' description in 'cpupri_set' Warning(kernel/sched/cpupri.c:208): Excess function parameter 'bootmem' description in 'cpupri_init' Signed-off-by: Randy Dunlap Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/sched/cpupri.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4032ec1cf83..513f5245987 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2088,7 +2088,7 @@ extern int sched_setscheduler_nocheck(struct task_struct *, int, extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? - * @tsk: the task in question. + * @p: the task in question. */ static inline bool is_idle_task(struct task_struct *p) { diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b0d798eaf13..d72586fdf66 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the cpu priority setting * @cp: The cpupri context * @cpu: The target cpu - * @pri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * @@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context - * @bootmem: true if allocations need to use bootmem * * Returns: -ENOMEM if memory fails. */ -- cgit v1.2.3-70-g09d2 From 4ca9b72b71f10147bd21969c1805f5b2c4ca7b7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Jan 2012 11:50:51 +0100 Subject: sched: Fix rq->nr_uninterruptible update race KOSAKI Motohiro noticed the following race: > CPU0 CPU1 > -------------------------------------------------------- > deactivate_task() > task->state = TASK_UNINTERRUPTIBLE; > activate_task() > rq->nr_uninterruptible--; > > schedule() > deactivate_task() > rq->nr_uninterruptible++; > Kosaki-San's scenario is possible when CPU0 runs __sched_setscheduler() against CPU1's current @task. __sched_setscheduler() does a dequeue/enqueue in order to move the task to its new queue (position) to reflect the newly provided scheduling parameters. However it should be completely invariant to nr_uninterruptible accounting, sched_setscheduler() doesn't affect readyness to run, merely policy on when to run. So convert the inappropriate activate/deactivate_task usage to enqueue/dequeue_task, which avoids the nr_uninterruptible accounting. Also convert the two other sites: __migrate_task() and normalize_task() that still use activate/deactivate_task. These sites aren't really a problem since __migrate_task() will only be called on non-running task (and therefore are immume to the described problem) and normalize_task() isn't ever used on regular systems. Also remove the comments from activate/deactivate_task since they're misleading at best. Reported-by: KOSAKI Motohiro Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327486224.2614.45.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263..e067df1fd01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -723,9 +723,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) p->sched_class->dequeue_task(rq, p, flags); } -/* - * activate_task - move a task to the runqueue. - */ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -734,9 +731,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); } -/* - * deactivate_task - remove a task from the runqueue. - */ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -4134,7 +4128,7 @@ recheck: on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4147,7 +4141,7 @@ recheck: if (running) p->sched_class->set_curr_task(rq); if (on_rq) - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4998,9 +4992,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (p->on_rq) { - deactivate_task(rq_src, p, 0); + dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - activate_task(rq_dest, p, 0); + enqueue_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } done: @@ -7032,10 +7026,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p) on_rq = p->on_rq; if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); resched_task(rq->curr); } -- cgit v1.2.3-70-g09d2 From db7e527da41560f597ccdc4417cefa6b7657c0c0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 08:58:16 +0100 Subject: sched/s390: Fix compile error in sched/core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 029632fbb7b7c9d85063cc9eb470de6c54873df3 ("sched: Make separate sched*.c translation units") removed the include of asm/mutex.h from sched.c. This breaks the combination of: CONFIG_MUTEX_SPIN_ON_OWNER=yes CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX=yes like s390 without mutex debugging: CC kernel/sched/core.o kernel/sched/core.c: In function ‘mutex_spin_on_owner’: kernel/sched/core.c:3287: error: implicit declaration of function ‘arch_mutex_cpu_relax’ Lets re-add the include to kernel/sched/core.c Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1326268696-30904-1-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e067df1fd01..5255c9d2e05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include #include +#include #ifdef CONFIG_PARAVIRT #include #endif -- cgit v1.2.3-70-g09d2 From 71325960d16cd68ea0e22a8da15b2495b0f363f7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jan 2012 18:28:57 -0800 Subject: sched/nohz: Fix nohz cpu idle load balancing state with cpu hotplug With the recent nohz scheduler changes, rq's nohz flag 'NOHZ_TICK_STOPPED' and its associated state doesn't get cleared immediately after the cpu exits idle. This gets cleared as part of the next tick seen on that cpu. For the cpu offline support, we need to clear this state manually. Fix it by registering a cpu notifier, which clears the nohz idle load balance state for this rq explicitly during the CPU_DYING notification. There won't be any nohz updates for that cpu, after the CPU_DYING notification. But lets be extra paranoid and skip updating the nohz state in the select_nohz_load_balancer() if the cpu is not in active state anymore. Reported-by: Srivatsa S. Bhat Reviewed-and-tested-by: Srivatsa S. Bhat Tested-by: Sergey Senozhatsky Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327026538.16150.40.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84adb2d66cb..7c6414fc669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4866,6 +4866,15 @@ static void nohz_balancer_kick(int cpu) return; } +static inline void clear_nohz_tick_stopped(int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; @@ -4904,6 +4913,12 @@ void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); + /* + * If this cpu is going down, then nothing needs to be done. + */ + if (!cpu_active(cpu)) + return; + if (stop_tick) { if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -4914,6 +4929,18 @@ void select_nohz_load_balancer(int stop_tick) } return; } + +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DYING: + clear_nohz_tick_stopped(smp_processor_id()); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} #endif static DEFINE_SPINLOCK(balancing); @@ -5070,11 +5097,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + clear_nohz_tick_stopped(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -5590,6 +5613,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ -- cgit v1.2.3-70-g09d2 From b0f4c4b32c8e3aa0d44fc4dd6c40a9a9a8d66b63 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 26 Jan 2012 08:55:34 -0500 Subject: bugs, x86: Fix printk levels for panic, softlockups and stack dumps rsyslog will display KERN_EMERG messages on a connected terminal. However, these messages are useless/undecipherable for a general user. For example, after a softlockup we get: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Stack: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Call Trace: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Code: ff ff a8 08 75 25 31 d2 48 8d 86 38 e0 ff ff 48 89 d1 0f 01 c8 0f ae f0 48 8b 86 38 e0 ff ff a8 08 75 08 b1 01 4c 89 e0 0f 01 c9 ea 69 dd ff 4c 29 e8 48 89 c7 e8 0f bc da ff 49 89 c4 49 89 This happens because the printk levels for these messages are incorrect. Only an informational message should be displayed on a terminal. I modified the printk levels for various messages in the kernel and tested the output by using the drivers/misc/lkdtm.c kernel modules (ie, softlockups, panics, hard lockups, etc.) and confirmed that the console output was still the same and that the output to the terminals was correct. For example, in the case of a softlockup we now see the much more informative: Message from syslogd@intel-s3e37-04 at Jan 25 10:18:06 ... BUG: soft lockup - CPU4 stuck for 60s! instead of the above confusing messages. AFAICT, the messages no longer have to be KERN_EMERG. In the most important case of a panic we set console_verbose(). As for the other less severe cases the correct data is output to the console and /var/log/messages. Successfully tested by me using the drivers/misc/lkdtm.c module. Signed-off-by: Prarit Bhargava Cc: dzickus@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1327586134-11926-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 3 ++- arch/x86/kernel/dumpstack_64.c | 6 +++--- arch/x86/mm/fault.c | 4 ++-- kernel/watchdog.c | 2 +- lib/bug.c | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775f..4025fe4f928 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) unsigned short ss; unsigned long sp; #endif - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284b..42b2bca0b72 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + printk(KERN_DEFAULT "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_EMERG); + 0, KERN_DEFAULT); - printk(KERN_EMERG "Code: "); + printk(KERN_DEFAULT "Code: "); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d74824a708..f0b4caf85c1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -673,7 +673,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, stackend = end_of_stack(tsk); if (tsk != &init_task && *stackend != STACK_END_MAGIC) - printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -684,7 +684,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, sig = 0; /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); + printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f5..d117262deba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); diff --git a/lib/bug.c b/lib/bug.c index 19552096d16..a28c1415357 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -169,7 +169,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_EMERG "------------[ cut here ]------------\n"); + printk(KERN_DEFAULT "------------[ cut here ]------------\n"); if (file) printk(KERN_CRIT "kernel BUG at %s:%u!\n", -- cgit v1.2.3-70-g09d2 From b5740f4b2cb3503b436925eb2242bc3d75cd3dfe Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 17 Jan 2012 17:40:31 +0900 Subject: sched: Fix ancient race in do_exit() try_to_wake_up() has a problem which may change status from TASK_DEAD to TASK_RUNNING in race condition with SMI or guest environment of virtual machine. As a result, exited task is scheduled() again and panic occurs. Here is the sequence how it occurs: ----------------------------------+----------------------------- | CPU A | CPU B ----------------------------------+----------------------------- TASK A calls exit().... do_exit() exit_mm() down_read(mm->mmap_sem); rwsem_down_failed_common() set TASK_UNINTERRUPTIBLE set waiter.task <= task A list_add to sem->wait_list : raw_spin_unlock_irq() (I/O interruption occured) __rwsem_do_wake(mmap_sem) list_del(&waiter->list); waiter->task = NULL wake_up_process(task A) try_to_wake_up() (task is still TASK_UNINTERRUPTIBLE) p->on_rq is still 1.) ttwu_do_wakeup() (*A) : (I/O interruption handler finished) if (!waiter.task) schedule() is not called due to waiter.task is NULL. tsk->state = TASK_RUNNING : check_preempt_curr(); : task->state = TASK_DEAD (*B) <--- set TASK_RUNNING (*C) schedule() (exit task is running again) BUG_ON() is called! -------------------------------------------------------- The execution time between (*A) and (*B) is usually very short, because the interruption is disabled, and setting TASK_RUNNING at (*C) must be executed before setting TASK_DEAD. HOWEVER, if SMI is interrupted between (*A) and (*B), (*C) is able to execute AFTER setting TASK_DEAD! Then, exited task is scheduled again, and BUG_ON() is called.... If the system works on guest system of virtual machine, the time between (*A) and (*B) may be also long due to scheduling of hypervisor, and same phenomenon can occur. By this patch, do_exit() waits for releasing task->pi_lock which is used in try_to_wake_up(). It guarantees the task becomes TASK_DEAD after waking up. Signed-off-by: Yasunori Goto Acked-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20120117174031.3118.E1E9C6FF@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170..4b4042f9bc6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1038,6 +1038,22 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ -- cgit v1.2.3-70-g09d2 From e050e3f0a71bf7dc2c148b35caff0234decc8198 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 26 Jan 2012 17:03:19 +0100 Subject: perf: Fix broken interrupt rate throttling This patch fixes the sampling interrupt throttling mechanism. It was broken in v3.2. Events were not being unthrottled. The unthrottling mechanism required that events be checked at each timer tick. This patch solves this problem and also separates: - unthrottling - multiplexing - frequency-mode period adjustments Not all of them need to be executed at each timer tick. This third version of the patch is based on my original patch + PeterZ proposal (https://lkml.org/lkml/2012/1/7/87). At each timer tick, for each context: - if the current CPU has throttled events, we unthrottle events - if context has frequency-based events, we adjust sampling periods - if we have reached the jiffies interval, we multiplex (rotate) We decoupled rotation (multiplexing) from frequency-mode sampling period adjustments. They should not necessarily happen at the same rate. Multiplexing is subject to jiffies_interval (currently at 1 but could be higher once the tunable is exposed via sysfs). We have grouped frequency-mode adjustment and unthrottling into the same routine to minimize code duplication. When throttled while in frequency mode, we scan the events only once. We have fixed the threshold enforcement code in __perf_event_overflow(). There was a bug whereby it would allow more than the authorized rate because an increment of hwc->interrupts was not executed at the right place. The patch was tested with low sampling limit (2000) and fixed periods, frequency mode, overcommitted PMU. On a 2.1GHz AMD CPU: $ cat /proc/sys/kernel/perf_event_max_sample_rate 2000 We set a rate of 3000 samples/sec (2.1GHz/3000 = 700000): $ perf record -e cycles,cycles -c 700000 noploop 10 $ perf report -D | tail -21 Aggregated stats: TOTAL events: 80086 MMAP events: 88 COMM events: 2 EXIT events: 4 THROTTLE events: 19996 UNTHROTTLE events: 19996 SAMPLE events: 40000 cycles stats: TOTAL events: 40006 MMAP events: 5 COMM events: 1 EXIT events: 4 THROTTLE events: 9998 UNTHROTTLE events: 9998 SAMPLE events: 20000 cycles stats: TOTAL events: 39996 THROTTLE events: 9998 UNTHROTTLE events: 9998 SAMPLE events: 20000 For 10s, the cap is 2x2000x10 = 40000 samples. We get exactly that: 20000 samples/event. Signed-off-by: Stephane Eranian Cc: # v3.2+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120126160319.GA5655@quad Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 104 ++++++++++++++++++++++++++++----------------- 2 files changed, 67 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb..abb2776be1b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -587,6 +587,7 @@ struct hw_perf_event { u64 sample_period; u64 last_period; local64_t period_left; + u64 interrupts_seq; u64 interrupts; u64 freq_time_stamp; diff --git a/kernel/events/core.c b/kernel/events/core.c index 32b48c88971..ba36013cfb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2300,6 +2300,9 @@ do { \ return div64_u64(dividend, divisor); } +static DEFINE_PER_CPU(int, perf_throttled_count); +static DEFINE_PER_CPU(u64, perf_throttled_seq); + static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -2325,16 +2328,29 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. + */ +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, + int needs_unthr) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, now; + u64 now, period = TICK_NSEC; s64 delta; - if (!ctx->nr_freq) + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || needs_unthr)) return; + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2344,13 +2360,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) hwc = &event->hw; - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { + if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } @@ -2358,14 +2369,26 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (!event->attr.freq || !event->attr.sample_freq) continue; - event->pmu->read(event); + /* + * stop the event and update event->count + */ + event->pmu->stop(event, PERF_EF_UPDATE); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; + /* + * restart the event + * reload only if value has changed + */ if (delta > 0) perf_adjust_period(event, period, delta); + + event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + + raw_spin_unlock(&ctx->lock); } /* @@ -2388,16 +2411,13 @@ static void rotate_ctx(struct perf_event_context *ctx) */ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1, freq = 0; + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; - if (cpuctx->ctx.nr_freq) - freq = 1; } ctx = cpuctx->task_ctx; @@ -2405,37 +2425,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; - if (ctx->nr_freq) - freq = 1; } - if (!rotate && !freq) + if (!rotate) goto done; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (freq) { - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - } - - if (rotate) { - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); - } + perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - done: if (remove) list_del_init(&cpuctx->rotation_list); @@ -2445,10 +2454,22 @@ void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); struct perf_cpu_context *cpuctx, *tmp; + struct perf_event_context *ctx; + int throttled; WARN_ON(!irqs_disabled()); + __this_cpu_inc(perf_throttled_seq); + throttled = __this_cpu_xchg(perf_throttled_count, 0); + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + ctx = &cpuctx->ctx; + perf_adjust_freq_unthr_context(ctx, throttled); + + ctx = cpuctx->task_ctx; + if (ctx) + perf_adjust_freq_unthr_context(ctx, throttled); + if (cpuctx->jiffies_interval == 1 || !(jiffies % cpuctx->jiffies_interval)) perf_rotate_context(cpuctx); @@ -4509,6 +4530,7 @@ static int __perf_event_overflow(struct perf_event *event, { int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; + u64 seq; int ret = 0; /* @@ -4518,14 +4540,20 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(!is_sampling_event(event))) return 0; - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { + seq = __this_cpu_read(perf_throttled_seq); + if (seq != hwc->interrupts_seq) { + hwc->interrupts_seq = seq; + hwc->interrupts = 1; + } else { + hwc->interrupts++; + if (unlikely(throttle + && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } - } else - hwc->interrupts++; + } if (event->attr.freq) { u64 now = perf_clock(); -- cgit v1.2.3-70-g09d2 From cb297a3e433dbdcf7ad81e0564e7b804c941ff0d Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 5 Jan 2012 20:00:19 +0900 Subject: sched/rt: Fix task stack corruption under __ARCH_WANT_INTERRUPTS_ON_CTXSW This issue happens under the following conditions: 1. preemption is off 2. __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined 3. RT scheduling class 4. SMP system Sequence is as follows: 1.suppose current task is A. start schedule() 2.task A is enqueued pushable task at the entry of schedule() __schedule prev = rq->curr; ... put_prev_task put_prev_task_rt enqueue_pushable_task 4.pick the task B as next task. next = pick_next_task(rq); 3.rq->curr set to task B and context_switch is started. rq->curr = next; 4.At the entry of context_swtich, release this cpu's rq->lock. context_switch prepare_task_switch prepare_lock_switch raw_spin_unlock_irq(&rq->lock); 5.Shortly after rq->lock is released, interrupt is occurred and start IRQ context 6.try_to_wake_up() which called by ISR acquires rq->lock try_to_wake_up ttwu_remote rq = __task_rq_lock(p) ttwu_do_wakeup(rq, p, wake_flags); task_woken_rt 7.push_rt_task picks the task A which is enqueued before. task_woken_rt push_rt_tasks(rq) next_task = pick_next_pushable_task(rq) 8.At find_lock_lowest_rq(), If double_lock_balance() returns 0, lowest_rq can be the remote rq. (But,If preemption is on, double_lock_balance always return 1 and it does't happen.) push_rt_task find_lock_lowest_rq if (double_lock_balance(rq, lowest_rq)).. 9.find_lock_lowest_rq return the available rq. task A is migrated to the remote cpu/rq. push_rt_task ... deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); 10. But, task A is on irq context at this cpu. So, task A is scheduled by two cpus at the same time until restore from IRQ. Task A's stack is corrupted. To fix it, don't migrate an RT task if it's still running. Signed-off-by: Chanho Min Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/CAOAMb1BHA=5fm7KTewYyke6u-8DP0iUuJMpgQw54vNeXFsGpoQ@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3640ebbb466..f42ae7fb5ec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,6 +1587,11 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + if (unlikely(task_running(rq, next_task))) + return 0; +#endif + retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); -- cgit v1.2.3-70-g09d2 From 181e9bdef37bfcaa41f3ab6c948a2a0d60a268b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jan 2012 20:35:52 +0100 Subject: PM / Hibernate: Fix s2disk regression related to freezing workqueues Commit 2aede851ddf08666f68ffc17be446420e9d2a056 PM / Hibernate: Freeze kernel threads after preallocating memory introduced a mechanism by which kernel threads were frozen after the preallocation of hibernate image memory to avoid problems with frozen kernel threads not responding to memory freeing requests. However, it overlooked the s2disk code path in which the SNAPSHOT_CREATE_IMAGE ioctl was run directly after SNAPSHOT_FREE, which caused freeze_workqueues_begin() to BUG(), because it saw that worqueues had been already frozen. Although in principle this issue might be addressed by removing the relevant BUG_ON() from freeze_workqueues_begin(), that would reintroduce the very problem that commit 2aede851ddf08666f68ffc17be4 attempted to avoid into that particular code path. For this reason, to fix the issue at hand, introduce thaw_kernel_threads() and make the SNAPSHOT_FREE ioctl execute it. Special thanks to Srivatsa S. Bhat for detailed analysis of the problem. Reported-and-tested-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat Cc: stable@kernel.org --- include/linux/freezer.h | 2 ++ kernel/power/process.c | 19 +++++++++++++++++++ kernel/power/user.c | 9 +++++++++ 3 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0ab54e16a91..d09af4b67cf 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -39,6 +39,7 @@ extern bool __refrigerator(bool check_kthr_stop); extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); +extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { @@ -174,6 +175,7 @@ static inline bool __refrigerator(bool check_kthr_stop) { return false; } static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} +static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 77274c9ba2f..eeca00311f3 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -188,3 +188,22 @@ void thaw_processes(void) printk("done.\n"); } +void thaw_kernel_threads(void) +{ + struct task_struct *g, *p; + + pm_nosig_freezing = false; + printk("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + __thaw_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + schedule(); + printk("done.\n"); +} diff --git a/kernel/power/user.c b/kernel/power/user.c index 6b1ab7a8852..e5a21a85730 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -274,6 +274,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = 0; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). + */ + thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: -- cgit v1.2.3-70-g09d2 From fe9161db2e6053da21e4649d77bbefaf3030b11d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 1 Feb 2012 22:16:36 +0100 Subject: PM / Hibernate: Thaw kernel threads in SNAPSHOT_CREATE_IMAGE ioctl path In the SNAPSHOT_CREATE_IMAGE ioctl, if the call to hibernation_snapshot() fails, the frozen tasks are not thawed. And in the case of success, if we happen to exit due to a successful freezer test, all tasks (including those of userspace) are thawed, whereas actually we should have thawed only the kernel threads at that point. Fix both these issues. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/power/user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index e5a21a85730..3e100075b13 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; -- cgit v1.2.3-70-g09d2 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 -------------------- include/linux/sched.h | 6 ++++++ kernel/fork.c | 20 ++++++++++++++++++++ mm/process_vm_access.c | 23 +++++++++-------------- 4 files changed, 35 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d9512bd03e6..d4548dd49b0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { return mm_access(task, PTRACE_MODE_READ); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2234985a5e6..7d379a6bfd8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2259,6 +2259,12 @@ static inline void mmdrop(struct mm_struct * mm) extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); +/* + * Grab a reference to a task's mm, if it is not already going away + * and ptrace_may_access with the mode parameter passed to it + * succeeds. + */ +extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c..1b2ef3c23ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -647,6 +647,26 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, mode)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce10..c20ff48994c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto free_proc_pages; } - task_lock(task); - if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { - task_unlock(task); - rc = -EPERM; - goto put_task_struct; - } - mm = task->mm; - - if (!mm || (task->flags & PF_KTHREAD)) { - task_unlock(task); - rc = -EINVAL; + mm = mm_access(task, PTRACE_MODE_ATTACH); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more a + * appropriate error code for process_vw_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; goto put_task_struct; } - atomic_inc(&mm->mm_users); - task_unlock(task); - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, -- cgit v1.2.3-70-g09d2 From 55ca6140e9bb307efc97a9301a4f501de02a6fd6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 3 Feb 2012 15:37:16 -0800 Subject: kprobes: fix a memory leak in function pre_handler_kretprobe() In function pre_handler_kretprobe(), the allocated kretprobe_instance object will get leaked if the entry_handler callback returns non-zero. This may cause all the preallocated kretprobe_instance objects exhausted. This issue can be reproduced by changing samples/kprobes/kretprobe_example.c to probe "mutex_unlock". And the fix is straightforward: just put the allocated kretprobe_instance object back onto the free_instances list. [akpm@linux-foundation.org: use raw_spin_lock/unlock] Signed-off-by: Jiang Liu Acked-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29f5b65bee2..9788c0ec6f4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3-70-g09d2 From 379e0be812ab8a2a351e784b0c987788f5123090 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 3 Feb 2012 22:22:41 +0100 Subject: PM / Freezer: Thaw only kernel threads if freezing of kernel threads fails If freezing of kernel threads fails, we are expected to automatically thaw tasks in the error recovery path. However, at times, we encounter situations in which we would like the automatic error recovery path to thaw only the kernel threads, because we want to be able to do some more cleanup before we thaw userspace. Something like: error = freeze_kernel_threads(); if (error) { /* Do some cleanup */ /* Only then thaw userspace tasks*/ thaw_processes(); } An example of such a situation is where we freeze/thaw filesystems during suspend/hibernation. There, if freezing of kernel threads fails, we would like to thaw the frozen filesystems before thawing the userspace tasks. So, modify freeze_kernel_threads() to thaw only kernel threads in case of freezing failure. And change suspend_freeze_processes() accordingly. (At the same time, let us also get rid of the rather cryptic usage of the conditional operator (:?) in that function.) [rjw: In fact, this patch fixes a regression introduced during the 3.3 merge window, because without it thaw_processes() may be called before swsusp_free() in some situations and that may lead to massive memory allocation failures.] Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Acked-by: Nigel Cunningham Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 24 ++++++++++++++++++++++-- kernel/power/process.c | 7 +++++-- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b..21724eee520 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index eeca00311f3..7e426459e60 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } -- cgit v1.2.3-70-g09d2 From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 2 +- block/blk-ioc.c | 92 ++++++----------------------------------------- block/cfq-iosched.c | 2 +- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 -- include/linux/iocontext.h | 5 ++- kernel/fork.c | 2 +- 8 files changed, 18 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f2630944..75642a352a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1659,7 +1659,7 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc, NULL); + put_io_context(ioc); } } } diff --git a/block/blk-core.c b/block/blk-core.c index 63670257511..532b3a21b38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -642,7 +642,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) if (rq->cmd_flags & REQ_ELVPRIV) { elv_put_request(q, rq); if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc, q); + put_io_context(rq->elv.icq->ioc); } mempool_free(rq, q->rq.rq_pool); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 7490b6da245..9884fd7427f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,21 +29,6 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -/* - * Releasing ioc may nest into another put_io_context() leading to nested - * fast path release. As the ioc's can't be the same, this is okay but - * makes lockdep whine. Keep track of nesting and use it as subclass. - */ -#ifdef CONFIG_LOCKDEP -#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) -#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ -#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- -#else -#define ioc_release_depth(q) 0 -#define ioc_release_depth_inc(q) do { } while (0) -#define ioc_release_depth_dec(q) do { } while (0) -#endif - static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); @@ -75,11 +60,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) { - ioc_release_depth_inc(q); + if (et->ops.elevator_exit_icq_fn) et->ops.elevator_exit_icq_fn(icq); - ioc_release_depth_dec(q); - } /* * @icq->q might have gone away by the time RCU callback runs @@ -149,81 +131,29 @@ static void ioc_release_fn(struct work_struct *work) /** * put_io_context - put a reference of io_context * @ioc: io_context to put - * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. If the caller is holding queue_lock of a queue, it can indicate - * that with @locked_q. This is an optimization hint and the caller is - * allowed to pass in %NULL even when it's holding a queue_lock. + * zero. */ -void put_io_context(struct io_context *ioc, struct request_queue *locked_q) +void put_io_context(struct io_context *ioc) { - struct request_queue *last_q = locked_q; unsigned long flags; if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - if (locked_q) - lockdep_assert_held(locked_q->queue_lock); - - if (!atomic_long_dec_and_test(&ioc->refcount)) - return; /* - * Destroy @ioc. This is a bit messy because icq's are chained - * from both ioc and queue, and ioc->lock nests inside queue_lock. - * The inner ioc->lock should be held to walk our icq_list and then - * for each icq the outer matching queue_lock should be grabbed. - * ie. We need to do reverse-order double lock dancing. - * - * Another twist is that we are often called with one of the - * matching queue_locks held as indicated by @locked_q, which - * prevents performing double-lock dance for other queues. - * - * So, we do it in two stages. The fast path uses the queue_lock - * the caller is holding and, if other queues need to be accessed, - * uses trylock to avoid introducing locking dependency. This can - * handle most cases, especially if @ioc was performing IO on only - * single device. - * - * If trylock doesn't cut it, we defer to @ioc->release_work which - * can do all the double-locking dancing. + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. */ - spin_lock_irqsave_nested(&ioc->lock, flags, - ioc_release_depth(locked_q)); - - while (!hlist_empty(&ioc->icq_list)) { - struct io_cq *icq = hlist_entry(ioc->icq_list.first, - struct io_cq, ioc_node); - struct request_queue *this_q = icq->q; - - if (this_q != last_q) { - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - last_q = NULL; - - /* spin_trylock() always successes in UP case */ - if (this_q != locked_q && - !spin_trylock(this_q->queue_lock)) - break; - last_q = this_q; - continue; - } - ioc_exit_icq(icq); + if (atomic_long_dec_and_test(&ioc->refcount)) { + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + schedule_work(&ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); } - - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - - spin_unlock_irqrestore(&ioc->lock, flags); - - /* if no icq is left, we're done; otherwise, kick release_work */ - if (hlist_empty(&ioc->icq_list)) - kmem_cache_free(iocontext_cachep, ioc); - else - schedule_work(&ioc->release_work); } EXPORT_SYMBOL(put_io_context); @@ -238,7 +168,7 @@ void exit_io_context(struct task_struct *task) task_unlock(task); atomic_dec(&ioc->nr_tasks); - put_io_context(ioc, NULL); + put_io_context(ioc); } /** diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index da21c24dbed..5684df6848b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1794,7 +1794,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue); + put_io_context(cfqd->active_cic->icq.ioc); cfqd->active_cic = NULL; } } diff --git a/fs/ioprio.c b/fs/ioprio.c index f84b380d65e..0f1b9515213 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc, NULL); + put_io_context(ioc); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c6a1f00806..606cf339bb5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -399,9 +399,6 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif -#ifdef CONFIG_LOCKDEP - int ioc_release_depth; -#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 7e1371c4bcc..119773eebe3 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -133,7 +133,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc, struct request_queue *locked_q); +void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -141,8 +141,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc, - struct request_queue *locked_q) { } +static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c..c574aefa8d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3-70-g09d2 From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 7 Feb 2012 14:39:57 +0100 Subject: perf: Fix double start/stop in x86_pmu_start() The following patch fixes a bug introduced by the following commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") The patch caused the following warning to pop up depending on the sampling frequency adjustments: ------------[ cut here ]------------ WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4() It was caused by the following call sequence: perf_adjust_freq_unthr_context.part() { stop() if (delta > 0) { perf_adjust_period() { if (period > 8*...) { stop() ... start() } } } start() } Which caused a double start and a double stop, thus triggering the assert in x86_pmu_start(). The patch fixes the problem by avoiding the double calls. We pass a new argument to perf_adjust_period() to indicate whether or not the event is already stopped. We can't just remove the start/stop from that function because it's called from __perf_event_overflow where the event needs to be reloaded via a stop/start back-toback call. The patch reintroduces the assertion in x86_pmu_start() which was removed by commit: 84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()") In this second version, we've added calls to disable/enable PMU during unthrottling or frequency adjustment based on bug report of spurious NMI interrupts from Eric Dumazet. Reported-and-tested-by: Eric Dumazet Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: markus@trippelsdorf.de Cc: paulus@samba.org Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad [ Minor edits to the changelog and to the code ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ kernel/events/core.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2a30e5ae6ac..5adce1040b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + if (WARN_ON_ONCE(idx == -1)) return; diff --git a/kernel/events/core.c b/kernel/events/core.c index ba36013cfb2..1b5c081d8b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2303,7 +2303,7 @@ do { \ static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, return; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, /* * restart the event * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* -- cgit v1.2.3-70-g09d2 From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Feb 2012 09:03:58 +0100 Subject: relay: prevent integer overflow in relay_open() "subbuf_size" and "n_subbufs" come from the user and they need to be capped to prevent an integer overflow. Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/relay.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2..ab56a1764d4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) -- cgit v1.2.3-70-g09d2 From 10f296cbfe3b93188c41463fd7a53808ebdbcbe3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:11 +0800 Subject: module: make module param bint handle nul value Allow bint param accept nul values, just do same as bool param. Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 32ee0430828..4bc965d8a1f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -97,7 +97,8 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; pr_debug("They are equal! Calling %p\n", params[i].ops->set); -- cgit v1.2.3-70-g09d2 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- kernel/pid.c | 4 ++-- mm/page_alloc.c | 1 + net/ipv4/tcp.c | 5 +++-- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc0..fe19ac13f75 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad..d3ebdbe723d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deacc..9f08dfabaf1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f..a13ded1938f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e9..22ef5f9fd2f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); -- cgit v1.2.3-70-g09d2 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ fs/signalfd.c | 11 +++++++++++ include/asm-generic/poll.h | 2 ++ include/linux/signalfd.h | 5 ++++- kernel/fork.c | 5 ++++- 5 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf2..34bbfc6dd8d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + /* the caller holds eppoll_entry->whead->lock */ + if ((unsigned long)key & POLLFREE) + list_del_init(&wait->task_list); + spin_lock_irqsave(&ep->lock, flags); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451d..79c1eea98a3 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,17 @@ #include #include +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/include/asm-generic/poll.h b/include/asm-generic/poll.h index 44bce836d35..9ce7f44aebd 100644 --- a/include/asm-generic/poll.h +++ b/include/asm-generic/poll.h @@ -28,6 +28,8 @@ #define POLLRDHUP 0x2000 #endif +#define POLLFREE 0x4000 /* currently only for epoll */ + struct pollfd { int fd; short events; diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 3ff4961da9b..247399b2979 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -61,13 +61,16 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig) wake_up(&tsk->sighand->signalfd_wqh); } +extern void signalfd_cleanup(struct sighand_struct *sighand); + #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } +static inline void signalfd_cleanup(struct sighand_struct *sighand) { } + #endif /* CONFIG_SIGNALFD */ #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNALFD_H */ - diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559c78..e2cd3e2a5ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } -- cgit v1.2.3-70-g09d2