From d6672c501852d577097f6757c311d937aca0b04b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 Aug 2008 11:23:50 +0200 Subject: lockdep: build fix fix: kernel/built-in.o: In function `lockdep_stats_show': lockdep_proc.c:(.text+0x3cb2f): undefined reference to `lockdep_count_forward_deps' kernel/built-in.o: In function `l_show': lockdep_proc.c:(.text+0x3d02b): undefined reference to `lockdep_count_forward_deps' lockdep_proc.c:(.text+0x3d047): undefined reference to `lockdep_count_backward_deps' Signed-off-by: Ingo Molnar --- kernel/lockdep_internals.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 55db193d366..56b196932c0 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -50,8 +50,21 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +#ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#else +static inline unsigned long +lockdep_count_forward_deps(struct lock_class *class) +{ + return 0; +} +static inline unsigned long +lockdep_count_backward_deps(struct lock_class *class) +{ + return 0; +} +#endif #ifdef CONFIG_DEBUG_LOCKDEP /* -- cgit v1.2.3-70-g09d2 From c72f4573a5e05e35a31474977f500cbe21dcbb11 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 12 Aug 2008 13:27:37 -0700 Subject: lockdep: spin_lock_nest_lock(), checkpatch fixes fix: WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable #46: FILE: kernel/spinlock.c:326: +EXPORT_SYMBOL(_spin_lock_nest_lock); total: 0 errors, 1 warnings, 26 lines checked Signed-off-by: Andrew Morton Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/spinlock.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 44baeea94ab..29ab20749dd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -290,7 +290,6 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } - EXPORT_SYMBOL(_spin_lock_nested); unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) @@ -312,7 +311,6 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas #endif return flags; } - EXPORT_SYMBOL(_spin_lock_irqsave_nested); void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, @@ -322,7 +320,6 @@ void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } - EXPORT_SYMBOL(_spin_lock_nest_lock); #endif -- cgit v1.2.3-70-g09d2 From 2df8b1d656021e180ab93c8a4b2c9c2923d30b82 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Jul 2008 12:43:11 -0700 Subject: lockdep: use WARN() in kernel/lockdep.c Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/lockdep.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1aa91fd6b06..77fa776a2da 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1759,11 +1759,10 @@ static void check_chain_key(struct task_struct *curr) hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); - printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", + WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)hlock->prev_chain_key); - WARN_ON(1); return; } id = hlock->class_idx - 1; @@ -1778,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr) } if (chain_key != curr->curr_chain_key) { debug_locks_off(); - printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", + WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)curr->curr_chain_key); - WARN_ON(1); } #endif } -- cgit v1.2.3-70-g09d2 From 09f2724a786f76475ef2985cf84f5359c553aade Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 14 Aug 2030 15:56:40 +0800 Subject: sched: fix the race between walk_tg_tree and sched_create_group With 2.6.27-rc3, I hit a kernel panic when running volanoMark on my new x86_64 machine. I also hit it with other 2.6.27-rc kernels. See below log. Basically, function walk_tg_tree and sched_create_group have a race between accessing and initiating tg->children. Below patch fixes it by moving tg->children initiation to the front of linking tg->siblings to parent->children. {----------------panic log------------} BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 IP: [] walk_tg_tree+0x45/0x7f PGD 1be1c4067 PUD 1bdd8d067 PMD 0 Oops: 0000 [1] SMP CPU 11 Modules linked in: igb Pid: 22979, comm: java Not tainted 2.6.27-rc3 #1 RIP: 0010:[] [] walk_tg_tree+0x45/0x7f RSP: 0018:ffff8801bfbbbd18 EFLAGS: 00010083 RAX: 0000000000000000 RBX: ffff8800be0dce40 RCX: ffffffffffffffc0 RDX: ffff880102c43740 RSI: 0000000000000000 RDI: ffff8800be0dce40 RBP: ffff8801bfbbbd48 R08: ffff8800ba437bc8 R09: 0000000000001f40 R10: ffff8801be812100 R11: ffffffff805fdf44 R12: ffff880102c43740 R13: 0000000000000000 R14: ffffffff8022cf0f R15: ffffffff8022749f FS: 00000000568ac950(0063) GS:ffff8801bfa26d00(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000001bd848000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process java (pid: 22979, threadinfo ffff8801b145a000, task ffff8801bf18e450) Stack: 0000000000000001 ffff8800ba5c8d60 0000000000000001 0000000000000001 ffff8800bad1ccb8 0000000000000000 ffff8801bfbbbd98 ffffffff8022ed37 0000000000000001 0000000000000286 ffff8801bd5ee180 ffff8800ba437bc8 Call Trace: [] try_to_wake_up+0x71/0x24c [] autoremove_wake_function+0x9/0x2e [] ? __wake_up_common+0x46/0x76 [] __wake_up+0x38/0x4f [] tcp_v4_rcv+0x380/0x62e Signed-off-by: Zhang Yanmin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406c..8bf8a5528bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8462,8 +8462,8 @@ struct task_group *sched_create_group(struct task_group *parent) WARN_ON(!parent); /* root should already exist */ tg->parent = parent; - list_add_rcu(&tg->siblings, &parent->children); INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; -- cgit v1.2.3-70-g09d2 From 5cd9c58fbe9ec92b45b27e131719af4f2bd9eb40 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Aug 2008 11:37:28 +0100 Subject: security: Fix setting of PF_SUPERPRIV by __capable() Fix the setting of PF_SUPERPRIV by __capable() as it could corrupt the flags the target process if that is not the current process and it is trying to change its own flags in a different way at the same time. __capable() is using neither atomic ops nor locking to protect t->flags. This patch removes __capable() and introduces has_capability() that doesn't set PF_SUPERPRIV on the process being queried. This patch further splits security_ptrace() in two: (1) security_ptrace_may_access(). This passes judgement on whether one process may access another only (PTRACE_MODE_ATTACH for ptrace() and PTRACE_MODE_READ for /proc), and takes a pointer to the child process. current is the parent. (2) security_ptrace_traceme(). This passes judgement on PTRACE_TRACEME only, and takes only a pointer to the parent process. current is the child. In Smack and commoncap, this uses has_capability() to determine whether the parent will be permitted to use PTRACE_ATTACH if normal checks fail. This does not set PF_SUPERPRIV. Two of the instances of __capable() actually only act on current, and so have been changed to calls to capable(). Of the places that were using __capable(): (1) The OOM killer calls __capable() thrice when weighing the killability of a process. All of these now use has_capability(). (2) cap_ptrace() and smack_ptrace() were using __capable() to check to see whether the parent was allowed to trace any process. As mentioned above, these have been split. For PTRACE_ATTACH and /proc, capable() is now used, and for PTRACE_TRACEME, has_capability() is used. (3) cap_safe_nice() only ever saw current, so now uses capable(). (4) smack_setprocattr() rejected accesses to tasks other than current just after calling __capable(), so the order of these two tests have been switched and capable() is used instead. (5) In smack_file_send_sigiotask(), we need to allow privileged processes to receive SIGIO on files they're manipulating. (6) In smack_task_wait(), we let a process wait for a privileged process, whether or not the process doing the waiting is privileged. I've tested this with the LTP SELinux and syscalls testscripts. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Casey Schaufler Acked-by: Andrew G. Morgan Acked-by: Al Viro Signed-off-by: James Morris --- include/linux/capability.h | 15 ++++++++++++-- include/linux/security.h | 39 +++++++++++++++++++++++------------- kernel/capability.c | 21 ++++++++++++-------- kernel/ptrace.c | 5 ++--- mm/oom_kill.c | 6 ++++-- security/capability.c | 3 ++- security/commoncap.c | 24 ++++++++++++++++------- security/root_plug.c | 3 ++- security/security.c | 10 +++++++--- security/selinux/hooks.c | 25 ++++++++++++++++------- security/smack/smack_lsm.c | 49 ++++++++++++++++++++++++++++++++-------------- 11 files changed, 137 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 02673846d20..9d1fe30b6f6 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -503,8 +503,19 @@ extern const kernel_cap_t __cap_init_eff_set; kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); -int capable(int cap); -int __capable(struct task_struct *t, int cap); +/** + * has_capability - Determine if a task has a superior capability available + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +#define has_capability(t, cap) (security_capable((t), (cap)) == 0) + +extern int capable(int cap); #endif /* __KERNEL__ */ diff --git a/include/linux/security.h b/include/linux/security.h index fd96e7f8a6f..2ee5ecfb239 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -46,8 +46,8 @@ struct audit_krule; */ extern int cap_capable(struct task_struct *tsk, int cap); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace(struct task_struct *parent, struct task_struct *child, - unsigned int mode); +extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); +extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1157,17 +1157,24 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @alter contains the flag indicating whether changes are to be made. * Return 0 if permission is granted. * - * @ptrace: - * Check permission before allowing the @parent process to trace the + * @ptrace_may_access: + * Check permission before allowing the current process to trace the * @child process. * Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of * binprm_security_ops if the process is being traced and its security * attributes would be changed by the execve. - * @parent contains the task_struct structure for parent process. - * @child contains the task_struct structure for child process. + * @child contains the task_struct structure for the target process. * @mode contains the PTRACE_MODE flags indicating the form of access. * Return 0 if permission is granted. + * @ptrace_traceme: + * Check that the @parent process has sufficient permission to trace the + * current process before allowing the current process to present itself + * to the @parent process for tracing. + * The parent process will still have to undergo the ptrace_may_access + * checks before it is allowed to trace this one. + * @parent contains the task_struct structure for debugger process. + * Return 0 if permission is granted. * @capget: * Get the @effective, @inheritable, and @permitted capability sets for * the @target process. The hook may also perform permission checking to @@ -1287,8 +1294,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace) (struct task_struct *parent, struct task_struct *child, - unsigned int mode); + int (*ptrace_may_access) (struct task_struct *child, unsigned int mode); + int (*ptrace_traceme) (struct task_struct *parent); int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1560,8 +1567,8 @@ extern struct dentry *securityfs_create_dir(const char *name, struct dentry *par extern void securityfs_remove(struct dentry *dentry); /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child, - unsigned int mode); +int security_ptrace_may_access(struct task_struct *child, unsigned int mode); +int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, @@ -1742,11 +1749,15 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace(struct task_struct *parent, - struct task_struct *child, - unsigned int mode) +static inline int security_ptrace_may_access(struct task_struct *child, + unsigned int mode) +{ + return cap_ptrace_may_access(child, mode); +} + +static inline int security_ptrace_traceme(struct task_struct *child) { - return cap_ptrace(parent, child, mode); + return cap_ptrace_traceme(parent); } static inline int security_capget(struct task_struct *target, diff --git a/kernel/capability.c b/kernel/capability.c index 0101e847603..33e51e78c2d 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) return ret; } -int __capable(struct task_struct *t, int cap) +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +int capable(int cap) { - if (security_capable(t, cap) == 0) { - t->flags |= PF_SUPERPRIV; + if (has_capability(current, cap)) { + current->flags |= PF_SUPERPRIV; return 1; } return 0; } - -int capable(int cap) -{ - return __capable(current, cap); -} EXPORT_SYMBOL(capable); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 082b3fcb32a..356699a96d5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task, mode); + return security_ptrace_may_access(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) @@ -499,8 +499,7 @@ repeat: goto repeat; } - ret = security_ptrace(current->parent, current, - PTRACE_MODE_ATTACH); + ret = security_ptrace_traceme(current->parent); /* * Set the ptrace bit in the process ptrace flags. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee626..64e5b4bcd96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include #include #include +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) + if (has_capability(p, CAP_SYS_ADMIN) || + has_capability(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (__capable(p, CAP_SYS_RAWIO)) + if (has_capability(p, CAP_SYS_RAWIO)) points /= 4; /* diff --git a/security/capability.c b/security/capability.c index 63d10da515a..24587481903 100644 --- a/security/capability.c +++ b/security/capability.c @@ -811,7 +811,8 @@ struct security_operations default_security_ops = { void security_fixup_ops(struct security_operations *ops) { - set_to_cap_if_null(ops, ptrace); + set_to_cap_if_null(ops, ptrace_may_access); + set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); set_to_cap_if_null(ops, capset_check); set_to_cap_if_null(ops, capset_set); diff --git a/security/commoncap.c b/security/commoncap.c index 4afbece37a0..e4c4b3fc0c0 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -63,14 +63,24 @@ int cap_settime(struct timespec *ts, struct timezone *tz) return 0; } -int cap_ptrace (struct task_struct *parent, struct task_struct *child, - unsigned int mode) +int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) { /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (!cap_issubset(child->cap_permitted, parent->cap_permitted) && - !__capable(parent, CAP_SYS_PTRACE)) - return -EPERM; - return 0; + if (cap_issubset(child->cap_permitted, current->cap_permitted)) + return 0; + if (capable(CAP_SYS_PTRACE)) + return 0; + return -EPERM; +} + +int cap_ptrace_traceme(struct task_struct *parent) +{ + /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ + if (cap_issubset(current->cap_permitted, parent->cap_permitted)) + return 0; + if (has_capability(parent, CAP_SYS_PTRACE)) + return 0; + return -EPERM; } int cap_capget (struct task_struct *target, kernel_cap_t *effective, @@ -534,7 +544,7 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, static inline int cap_safe_nice(struct task_struct *p) { if (!cap_issubset(p->cap_permitted, current->cap_permitted) && - !__capable(current, CAP_SYS_NICE)) + !capable(CAP_SYS_NICE)) return -EPERM; return 0; } diff --git a/security/root_plug.c b/security/root_plug.c index be0ebec2580..c3f68b5b372 100644 --- a/security/root_plug.c +++ b/security/root_plug.c @@ -72,7 +72,8 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm) static struct security_operations rootplug_security_ops = { /* Use the capability functions for some of the hooks */ - .ptrace = cap_ptrace, + .ptrace_may_access = cap_ptrace_may_access, + .ptrace_traceme = cap_ptrace_traceme, .capget = cap_capget, .capset_check = cap_capset_check, .capset_set = cap_capset_set, diff --git a/security/security.c b/security/security.c index ff706872775..3a4b4f55b33 100644 --- a/security/security.c +++ b/security/security.c @@ -127,10 +127,14 @@ int register_security(struct security_operations *ops) /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child, - unsigned int mode) +int security_ptrace_may_access(struct task_struct *child, unsigned int mode) { - return security_ops->ptrace(parent, child, mode); + return security_ops->ptrace_may_access(child, mode); +} + +int security_ptrace_traceme(struct task_struct *parent) +{ + return security_ops->ptrace_traceme(parent); } int security_capget(struct task_struct *target, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3ae9bec5a50..03fc6a81ae3 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1738,24 +1738,34 @@ static inline u32 file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_ptrace(struct task_struct *parent, - struct task_struct *child, - unsigned int mode) +static int selinux_ptrace_may_access(struct task_struct *child, + unsigned int mode) { int rc; - rc = secondary_ops->ptrace(parent, child, mode); + rc = secondary_ops->ptrace_may_access(child, mode); if (rc) return rc; if (mode == PTRACE_MODE_READ) { - struct task_security_struct *tsec = parent->security; + struct task_security_struct *tsec = current->security; struct task_security_struct *csec = child->security; return avc_has_perm(tsec->sid, csec->sid, SECCLASS_FILE, FILE__READ, NULL); } - return task_has_perm(parent, child, PROCESS__PTRACE); + return task_has_perm(current, child, PROCESS__PTRACE); +} + +static int selinux_ptrace_traceme(struct task_struct *parent) +{ + int rc; + + rc = secondary_ops->ptrace_traceme(parent); + if (rc) + return rc; + + return task_has_perm(parent, current, PROCESS__PTRACE); } static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, @@ -5346,7 +5356,8 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) static struct security_operations selinux_ops = { .name = "selinux", - .ptrace = selinux_ptrace, + .ptrace_may_access = selinux_ptrace_may_access, + .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, .capset_check = selinux_capset_check, .capset_set = selinux_capset_set, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 1b40e558f98..87d75417ea9 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -87,27 +87,46 @@ struct inode_smack *new_inode_smack(char *smack) */ /** - * smack_ptrace - Smack approval on ptrace - * @ptp: parent task pointer + * smack_ptrace_may_access - Smack approval on PTRACE_ATTACH * @ctp: child task pointer * * Returns 0 if access is OK, an error code otherwise * * Do the capability checks, and require read and write. */ -static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp, - unsigned int mode) +static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) { int rc; - rc = cap_ptrace(ptp, ctp, mode); + rc = cap_ptrace_may_access(ctp, mode); if (rc != 0) return rc; - rc = smk_access(ptp->security, ctp->security, MAY_READWRITE); - if (rc != 0 && __capable(ptp, CAP_MAC_OVERRIDE)) + rc = smk_access(current->security, ctp->security, MAY_READWRITE); + if (rc != 0 && capable(CAP_MAC_OVERRIDE)) return 0; + return rc; +} + +/** + * smack_ptrace_traceme - Smack approval on PTRACE_TRACEME + * @ptp: parent task pointer + * + * Returns 0 if access is OK, an error code otherwise + * + * Do the capability checks, and require read and write. + */ +static int smack_ptrace_traceme(struct task_struct *ptp) +{ + int rc; + + rc = cap_ptrace_traceme(ptp); + if (rc != 0) + return rc; + rc = smk_access(ptp->security, current->security, MAY_READWRITE); + if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) + return 0; return rc; } @@ -923,7 +942,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, */ file = container_of(fown, struct file, f_owner); rc = smk_access(file->f_security, tsk->security, MAY_WRITE); - if (rc != 0 && __capable(tsk, CAP_MAC_OVERRIDE)) + if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE)) return 0; return rc; } @@ -1164,12 +1183,12 @@ static int smack_task_wait(struct task_struct *p) * account for the smack labels having gotten to * be different in the first place. * - * This breaks the strict subjet/object access + * This breaks the strict subject/object access * control ideal, taking the object's privilege * state into account in the decision as well as * the smack value. */ - if (capable(CAP_MAC_OVERRIDE) || __capable(p, CAP_MAC_OVERRIDE)) + if (capable(CAP_MAC_OVERRIDE) || has_capability(p, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -2016,9 +2035,6 @@ static int smack_setprocattr(struct task_struct *p, char *name, { char *newsmack; - if (!__capable(p, CAP_MAC_ADMIN)) - return -EPERM; - /* * Changing another process' Smack value is too dangerous * and supports no sane use case. @@ -2026,6 +2042,9 @@ static int smack_setprocattr(struct task_struct *p, char *name, if (p != current) return -EPERM; + if (!capable(CAP_MAC_ADMIN)) + return -EPERM; + if (value == NULL || size == 0 || size >= SMK_LABELLEN) return -EINVAL; @@ -2552,7 +2571,8 @@ static void smack_release_secctx(char *secdata, u32 seclen) struct security_operations smack_ops = { .name = "smack", - .ptrace = smack_ptrace, + .ptrace_may_access = smack_ptrace_may_access, + .ptrace_traceme = smack_ptrace_traceme, .capget = cap_capget, .capset_check = cap_capset_check, .capset_set = cap_capset_set, @@ -2729,4 +2749,3 @@ static __init int smack_init(void) * all processes and objects when they are created. */ security_initcall(smack_init); - -- cgit v1.2.3-70-g09d2 From f1679d08480008e06fd619c71635ed33274e2595 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Aug 2008 15:49:00 +0200 Subject: sched: fix rt-bandwidth hotplug race When we hot-unplug a cpu and rebuild the sched-domain, all cpus will be detatched. Alex observed the case where a runqueue was stealing bandwidth from an already disabled runqueue to satisfy its own needs. Stop this by skipping over already disabled runqueues. Reported-by: Alex Nixon Signed-off-by: Peter Zijlstra Tested-by: Alex Nixon Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6163e4cf885..998ba54b454 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -298,7 +298,7 @@ static void __disable_runtime(struct rq *rq) struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; - if (iter == rt_rq) + if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; spin_lock(&iter->rt_runtime_lock); -- cgit v1.2.3-70-g09d2 From 4cd69b986ebf0f8da93f82ffbb89c032ee09c2e1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:20 -0700 Subject: kexec: fix compilation warning on xchg(&kexec_lock, 0) in kernel_kexec() kernel/kexec.c: In function 'kernel_kexec': kernel/kexec.c:1506: warning: value computed is not used Signed-off-by: Huang Ying Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c8a4370e2a3..cf3797b7678 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1503,7 +1503,8 @@ int kernel_kexec(void) } Unlock: - xchg(&kexec_lock, 0); + if (!xchg(&kexec_lock, 0)) + BUG(); return error; } -- cgit v1.2.3-70-g09d2 From 7ade3fcc1fe2801336112027a884070c9ca451af Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:21 -0700 Subject: kexec jump: clean up #ifdef and comments Move if (kexec_image->preserve_context) { ... } into #ifdef CONFIG_KEXEC_JUMP to make code looks cleaner. Fix no longer correct comments of kernel_kexec(). Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index cf3797b7678..bfbbd120623 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1426,11 +1426,9 @@ static int __init crash_save_vmcoreinfo_init(void) module_init(crash_save_vmcoreinfo_init) -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { @@ -1443,8 +1441,8 @@ int kernel_kexec(void) goto Unlock; } - if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { mutex_lock(&pm_mutex); pm_prepare_console(); error = freeze_processes(); @@ -1471,8 +1469,9 @@ int kernel_kexec(void) if (error) goto Enable_irqs; save_processor_state(); + } else #endif - } else { + { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); system_state = SYSTEM_RESTART; @@ -1484,8 +1483,8 @@ int kernel_kexec(void) machine_kexec(kexec_image); - if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { restore_processor_state(); device_power_up(PMSG_RESTORE); Enable_irqs: @@ -1499,8 +1498,8 @@ int kernel_kexec(void) Restore_console: pm_restore_console(); mutex_unlock(&pm_mutex); -#endif } +#endif Unlock: if (!xchg(&kexec_lock, 0)) -- cgit v1.2.3-70-g09d2 From 163f6876f5c3ff8215e900b93779e960a56b3694 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:22 -0700 Subject: kexec jump: rename KEXEC_CONTROL_CODE_SIZE to KEXEC_CONTROL_PAGE_SIZE Rename KEXEC_CONTROL_CODE_SIZE to KEXEC_CONTROL_PAGE_SIZE, because control page is used for not only code on some platform. For example in kexec jump, it is used for data and stack too. [akpm@linux-foundation.org: unbreak powerpc and arm, finish conversion] Signed-off-by: Huang Ying Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Cc: Russell King Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/kexec.h | 2 +- arch/arm/kernel/machine_kexec.c | 2 +- arch/ia64/include/asm/kexec.h | 2 +- arch/powerpc/include/asm/kexec.h | 2 +- arch/powerpc/kernel/machine_kexec_32.c | 2 +- arch/s390/include/asm/kexec.h | 2 +- arch/sh/include/asm/kexec.h | 2 +- arch/x86/kernel/machine_kexec_32.c | 2 +- include/asm-mips/kexec.h | 2 +- include/asm-x86/kexec.h | 4 ++-- include/linux/kexec.h | 4 ++-- kernel/kexec.c | 6 +++--- 12 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h index c8986bb99ed..df15a0dc228 100644 --- a/arch/arm/include/asm/kexec.h +++ b/arch/arm/include/asm/kexec.h @@ -10,7 +10,7 @@ /* Maximum address we can use for the control code buffer */ #define KEXEC_CONTROL_MEMORY_LIMIT (-1UL) -#define KEXEC_CONTROL_CODE_SIZE 4096 +#define KEXEC_CONTROL_PAGE_SIZE 4096 #define KEXEC_ARCH KEXEC_ARCH_ARM diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index db8f54a3451..fae5beb3c3d 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -71,7 +71,7 @@ void machine_kexec(struct kimage *image) flush_icache_range((unsigned long) reboot_code_buffer, - (unsigned long) reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE); + (unsigned long) reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE); printk(KERN_INFO "Bye!\n"); cpu_proc_fin(); diff --git a/arch/ia64/include/asm/kexec.h b/arch/ia64/include/asm/kexec.h index 541be835fc5..e1d58f819d7 100644 --- a/arch/ia64/include/asm/kexec.h +++ b/arch/ia64/include/asm/kexec.h @@ -9,7 +9,7 @@ /* Maximum address we can use for the control code buffer */ #define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE -#define KEXEC_CONTROL_CODE_SIZE (8192 + 8192 + 4096) +#define KEXEC_CONTROL_PAGE_SIZE (8192 + 8192 + 4096) /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_IA_64 diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index acdcdc66f1b..3736d9b3328 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -22,7 +22,7 @@ #define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE #endif -#define KEXEC_CONTROL_CODE_SIZE 4096 +#define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ #ifdef __powerpc64__ diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c index cbaa3419679..ae63a964b85 100644 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -51,7 +51,7 @@ void default_machine_kexec(struct kimage *image) relocate_new_kernel_size); flush_icache_range(reboot_code_buffer, - reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE); + reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE); printk(KERN_INFO "Bye!\n"); /* now call it */ diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index f219c6411e0..bb729b84a21 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -31,7 +31,7 @@ #define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31) /* Allocate one page for the pdp and the second for the code */ -#define KEXEC_CONTROL_CODE_SIZE 4096 +#define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 diff --git a/arch/sh/include/asm/kexec.h b/arch/sh/include/asm/kexec.h index 00f4260ef09..765a5e1660f 100644 --- a/arch/sh/include/asm/kexec.h +++ b/arch/sh/include/asm/kexec.h @@ -21,7 +21,7 @@ /* Maximum address we can use for the control code buffer */ #define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE -#define KEXEC_CONTROL_CODE_SIZE 4096 +#define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_SH diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 9fe478d9840..466450167de 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -78,7 +78,7 @@ static void load_segments(void) /* * A architecture hook called to validate the * proposed image and prepare the control pages - * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE * have been allocated, but the segments have yet * been copied into the kernel. * diff --git a/include/asm-mips/kexec.h b/include/asm-mips/kexec.h index cdbab43b7d3..4314892aaeb 100644 --- a/include/asm-mips/kexec.h +++ b/include/asm-mips/kexec.h @@ -16,7 +16,7 @@ /* Maximum address we can use for the control code buffer */ #define KEXEC_CONTROL_MEMORY_LIMIT (0x20000000) -#define KEXEC_CONTROL_CODE_SIZE 4096 +#define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_MIPS diff --git a/include/asm-x86/kexec.h b/include/asm-x86/kexec.h index c0e52a14fd4..f6fb3d21883 100644 --- a/include/asm-x86/kexec.h +++ b/include/asm-x86/kexec.h @@ -63,7 +63,7 @@ /* Maximum address we can use for the control code buffer */ # define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE -# define KEXEC_CONTROL_CODE_SIZE 4096 +# define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_386 @@ -79,7 +79,7 @@ # define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) /* Allocate one page for the pdp and the second for the code */ -# define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL) +# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_X86_64 diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 32110cede64..17f76fc0517 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -25,8 +25,8 @@ #error KEXEC_CONTROL_MEMORY_LIMIT not defined #endif -#ifndef KEXEC_CONTROL_CODE_SIZE -#error KEXEC_CONTROL_CODE_SIZE not defined +#ifndef KEXEC_CONTROL_PAGE_SIZE +#error KEXEC_CONTROL_PAGE_SIZE not defined #endif #ifndef KEXEC_ARCH diff --git a/kernel/kexec.c b/kernel/kexec.c index bfbbd120623..2810558802b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -77,7 +77,7 @@ int kexec_should_crash(struct task_struct *p) * * The code for the transition from the current kernel to the * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range @@ -242,7 +242,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; @@ -317,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; -- cgit v1.2.3-70-g09d2 From ca195b7f6da3d5dde0bb85a7c322d7de73352653 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:24 -0700 Subject: kexec jump: remove duplication of kexec_restart_prepare() Call kernel_restart_prepare() in kernel_kexec() instead of duplicating the code. Signed-off-by: Huang Ying Acked-by: Pavel Machek Acked-by: Vivek Goyal Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 1 + kernel/kexec.c | 6 +----- kernel/sys.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index b93b541cf11..988e55fe649 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -59,6 +59,7 @@ extern void machine_crash_shutdown(struct pt_regs *); * Architecture independent implemenations of sys_reboot commands. */ +extern void kernel_restart_prepare(char *cmd); extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); diff --git a/kernel/kexec.c b/kernel/kexec.c index 2810558802b..b81682312dc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1472,11 +1472,7 @@ int kernel_kexec(void) } else #endif { - blocking_notifier_call_chain(&reboot_notifier_list, - SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); - sysdev_shutdown(); + kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/sys.c b/kernel/sys.c index c01858090a9..3dacb00a7f7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -274,7 +274,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -static void kernel_restart_prepare(char *cmd) +void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; -- cgit v1.2.3-70-g09d2 From 73bd9c72a29be1e8de008186eea55d333a938804 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:24 -0700 Subject: kexec jump: in sync with hibernation implementation Add device_pm_lock() and device_pm_unlock() in kernel_kexec() in sync with current hibernation implementation. Signed-off-by: Huang Ying Acked-by: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index b81682312dc..17c80fdc453 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1457,6 +1457,7 @@ int kernel_kexec(void) error = disable_nonboot_cpus(); if (error) goto Resume_devices; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* @@ -1485,6 +1486,7 @@ int kernel_kexec(void) device_power_up(PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); enable_nonboot_cpus(); Resume_devices: device_resume(PMSG_RESTORE); -- cgit v1.2.3-70-g09d2 From 3122c331190e9d1622bf1c8cf6ce3b17cca67c9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:26 -0700 Subject: kexec jump: fix for ftrace Ftrace depends on some processor state that we destroyed during kexec and restored by restore_processor_state(). So save_processor_state() and restore_processor_state() are moved into machine_kexec() and ftrace is restored after restore_processor_state(). Signed-off-by: Huang Ying Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 16 +++++++++++++++- kernel/kexec.c | 2 -- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5c8e7735c89..0732adba05c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,7 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; asmlinkage unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page, unsigned long control_page, @@ -120,7 +122,12 @@ void machine_kexec(struct kimage *image) unsigned int has_pae, unsigned int preserve_context); - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); @@ -178,6 +185,13 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, cpu_has_pae, image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/kernel/kexec.c b/kernel/kexec.c index 17c80fdc453..9fc6f7cbd8a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1469,7 +1469,6 @@ int kernel_kexec(void) error = device_power_down(PMSG_FREEZE); if (error) goto Enable_irqs; - save_processor_state(); } else #endif { @@ -1482,7 +1481,6 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - restore_processor_state(); device_power_up(PMSG_RESTORE); Enable_irqs: local_irq_enable(); -- cgit v1.2.3-70-g09d2 From 8c5a1cf0ad3ac5fcdf51314a63b16a440870f6a2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Aug 2008 00:40:27 -0700 Subject: kexec: use a mutex for locking rather than xchg() Functionally the same, but more conventional. Cc: Huang Ying Tested-by: Vivek Goyal Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 9fc6f7cbd8a..59f3f0df35d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -924,19 +924,14 @@ static int kimage_load_segment(struct kimage *image, */ struct kimage *kexec_image; struct kimage *kexec_crash_image; -/* - * A home grown binary mutex. - * Nothing can wait so this mutex is safe to use - * in interrupt context :) - */ -static int kexec_lock; + +static DEFINE_MUTEX(kexec_mutex); asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags) { struct kimage **dest_image, *image; - int locked; int result; /* We only trust the superuser with rebooting the system. */ @@ -972,8 +967,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, * * KISS: always take the mutex. */ - locked = xchg(&kexec_lock, 1); - if (locked) + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; dest_image = &kexec_image; @@ -1015,8 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: - locked = xchg(&kexec_lock, 0); /* Release the mutex */ - BUG_ON(!locked); + mutex_unlock(&kexec_mutex); kimage_free(image); return result; @@ -1063,10 +1056,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { - int locked; - - - /* Take the kexec_lock here to prevent sys_kexec_load + /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -1074,8 +1064,7 @@ void crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - locked = xchg(&kexec_lock, 1); - if (!locked) { + if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); @@ -1083,8 +1072,7 @@ void crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - locked = xchg(&kexec_lock, 0); - BUG_ON(!locked); + mutex_unlock(&kexec_mutex); } } @@ -1434,7 +1422,7 @@ int kernel_kexec(void) { int error = 0; - if (xchg(&kexec_lock, 1)) + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1498,8 +1486,6 @@ int kernel_kexec(void) #endif Unlock: - if (!xchg(&kexec_lock, 0)) - BUG(); - + mutex_unlock(&kexec_mutex); return error; } -- cgit v1.2.3-70-g09d2 From be4de35263f59ca1f4740edfffbfb02cc3f2189e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 15 Aug 2008 00:40:44 -0700 Subject: completions: uninline try_wait_for_completion and completion_done m68k fails to build with these functions inlined in completion.h. Move them out of line into sched.c and export them to avoid this problem. Signed-off-by: Dave Chinner Cc: Geert Uytterhoeven Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/completion.h | 46 ++-------------------------------------------- kernel/sched.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/completion.h b/include/linux/completion.h index 57faa60de9b..02ef8835999 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -49,6 +49,8 @@ extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); extern unsigned long wait_for_completion_interruptible_timeout( struct completion *x, unsigned long timeout); +extern bool try_wait_for_completion(struct completion *x); +extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); @@ -56,48 +58,4 @@ extern void complete_all(struct completion *); #define INIT_COMPLETION(x) ((x).done = 0) -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Returns: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -static inline bool try_wait_for_completion(struct completion *x) -{ - int ret = 1; - - spin_lock_irq(&x->wait.lock); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irq(&x->wait.lock); - return ret; -} - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Returns: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -static inline bool completion_done(struct completion *x) -{ - int ret = 1; - - spin_lock_irq(&x->wait.lock); - if (!x->done) - ret = 0; - spin_unlock_irq(&x->wait.lock); - return ret; -} - #endif diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406c..95e6ad3c231 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4669,6 +4669,52 @@ int __sched wait_for_completion_killable(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_killable); +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { -- cgit v1.2.3-70-g09d2 From 55cd53404c5cc5fd94708232e3b4aa4a9388917b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Aug 2008 08:54:26 +0200 Subject: sched: scale sysctl_sched_shares_ratelimit with nr_cpus David reported that his Niagra spend a little too much time in tg_shares_up(), which considering he has a large cpu count makes sense. So scale the ratelimit value with the number of cpus like we do for other controls as well. Reported-by: David Miller Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8bf8a5528bc..040807196b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -808,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; /* * ratelimit for updating the group shares. - * default: 0.5ms + * default: 0.25ms */ -const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; +unsigned int sysctl_sched_shares_ratelimit = 250000; /* * period over which we measure -rt task cpu usage in us. @@ -5740,6 +5740,8 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; } #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From df60a8441866153d691ae69b77934904c2de5e0d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 15 Aug 2008 09:33:05 -0700 Subject: lockdep: fix build if CONFIG_PROVE_LOCKING not defined If CONFIG_PROVE_LOCKING not defined, then no dependency information is available. Signed-off-by: Stephen Hemminger Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index fa19aee604c..4b194d34d77 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -82,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - unsigned long nr_forward_deps, nr_backward_deps; struct lock_class *class = v; struct lock_list *entry; char c1, c2, c3, c4; @@ -96,11 +95,10 @@ static int l_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_LOCKDEP seq_printf(m, " OPS:%8ld", class->ops); #endif - nr_forward_deps = lockdep_count_forward_deps(class); - seq_printf(m, " FD:%5ld", nr_forward_deps); - - nr_backward_deps = lockdep_count_backward_deps(class); - seq_printf(m, " BD:%5ld", nr_backward_deps); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); +#endif get_usage_chars(class, &c1, &c2, &c3, &c4); seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); @@ -325,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) nr_hardirq_read_unsafe++; +#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); +#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); -- cgit v1.2.3-70-g09d2 From 6951b12a0fe7fc64789f2c4d62d2a304e93836a8 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 18 Aug 2008 04:26:37 +0400 Subject: lockdep: fix spurious 'inconsistent lock state' warning Since f82b217e3513fe3af342c0f3ee1494e86250c21c lockdep can output spurious warnings related to hwirqs due to hardirq_off shrinkage from int to bit-sized flag. Guard it with double negation to fix the warning. Signed-off-by: Dmitry Baryshkov Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 77fa776a2da..3bfb1877a00 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2582,7 +2582,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->trylock = trylock; hlock->read = read; hlock->check = check; - hlock->hardirqs_off = hardirqs_off; + hlock->hardirqs_off = !!hardirqs_off; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); -- cgit v1.2.3-70-g09d2 From 1b04624f93bb1c4f9495b8476d1dd0200af019e2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 19 Aug 2008 20:37:07 -0700 Subject: tracehook: fix SA_NOCLDWAIT I outwitted myself again in commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664, and broke the SA_NOCLDWAIT behavior so it leaks zombies. This fixes it. Reported-by: Andi Kleen Signed-off-by: Roland McGrath --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c539f60c6f4..e661b01d340 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + int ret = sig; BUG_ON(sig == -1); @@ -1402,7 +1403,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ - tsk->exit_signal = -1; + ret = tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = -1; } @@ -1411,7 +1412,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); - return sig; + return ret; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) -- cgit v1.2.3-70-g09d2 From 2d70b68d42b5196a48ccb639e3797f097ef5bea3 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Wed, 20 Aug 2008 14:09:17 -0700 Subject: fix setpriority(PRIO_PGRP) thread iterator breakage When user calls sys_setpriority(PRIO_PGRP ...) on a NPTL style multi-LWP process, only the task leader of the process is affected, all other sibling LWP threads didn't receive the setting. The problem was that the iterator used in sys_setpriority() only iteartes over one task for each process, ignoring all other sibling thread. Introduce a new macro do_each_pid_thread / while_each_pid_thread to walk each thread of a process. Convert 4 call sites in {set/get}priority and ioprio_{set/get}. Signed-off-by: Ken Chen Cc: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioprio.c | 8 ++++---- include/linux/pid.h | 9 +++++++++ kernel/sys.c | 8 ++++---- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/ioprio.c b/fs/ioprio.c index c4a1c3c65aa..da3cc460d4d 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) break; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; @@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) diff --git a/include/linux/pid.h b/include/linux/pid.h index 22921ac4cfd..d7e98ff8021 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -161,4 +161,13 @@ pid_t pid_vnr(struct pid *pid); } \ } while (0) +#define do_each_pid_thread(pid, type, task) \ + do_each_pid_task(pid, type, task) { \ + struct task_struct *tg___ = task; \ + do { + +#define while_each_pid_thread(pid, type, task) \ + } while_each_thread(tg___, task); \ + task = tg___; \ + } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 3dacb00a7f7..038a7bc0901 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; -- cgit v1.2.3-70-g09d2 From efc2dead2c82cae31943828f6d977c483942b0eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2008 12:44:55 +0200 Subject: sched: enable LB_BIAS by default Yanmin reported a significant regression on his 16-core machine due to: commit 93b75217df39e6d75889cc6f8050343286aff4a5 Author: Peter Zijlstra Date: Fri Jun 27 13:41:33 2008 +0200 Flip back to the old behaviour. Reported-by: "Zhang, Yanmin" Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 862b06bd560..9353ca78154 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,6 +8,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -- cgit v1.2.3-70-g09d2 From 01dcb0443ed89eccf26c2b43f1ea13b368ae740d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Aug 2008 16:35:19 -0700 Subject: rcu: fix synchronize_rcu() so that kernel-doc works Fix RCU's synchronize_rcu() so that it looks like a C function, enabling it to be recognized as a function with kernel-doc annotation. Warning(linux-2.6.26-git11//kernel/rcupdate.c:81): No description found for parameter 'synchronize_rcu' Warning(linux-2.6.26-git11//kernel/rcupdate.c:81): No description found for parameter 'call_rcu' [akpm@linux-foundation.org: fix comment] Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f14f372cf6f..467d5940f62 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ +void synchronize_rcu(void); /* Makes kernel-doc tools happy */ synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); -- cgit v1.2.3-70-g09d2 From 3c4fbe5e01d7e5309be5045e7ae0db20a049e6dc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Aug 2008 16:37:38 -0700 Subject: nohz: fix wrong event handler after online an offlined cpu On the tickless system(CONFIG_NO_HZ=y and CONFIG_HIGH_RES_TIMERS=n), after I made an offlined cpu online, I found this cpu's event handler was tick_handle_periodic, not tick_nohz_handler. After debuging, I found this bug was caused by the wrong tick mode. the tick mode is not changed to NOHZ_MODE_INACTIVE when the cpu is offline. This patch fixes this bug. Signed-off-by: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/tick.h | 5 ++++- kernel/time/tick-sched.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index d3c02695dc5..8cf8cfe2cc9 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -74,10 +74,13 @@ extern struct tick_device *tick_get_device(int cpu); extern int tick_init_highres(void); extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); +# endif + +# if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS extern void tick_cancel_sched_timer(int cpu); # else static inline void tick_cancel_sched_timer(int cpu) { } -# endif /* HIGHRES */ +# endif # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern struct tick_device *tick_get_broadcast_device(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f5da526424a..7a46bde78c6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -643,17 +643,21 @@ void tick_setup_sched_timer(void) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } +#endif /* HIGH_RES_TIMERS */ +#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +# ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); +# endif ts->nohz_mode = NOHZ_MODE_INACTIVE; } -#endif /* HIGH_RES_TIMERS */ +#endif /** * Async notification about clocksource changes -- cgit v1.2.3-70-g09d2 From 7a8fc9b248e77a4eab0613acf30a6811799786b3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 17 Aug 2008 17:36:59 +0300 Subject: removed unused #include 's This patch lets the files using linux/version.h match the files that #include it. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- arch/arm/plat-omap/clock.c | 1 - arch/cris/arch-v32/kernel/fasttimer.c | 2 -- arch/mn10300/kernel/mn10300-serial.c | 1 - arch/powerpc/sysdev/bestcomm/gen_bd.c | 1 - arch/x86/mach-rdc321x/platform.c | 1 - drivers/atm/adummy.c | 1 - drivers/char/xilinx_hwicap/buffer_icap.h | 1 - drivers/char/xilinx_hwicap/fifo_icap.h | 1 - drivers/char/xilinx_hwicap/xilinx_hwicap.h | 1 - drivers/edac/edac_core.h | 1 - drivers/i2c/busses/i2c-at91.c | 1 - drivers/infiniband/hw/ehca/ehca_tools.h | 1 - drivers/infiniband/hw/ipath/ipath_fs.c | 1 - drivers/infiniband/hw/nes/nes.h | 1 - drivers/infiniband/ulp/iser/iser_verbs.c | 1 - drivers/input/keyboard/bf54x-keys.c | 1 - drivers/input/touchscreen/mainstone-wm97xx.c | 1 - drivers/mfd/asic3.c | 1 - drivers/misc/eeprom_93cx6.c | 1 - drivers/mtd/maps/amd76xrom.c | 1 - drivers/mtd/maps/ck804xrom.c | 1 - drivers/mtd/maps/esb2rom.c | 1 - drivers/mtd/nand/au1550nd.c | 1 - drivers/net/myri10ge/myri10ge.c | 1 - drivers/net/netxen/netxen_nic.h | 1 - drivers/net/netxen/netxen_nic_ethtool.c | 1 - drivers/net/netxen/netxen_nic_hdr.h | 2 -- drivers/net/tokenring/lanstreamer.c | 1 - drivers/net/tokenring/lanstreamer.h | 2 -- drivers/net/wireless/b43legacy/main.c | 1 - drivers/net/wireless/iwlwifi/iwl-3945-led.c | 1 - drivers/net/wireless/iwlwifi/iwl-led.c | 1 - drivers/net/wireless/iwlwifi/iwl-rfkill.c | 1 - drivers/rtc/rtc-max6902.c | 2 -- drivers/rtc/rtc-r9701.c | 1 - drivers/s390/net/ctcm_mpc.c | 1 - drivers/scsi/dpt/dpti_i2o.h | 1 - drivers/scsi/ips.c | 1 - drivers/scsi/ips.h | 1 - drivers/scsi/lpfc/lpfc_debugfs.c | 1 - drivers/scsi/nsp32.c | 1 - drivers/scsi/nsp32.h | 1 - drivers/scsi/pcmcia/nsp_cs.c | 1 - drivers/scsi/qla2xxx/qla_mid.c | 1 - drivers/usb/atm/ueagle-atm.c | 1 - drivers/usb/gadget/amd5536udc.c | 1 - drivers/usb/gadget/s3c2410_udc.c | 1 - drivers/usb/misc/iowarrior.c | 1 - drivers/usb/serial/garmin_gps.c | 2 -- drivers/video/arkfb.c | 1 - drivers/video/s3fb.c | 1 - drivers/video/vermilion/vermilion.h | 1 - drivers/video/vt8623fb.c | 1 - drivers/video/xilinxfb.c | 1 - fs/jffs2/jffs2_fs_i.h | 1 - fs/xfs/xfs_dmapi.h | 1 - include/asm-x86/xen/hypervisor.h | 1 - include/linux/fs_uart_pd.h | 1 - kernel/nsproxy.c | 1 - kernel/power/swap.c | 1 - kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - kernel/utsname_sysctl.c | 1 - sound/mips/au1x00.c | 1 - sound/soc/at91/eti_b1_wm8731.c | 1 - sound/soc/codecs/wm8753.c | 1 - sound/soc/codecs/wm9712.c | 1 - 67 files changed, 72 deletions(-) (limited to 'kernel') diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c index 23a07059999..197974defbe 100644 --- a/arch/arm/plat-omap/clock.c +++ b/arch/arm/plat-omap/clock.c @@ -10,7 +10,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/fasttimer.c b/arch/cris/arch-v32/kernel/fasttimer.c index 2de9d5849ef..111caa1a2ef 100644 --- a/arch/cris/arch-v32/kernel/fasttimer.c +++ b/arch/cris/arch-v32/kernel/fasttimer.c @@ -19,8 +19,6 @@ #include #include -#include - #include #include #include diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c index 8b054e7a8ae..aa07d0cd190 100644 --- a/arch/mn10300/kernel/mn10300-serial.c +++ b/arch/mn10300/kernel/mn10300-serial.c @@ -17,7 +17,6 @@ static const char serial_revdate[] = "2007-11-06"; #define SUPPORT_SYSRQ #endif -#include #include #include #include diff --git a/arch/powerpc/sysdev/bestcomm/gen_bd.c b/arch/powerpc/sysdev/bestcomm/gen_bd.c index a3a134c35b0..e0a53e3147b 100644 --- a/arch/powerpc/sysdev/bestcomm/gen_bd.c +++ b/arch/powerpc/sysdev/bestcomm/gen_bd.c @@ -11,7 +11,6 @@ * */ -#include #include #include #include diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c index a037041817c..4f4e50c3ad3 100644 --- a/arch/x86/mach-rdc321x/platform.c +++ b/arch/x86/mach-rdc321x/platform.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/drivers/atm/adummy.c b/drivers/atm/adummy.c index 2ebd07f2ef8..5effec6f545 100644 --- a/drivers/atm/adummy.c +++ b/drivers/atm/adummy.c @@ -3,7 +3,6 @@ */ #include -#include #include #include #include diff --git a/drivers/char/xilinx_hwicap/buffer_icap.h b/drivers/char/xilinx_hwicap/buffer_icap.h index c5b1840906b..8b0252bf06e 100644 --- a/drivers/char/xilinx_hwicap/buffer_icap.h +++ b/drivers/char/xilinx_hwicap/buffer_icap.h @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/char/xilinx_hwicap/fifo_icap.h b/drivers/char/xilinx_hwicap/fifo_icap.h index ffabd3ba2bd..62bda453c90 100644 --- a/drivers/char/xilinx_hwicap/fifo_icap.h +++ b/drivers/char/xilinx_hwicap/fifo_icap.h @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.h b/drivers/char/xilinx_hwicap/xilinx_hwicap.h index 1f9c8b082db..24d0d9b938f 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.h +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.h @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index b27b13c5eb5..4b55ec607a8 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -34,7 +34,6 @@ #include #include #include -#include #define EDAC_MC_LABEL_LEN 31 #define EDAC_DEVICE_NAME_LEN 31 diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c index c1adcdbf797..9efb0213725 100644 --- a/drivers/i2c/busses/i2c-at91.c +++ b/drivers/i2c/busses/i2c-at91.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h index ec950bf8c47..21f7d06f14a 100644 --- a/drivers/infiniband/hw/ehca/ehca_tools.h +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 23faba9d21e..8bb5170b4e4 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 39bd897b40c..8eb7ae96974 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -43,7 +43,6 @@ #include #include #include -#include #include #include diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 63462ecca14..26ff6214a81 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "iscsi_iser.h" diff --git a/drivers/input/keyboard/bf54x-keys.c b/drivers/input/keyboard/bf54x-keys.c index 54ed8e2e1c0..6f227d3dbda 100644 --- a/drivers/input/keyboard/bf54x-keys.c +++ b/drivers/input/keyboard/bf54x-keys.c @@ -29,7 +29,6 @@ */ #include -#include #include #include diff --git a/drivers/input/touchscreen/mainstone-wm97xx.c b/drivers/input/touchscreen/mainstone-wm97xx.c index 283f93a0cee..37a555f3730 100644 --- a/drivers/input/touchscreen/mainstone-wm97xx.c +++ b/drivers/input/touchscreen/mainstone-wm97xx.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index c6408a62d95..bc2a807f210 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -16,7 +16,6 @@ * */ -#include #include #include #include diff --git a/drivers/misc/eeprom_93cx6.c b/drivers/misc/eeprom_93cx6.c index ea55654e594..15b1780025c 100644 --- a/drivers/misc/eeprom_93cx6.c +++ b/drivers/misc/eeprom_93cx6.c @@ -26,7 +26,6 @@ #include #include -#include #include #include diff --git a/drivers/mtd/maps/amd76xrom.c b/drivers/mtd/maps/amd76xrom.c index 948b86f35ef..d1eec7d3243 100644 --- a/drivers/mtd/maps/amd76xrom.c +++ b/drivers/mtd/maps/amd76xrom.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mtd/maps/ck804xrom.c b/drivers/mtd/maps/ck804xrom.c index effaf7cdefa..1a6feb4474d 100644 --- a/drivers/mtd/maps/ck804xrom.c +++ b/drivers/mtd/maps/ck804xrom.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mtd/maps/esb2rom.c b/drivers/mtd/maps/esb2rom.c index aa64a475278..bbbcdd4c8d1 100644 --- a/drivers/mtd/maps/esb2rom.c +++ b/drivers/mtd/maps/esb2rom.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c index 761946ea45b..92c334ff450 100644 --- a/drivers/mtd/nand/au1550nd.c +++ b/drivers/mtd/nand/au1550nd.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 5d76cd09e24..54cd89cb083 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h index ab871df6b1d..244ab49c433 100644 --- a/drivers/net/netxen/netxen_nic.h +++ b/drivers/net/netxen/netxen_nic.h @@ -45,7 +45,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/netxen/netxen_nic_ethtool.c b/drivers/net/netxen/netxen_nic_ethtool.c index 4ad3e0844b9..b974ca0fc53 100644 --- a/drivers/net/netxen/netxen_nic_ethtool.c +++ b/drivers/net/netxen/netxen_nic_ethtool.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "netxen_nic.h" #include "netxen_nic_hw.h" diff --git a/drivers/net/netxen/netxen_nic_hdr.h b/drivers/net/netxen/netxen_nic_hdr.h index e8e8d73f6ed..e80f9e3e597 100644 --- a/drivers/net/netxen/netxen_nic_hdr.h +++ b/drivers/net/netxen/netxen_nic_hdr.h @@ -32,8 +32,6 @@ #include #include -#include - #include #include #include diff --git a/drivers/net/tokenring/lanstreamer.c b/drivers/net/tokenring/lanstreamer.c index 47d84cd2809..59d1673f938 100644 --- a/drivers/net/tokenring/lanstreamer.c +++ b/drivers/net/tokenring/lanstreamer.c @@ -119,7 +119,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/tokenring/lanstreamer.h b/drivers/net/tokenring/lanstreamer.h index e7bb3494afc..13ccee6449c 100644 --- a/drivers/net/tokenring/lanstreamer.h +++ b/drivers/net/tokenring/lanstreamer.h @@ -60,8 +60,6 @@ * */ -#include - /* MAX_INTR - the maximum number of times we can loop * inside the interrupt function before returning * control to the OS (maximum value is 256) diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c index 2541c81932f..1cb77db5c29 100644 --- a/drivers/net/wireless/b43legacy/main.c +++ b/drivers/net/wireless/b43legacy/main.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-3945-led.c b/drivers/net/wireless/iwlwifi/iwl-3945-led.c index d3336966b6b..705c65bed9f 100644 --- a/drivers/net/wireless/iwlwifi/iwl-3945-led.c +++ b/drivers/net/wireless/iwlwifi/iwl-3945-led.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-led.c b/drivers/net/wireless/iwlwifi/iwl-led.c index cb11c4a4d69..4eee1b163cd 100644 --- a/drivers/net/wireless/iwlwifi/iwl-led.c +++ b/drivers/net/wireless/iwlwifi/iwl-led.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-rfkill.c b/drivers/net/wireless/iwlwifi/iwl-rfkill.c index e5e5846e9f2..5d642298f04 100644 --- a/drivers/net/wireless/iwlwifi/iwl-rfkill.c +++ b/drivers/net/wireless/iwlwifi/iwl-rfkill.c @@ -27,7 +27,6 @@ *****************************************************************************/ #include #include -#include #include #include diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index 12f0310ae89..78b2551fb19 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -20,8 +20,6 @@ */ #include -#include - #include #include #include diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c index b35f9bfa2af..395985b339c 100644 --- a/drivers/rtc/rtc-r9701.c +++ b/drivers/rtc/rtc-r9701.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index 49ae1cd25ca..2de1e2fccbf 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -19,7 +19,6 @@ #undef DEBUGDATA #undef DEBUGCCW -#include #include #include #include diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index 19406cea6d6..179ad77f6cc 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -21,7 +21,6 @@ #include -#include #include #include diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 7c615c70ec5..bc9e6ddf41d 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -165,7 +165,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index e0657b6f009..4e49fbcfe8a 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -50,7 +50,6 @@ #ifndef _IPS_H_ #define _IPS_H_ -#include #include #include #include diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 90272e65957..094b47e94b2 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index edf9fdb3cb3..22052bb7bec 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c @@ -23,7 +23,6 @@ * 1.2: PowerPC (big endian) support. */ -#include #include #include #include diff --git a/drivers/scsi/nsp32.h b/drivers/scsi/nsp32.h index 6715ecb3bfc..9565acf1aa7 100644 --- a/drivers/scsi/nsp32.h +++ b/drivers/scsi/nsp32.h @@ -16,7 +16,6 @@ #ifndef _NSP32_H #define _NSP32_H -#include //#define NSP32_DEBUG 9 /* diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c index a221b6ef9fa..24e6cb8396e 100644 --- a/drivers/scsi/pcmcia/nsp_cs.c +++ b/drivers/scsi/pcmcia/nsp_cs.c @@ -25,7 +25,6 @@ ***********************************************************************/ -#include #include #include #include diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 50baf6a1d67..93560cd7278 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -6,7 +6,6 @@ */ #include "qla_def.h" -#include #include #include #include diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index cb01b5106ef..b6483dd98ac 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index 1500e1b3c30..abf8192f89e 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c index 53880738459..29d13ebe750 100644 --- a/drivers/usb/gadget/s3c2410_udc.c +++ b/drivers/usb/gadget/s3c2410_udc.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index e6ca9979e3a..a4ef77ef917 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -19,7 +19,6 @@ #include #include #include -#include #include /* Version Information */ diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 2e663f1afd5..d9538208807 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -38,8 +38,6 @@ #include #include -#include - /* the mode to be set when the port ist opened */ static int initial_mode = 1; diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c index 4bd569e479a..314d18694b6 100644 --- a/drivers/video/arkfb.c +++ b/drivers/video/arkfb.c @@ -11,7 +11,6 @@ * Code is based on s3fb */ -#include #include #include #include diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c index 8361bd0e3df..4dcec48a1d7 100644 --- a/drivers/video/s3fb.c +++ b/drivers/video/s3fb.c @@ -11,7 +11,6 @@ * which is based on the code of neofb. */ -#include #include #include #include diff --git a/drivers/video/vermilion/vermilion.h b/drivers/video/vermilion/vermilion.h index c4aba59d480..7491abfcf1f 100644 --- a/drivers/video/vermilion/vermilion.h +++ b/drivers/video/vermilion/vermilion.h @@ -30,7 +30,6 @@ #define _VERMILION_H_ #include -#include #include #include #include diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c index 34aae7a2a62..3df17dc8c3d 100644 --- a/drivers/video/vt8623fb.c +++ b/drivers/video/vt8623fb.c @@ -12,7 +12,6 @@ * (http://davesdomain.org.uk/viafb/) */ -#include #include #include #include diff --git a/drivers/video/xilinxfb.c b/drivers/video/xilinxfb.c index 7b3a8423f48..5da3d2423cc 100644 --- a/drivers/video/xilinxfb.c +++ b/drivers/video/xilinxfb.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f45fdd..4c41db91eaa 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -12,7 +12,6 @@ #ifndef _JFFS2_FS_I #define _JFFS2_FS_I -#include #include #include #include diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index cdc2d3464a1..2813cdd7237 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -18,7 +18,6 @@ #ifndef __XFS_DMAPI_H__ #define __XFS_DMAPI_H__ -#include /* Values used to define the on-disk version of dm_attrname_t. All * on-disk attribute names start with the 8-byte string "SGI_DMI_". * diff --git a/include/asm-x86/xen/hypervisor.h b/include/asm-x86/xen/hypervisor.h index 8e15dd28c91..04ee0610014 100644 --- a/include/asm-x86/xen/hypervisor.h +++ b/include/asm-x86/xen/hypervisor.h @@ -35,7 +35,6 @@ #include #include -#include #include #include diff --git a/include/linux/fs_uart_pd.h b/include/linux/fs_uart_pd.h index 809bb9ffc78..36b61ff3927 100644 --- a/include/linux/fs_uart_pd.h +++ b/include/linux/fs_uart_pd.h @@ -12,7 +12,6 @@ #ifndef FS_UART_PD_H #define FS_UART_PD_H -#include #include enum fs_uart_id { diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 21575fc46d0..1d3ef29a258 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0abf9a463f..80ccac849e4 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9ab0596de4..532858fa5b8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index 64d398f1244..815237a55af 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index fe3a56c2256..4ab9659d269 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include static void *get_uts(ctl_table *table, int write) diff --git a/sound/mips/au1x00.c b/sound/mips/au1x00.c index ee0741f9eb5..fbef38a9604 100644 --- a/sound/mips/au1x00.c +++ b/sound/mips/au1x00.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/soc/at91/eti_b1_wm8731.c b/sound/soc/at91/eti_b1_wm8731.c index b081e83766b..b81d6b2cfa1 100644 --- a/sound/soc/at91/eti_b1_wm8731.c +++ b/sound/soc/at91/eti_b1_wm8731.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/sound/soc/codecs/wm8753.c b/sound/soc/codecs/wm8753.c index 8604809f0c3..dc7b18fd278 100644 --- a/sound/soc/codecs/wm8753.c +++ b/sound/soc/codecs/wm8753.c @@ -34,7 +34,6 @@ #include #include -#include #include #include #include diff --git a/sound/soc/codecs/wm9712.c b/sound/soc/codecs/wm9712.c index 1fb7f9a7aec..2f1c91b1d55 100644 --- a/sound/soc/codecs/wm9712.c +++ b/sound/soc/codecs/wm9712.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From 354879bb977e06695993435745f06a0f6d39ce2b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Aug 2008 17:15:34 +0200 Subject: sched_clock: fix cpu_clock() This patch fixes 3 issues: a) it removes the dependency on jiffies, because jiffies are incremented by a single CPU, and the tick is not synchronized between CPUs. Therefore relying on it to calculate a window to clip whacky TSC values doesn't work as it can drift around. So instead use [GTOD, GTOD+TICK_NSEC) as the window. b) __update_sched_clock() did (roughly speaking): delta = sched_clock() - scd->tick_raw; clock += delta; Which gives exponential growth, instead of linear. c) allows the sched_clock_cpu() value to warp the u64 without breaking. the results are more reliable sched_clock() deltas: before after sched_clock cpu_clock: 15750 51312 51488 cpu_clock: 59719 51052 50947 cpu_clock: 15879 51249 51061 cpu_clock: 1 50933 51198 cpu_clock: 1 50931 51039 cpu_clock: 1 51093 50981 cpu_clock: 1 51043 51040 cpu_clock: 1 50959 50938 cpu_clock: 1 50981 51011 cpu_clock: 1 51364 51212 cpu_clock: 1 51219 51273 cpu_clock: 1 51389 51048 cpu_clock: 1 51285 51611 cpu_clock: 1 50964 51137 cpu_clock: 1 50973 50968 cpu_clock: 1 50967 50972 cpu_clock: 1 58910 58485 cpu_clock: 1 51082 51025 cpu_clock: 1 50957 50958 cpu_clock: 1 50958 50957 cpu_clock: 1006128 51128 50971 cpu_clock: 1 51107 51155 cpu_clock: 1 51371 51081 cpu_clock: 1 51104 51365 cpu_clock: 1 51363 51309 cpu_clock: 1 51107 51160 cpu_clock: 1 51139 51100 cpu_clock: 1 51216 51136 cpu_clock: 1 51207 51215 cpu_clock: 1 51087 51263 cpu_clock: 1 51249 51177 cpu_clock: 1 51519 51412 cpu_clock: 1 51416 51255 cpu_clock: 1 51591 51594 cpu_clock: 1 50966 51374 cpu_clock: 1 50966 50966 cpu_clock: 1 51291 50948 cpu_clock: 1 50973 50867 cpu_clock: 1 50970 50970 cpu_clock: 998306 50970 50971 cpu_clock: 1 50971 50970 cpu_clock: 1 50970 50970 cpu_clock: 1 50971 50971 cpu_clock: 1 50970 50970 cpu_clock: 1 51351 50970 cpu_clock: 1 50970 51352 cpu_clock: 1 50971 50970 cpu_clock: 1 50970 50970 cpu_clock: 1 51321 50971 cpu_clock: 1 50974 51324 Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 84 +++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 204991a0bfa..e8ab096ddfe 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -12,19 +12,17 @@ * * Create a semi stable clock from a mixture of other events, including: * - gtod - * - jiffies * - sched_clock() * - explicit idle events * * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. This window - * is set up using jiffies. + * making it monotonic and keeping it within an expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 1 jiffies difference). + * consistent between cpus (never more than 2 jiffies difference). */ #include #include @@ -54,7 +52,6 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long tick_jiffies; u64 tick_raw; u64 tick_gtod; u64 clock; @@ -75,14 +72,12 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); - unsigned long now_jiffies = jiffies; int cpu; for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->tick_jiffies = now_jiffies; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -91,47 +86,52 @@ void sched_clock_init(void) sched_clock_running = 1; } +/* + * min,max except they take wrapping into account + */ + +static inline u64 wrap_min(u64 x, u64 y) +{ + return (s64)(x - y) < 0 ? x : y; +} + +static inline u64 wrap_max(u64 x, u64 y) +{ + return (s64)(x - y) > 0 ? x : y; +} + /* * update the percpu scd from the raw @now value * * - filter out backward motion - * - use jiffies to generate a min,max window to clip the raw values + * - use the GTOD tick value to create a window to filter crazy TSC values */ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) { - unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->tick_jiffies; - u64 clock = scd->clock; - u64 min_clock, max_clock; s64 delta = now - scd->tick_raw; + u64 clock, min_clock, max_clock; WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; - if (unlikely(delta < 0)) { - clock++; - goto out; - } + if (unlikely(delta < 0)) + delta = 0; - max_clock = min_clock + TICK_NSEC; + /* + * scd->clock = clamp(scd->tick_gtod + delta, + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); + */ - if (unlikely(clock + delta > max_clock)) { - if (clock < max_clock) - clock = max_clock; - else - clock++; - } else { - clock += delta; - } + clock = scd->tick_gtod + delta; + min_clock = wrap_max(scd->tick_gtod, scd->clock); + max_clock = scd->tick_gtod + TICK_NSEC; - out: - if (unlikely(clock < min_clock)) - clock = min_clock; + clock = wrap_max(clock, min_clock); + clock = wrap_min(clock, max_clock); - scd->tick_jiffies = now_jiffies; scd->clock = clock; - return clock; + return scd->clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -171,7 +171,7 @@ u64 sched_clock_cpu(int cpu) * larger time as the latest time for both * runqueues. (this creates monotonic movement) */ - if (likely(remote_clock < this_clock)) { + if (likely((s64)(remote_clock - this_clock) < 0)) { clock = this_clock; scd->clock = clock; } else { @@ -207,14 +207,9 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now); - /* - * update tick_gtod after __update_sched_clock() because that will - * already observe 1 new jiffy; adding a new tick_gtod to that would - * increase the clock 2 jiffies. - */ scd->tick_raw = now; scd->tick_gtod = now_gtod; + __update_sched_clock(scd, now); __raw_spin_unlock(&scd->lock); } @@ -232,18 +227,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { - struct sched_clock_data *scd = this_scd(); - - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - __raw_spin_lock(&scd->lock); - scd->clock += delta_ns; - __raw_spin_unlock(&scd->lock); - + sched_clock_tick(); touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.2.3-70-g09d2 From ffb4ba76a25ab6c9deeec33e4f58395586ca747c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Aug 2008 11:10:26 -0700 Subject: [module] Don't let gcc inline load_module() 'load_module()' is a complex function that contains all the ELF section logic, and inlining it is utterly insane. But gcc will do it, simply because there is only one call-site. As a result, all the stack space that is allocated for all the work to load the module will still be active when we actually call the module init sequence, and the deep call chain makes stack overflows happen. And stack overflows are really hard to debug, because they not only corrupt random pages below the stack, but also corrupt the thread_info structure that is allocated under the stack. In this case, Alan Brunelle reported some crazy oopses at bootup, after loading the processor module that ends up doing complex ACPI stuff and has quite a deep callchain. This should fix it, and is the sane thing to do regardless. Cc: Alan D. Brunelle Cc: Arjan van de Ven Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 08864d257eb..9db11911e04 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size) /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, +static noinline struct module *load_module(void __user *umod, unsigned long len, const char __user *uargs) { -- cgit v1.2.3-70-g09d2 From f73be6dedf4fa058ce80846dae604b08fa805ca1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:07:14 -0700 Subject: smp: have smp_call_function_single() detect invalid CPUs Have smp_call_function_single() return invalid CPU indicies and return -ENXIO. This function is already executed inside a get_cpu()..put_cpu() which locks out CPU removal, so rather than having the higher layers doing another layer of locking to guard against unplugged CPUs do the test here. Signed-off-by: H. Peter Anvin --- kernel/smp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 782e2b93e46..f362a855377 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { struct call_single_data d; unsigned long flags; - /* prevent preemption and reschedule on another processor */ + /* prevent preemption and reschedule on another processor, + as well as CPU removal */ int me = get_cpu(); + int err = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { @@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, data->func = func; data->info = info; generic_exec_single(cpu, data); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); - return 0; + return err; } EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3-70-g09d2 From 2189459d25a47401c69a17794c9d390c890351f9 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 25 Aug 2008 17:15:33 -0400 Subject: lockstat: fix numerical output rounding error Fix rounding error in /proc/lock_stat numerical output. On occasion the two digit fractional part contains the three digit value '100'. This is due to a bug in the rounding algorithm which pushes values in the range '95..99' to '100' rather than to '00' + an increment to the integer part. For example, - 123456.100 old display + 123457.00 new display Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 4b194d34d77..20dbcbf9c7d 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) { unsigned long rem; + nr += 5; /* for display rounding */ rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) -- cgit v1.2.3-70-g09d2 From 04148b73b89d49fe0fe201bcee395e51f7d637ce Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 25 Aug 2008 17:16:23 -0400 Subject: lockstat: repair erronous contention statistics Fix bad contention counting in /proc/lock_stat. /proc/lockstat tries to gather per-ip contention statistics per-lock. This was failing due to a garbage per-ip index selector being used. Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3bfb1877a00..b5db51d2803 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3029,7 +3029,7 @@ found_it: stats = get_lock_stats(hlock_class(hlock)); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); -- cgit v1.2.3-70-g09d2 From 74870172824a78640ec4f03058d9bd35dfa08618 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Wed, 27 Aug 2008 14:33:00 +0800 Subject: lockdep: fix invalid list_del_rcu in zap_class The problem is found during iwlagn driver testing on v2.6.27-rc4-176-gb8e6c91 kernel, but it turns out to be a lockdep bug. In our testing, we frequently load and unload the iwlagn driver (>50 times). Then the MAX_STACK_TRACE_ENTRIES is reached (expected behaviour?). The error message with the call trace is as below. BUG: MAX_STACK_TRACE_ENTRIES too low! turning off the locking correctness validator. Pid: 4895, comm: iwlagn Not tainted 2.6.27-rc4 #13 Call Trace: [] save_stack_trace+0x22/0x3e [] save_trace+0x8b/0x91 [] mark_lock+0x1b0/0x8fa [] __lock_acquire+0x5b9/0x716 [] ieee80211_sta_work+0x0/0x6ea [mac80211] [] lock_acquire+0x52/0x6b [] run_workqueue+0x97/0x1ed [] run_workqueue+0xe7/0x1ed [] run_workqueue+0x97/0x1ed [] worker_thread+0xd8/0xe3 [] autoremove_wake_function+0x0/0x2e [] worker_thread+0x0/0xe3 [] kthread+0x47/0x73 [] trace_hardirqs_on_thunk+0x3a/0x3f [] child_rip+0xa/0x11 [] restore_args+0x0/0x30 [] finish_task_switch+0x0/0xcc [] kthread+0x0/0x73 [] child_rip+0x0/0x11 Although the above is harmless, when the ilwagn module is removed later lockdep will trigger a kernel oops as below. BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] zap_class+0x24/0x82 PGD 73128067 PUD 7448c067 PMD 0 Oops: 0002 [1] SMP CPU 0 Modules linked in: rfcomm l2cap bluetooth autofs4 sunrpc nf_conntrack_ipv6 xt_state nf_conntrack xt_tcpudp ip6t_ipv6header ip6t_REJECT ip6table_filter ip6_tables x_tables ipv6 cpufreq_ondemand acpi_cpufreq dm_mirror dm_log dm_multipath dm_mod snd_hda_intel sr_mod snd_seq_dummy snd_seq_oss snd_seq_midi_event battery snd_seq snd_seq_device cdrom button snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc e1000e snd_hwdep sg iTCO_wdt iTCO_vendor_support ac pcspkr i2c_i801 i2c_core snd soundcore video output ata_piix ata_generic libata sd_mod scsi_mod ext3 jbd mbcache uhci_hcd ohci_hcd ehci_hcd [last unloaded: mac80211] Pid: 4941, comm: modprobe Not tainted 2.6.27-rc4 #10 RIP: 0010:[] [] zap_class+0x24/0x82 RSP: 0000:ffff88007bcb3eb0 EFLAGS: 00010046 RAX: 0000000000068ee8 RBX: ffffffff8192a0a0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000001dfb RDI: ffffffff816e70b0 RBP: ffffffffa00cd000 R08: ffffffff816818f8 R09: ffff88007c923558 R10: ffffe20002ad2408 R11: ffffffff811028ec R12: ffffffff8192a0a0 R13: 000000000002bd90 R14: 0000000000000000 R15: 0000000000000296 FS: 00007f9d1cee56f0(0000) GS:ffffffff814a58c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000073047000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 4941, threadinfo ffff88007bcb2000, task ffff8800758d1fc0) Stack: ffffffff81057376 0000000000000000 ffffffffa00f7b00 0000000000000000 0000000000000080 0000000000618278 00007fff24f16720 0000000000000000 ffffffff8105d37a ffffffffa00f7b00 ffffffff8105d591 313132303863616d Call Trace: [] ? lockdep_free_key_range+0x61/0xf5 [] ? free_module+0xd4/0xe4 [] ? sys_delete_module+0x1de/0x1f9 [] ? audit_syscall_entry+0x12d/0x160 [] ? system_call_fastpath+0x16/0x1b Code: b2 00 01 00 00 00 c3 31 f6 49 c7 c0 10 8a 61 81 eb 32 49 39 38 75 26 48 98 48 6b c0 38 48 8b 90 08 8a 61 81 48 8b 88 00 8a 61 81 <48> 89 51 08 48 89 0a 48 c7 80 08 8a 61 81 00 02 20 00 48 ff c6 RIP [] zap_class+0x24/0x82 RSP CR2: 0000000000000008 ---[ end trace a1297e0c4abb0f2e ]--- The root cause for this oops is in add_lock_to_list() when save_trace() fails due to MAX_STACK_TRACE_ENTRIES is reached, entry->class is assigned but entry is never added into any lock list. This makes the list_del_rcu() in zap_class() oops later when the module is unloaded. This patch fixes the problem by assigning entry->class after save_trace() returns success. Signed-off-by: Zhu Yi Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b5db51d2803..dbda475b13b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - entry->class = this; - entry->distance = distance; if (!save_trace(&entry->trace)) return 0; + entry->class = this; + entry->distance = distance; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation -- cgit v1.2.3-70-g09d2 From 2633f0e57b1127f4060d70bf460140dc9bb19386 Mon Sep 17 00:00:00 2001 From: Steve VanDeBogart Date: Tue, 26 Aug 2008 15:14:36 -0700 Subject: exit signals: use of uninitialized field notify_count task->signal->notify_count is only initialized if task->signal->group_exit_task is not NULL. Reorder a conditional so that uninitialised memory is not used. Found by Valgrind. Signed-off-by: Steve VanDeBogart Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 38ec4063014..75c64738763 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -918,8 +918,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); -- cgit v1.2.3-70-g09d2 From f42ac38c59e0a03d6da0c24a63fb211393f484b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Aug 2008 09:14:40 -0400 Subject: ftrace: disable tracing for suspend to ram I've been painstakingly debugging the issue with suspend to ram and ftraced. The 2.6.28 code does not have this issue, but since the mcount recording is not going to be in 27, this must be solved for the ftrace daemon version. The resume from suspend to ram would reboot because it was triple faulting. Debugging further, I found that calling the mcount function itself was not an issue, but it would fault when it incremented preempt_count. preempt_count is on the tasks info structure that is on the low memory address of the task's stack. For some reason, it could not write to it. Resuming out of suspend to ram does quite a lot of funny tricks to get to work, so it is not surprising at all that simply doing a preempt_disable() would cause a fault. Thanks to Rafael for suggesting to add a "while (1);" to find the place in resuming that is causing the fault. I would place the loop somewhere in the code, compile and reboot and see if it would either reboot (hit the fault) or simply hang (hit the loop). Doing this over and over again, I narrowed it down that it was happening in enable_nonboot_cpus. At this point, I found that it is easier to simply disable tracing around the suspend code, instead of searching for the particular function that can not handle doing a preempt_disable. This patch disables the tracer as it suspends and reenables it on resume. I tested this patch on my Laptop, and it can resume fine with the patch. Signed-off-by: Steven Rostedt Acked-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/power/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0b7476f5d2a..540b16b6856 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "power.h" @@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error; + int error, ftrace_save; if (!suspend_ops) return -ENOSYS; @@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) -- cgit v1.2.3-70-g09d2 From f3ade837808121ff8bab9c56725f4fe40ec85a56 Mon Sep 17 00:00:00 2001 From: John Blackwood Date: Tue, 26 Aug 2008 15:09:43 -0400 Subject: sched: fix sched_rt_rq_enqueue() resched idle When sysctl_sched_rt_runtime is set to something other than -1 and the CONFIG_RT_GROUP_SCHED kernel parameter is NOT enabled, we get into a state where we see one or more CPUs idling forvever even though there are real-time tasks in their rt runqueue that are able to run (no longer throttled). The sequence is: - A real-time task is running when the timer sets the rt runqueue to throttled, and the rt task is resched_task()ed and switched out, and idle is switched in since there are no non-rt tasks to run on that cpu. - Eventually the do_sched_rt_period_timer() runs and un-throttles the rt runqueue, but we just exit the timer interrupt and go back to executing the idle task in the idle loop forever. If we change the sched_rt_rq_enqueue() routine to use some of the code from the CONFIG_RT_GROUP_SCHED enabled version of this same routine and resched_task() the currently executing task (idle in our case) if it is a lower priority task than the higher rt task in the now un-throttled runqueue, the problem is no longer observed. Signed-off-by: John Blackwood Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b454..07d9b330790 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -- cgit v1.2.3-70-g09d2 From cc2991cf15ae92fa30b3ea9f56a8a5a337bd33c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 07d9b330790..552310798da 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -440,9 +440,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -493,9 +490,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.3-70-g09d2 From 41108eb10142e0552f2de1e4c0675b108c5f018f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2008 14:39:12 +0200 Subject: ftrace: disable tracing for hibernation In accordance with commit f42ac38c59e0a03d6da0c24a63fb211393f484b0 ("ftrace: disable tracing for suspend to ram"), disable tracing around the suspend code in hibernation code paths. Signed-off-by: Rafael J. Wysocki Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f011e0870b5..bbd85c60f74 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "power.h" @@ -255,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error; + int error, ftrace_save; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -366,10 +369,11 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error; + int error, ftrace_save; pm_prepare_console(); suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: + __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error; + int error, ftrace_save; if (!hibernation_ops) return -ENOSYS; @@ -411,6 +416,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -445,6 +451,7 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); -- cgit v1.2.3-70-g09d2 From 316d9679f33caf7e683471647d1472bfe133d858 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Aug 2008 20:06:23 +0200 Subject: Don't trigger softlockup detector on network fs blocked tasks Pulling the ethernet cable on a 2.6.27-rc system with NFS mounts currently leads to an ongoing flood of soft lockup detector backtraces for all tasks blocked on the NFS mounts when the hickup takes longer than 120s. I don't think NFS problems should be all that noisy. Luckily there's a reasonably easy way to distingush this case. Don't report task softlockup warnings for tasks in TASK_KILLABLE state, which is used by the network file systems. I believe this patch is a 2.6.27 candidate. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492fbfc..1a07f8ca4b9 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -180,6 +180,10 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if (t->flags & PF_FROZEN) return; + /* Don't check for tasks waiting on network file systems like NFS */ + if (t->state & TASK_KILLABLE) + return; + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { t->last_switch_count = switch_count; t->last_switch_timestamp = now; -- cgit v1.2.3-70-g09d2 From bef69ea0dcce574a425feb0a5aa4c63dd108b9a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 29 Aug 2008 20:18:31 -0700 Subject: Resource handling: add 'insert_resource_expand_to_fit()' function Not used anywhere yet, but this complements the existing plain 'insert_resource()' functionality with a version that can expand the resource we are adding in order to fix up any conflicts it has with existing resources. Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 1 + kernel/resource.c | 88 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 64 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 22d2115458c..8d3b7a9afd1 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -109,6 +109,7 @@ extern struct resource iomem_resource; extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); +extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, diff --git a/kernel/resource.c b/kernel/resource.c index f5b518eabef..cf0a178c751 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) */ -int insert_resource(struct resource *parent, struct resource *new) +static struct resource * __insert_resource(struct resource *parent, struct resource *new) { - int result; struct resource *first, *next; - write_lock(&resource_lock); - for (;; parent = first) { - result = 0; first = __request_resource(parent, new); if (!first) - goto out; + return first; - result = -EBUSY; if (first == parent) - goto out; + return first; if ((first->start > new->start) || (first->end < new->end)) break; @@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new) for (next = first; ; next = next->sibling) { /* Partial overlap? Bad, and unfixable */ if (next->start < new->start || next->end > new->end) - goto out; + return next; if (!next->sibling) break; if (next->sibling->start > new->end) break; } - result = 0; - new->parent = parent; new->sibling = next->sibling; new->child = first; @@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new) next = next->sibling; next->sibling = new; } + return NULL; +} - out: +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } write_unlock(&resource_lock); - return result; } /** -- cgit v1.2.3-70-g09d2 From c4bacefb7aaf49da11a695f29d85d40909f17693 Mon Sep 17 00:00:00 2001 From: Cordelia Date: Mon, 18 Aug 2008 09:45:51 -0700 Subject: [PATCH] audit: Moved variable declaration to beginning of function got rid of compilation warning: ISO C90 forbids mixed declarations and code Signed-off-by: Cordelia Sam Signed-off-by: Al Viro --- kernel/auditsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 972f8e61d36..59cedfb040e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { + unsigned n; if (unlikely(!ctx)) return 0; - unsigned n = ctx->major; + n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && -- cgit v1.2.3-70-g09d2 From 6781f4ae30bbb8ebf31187b3c9304be16966f5a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Aug 2008 20:31:55 -0700 Subject: kernel/resource.c: fix new kernel-doc warning Fix kernel-doc warning for new function: Warning(linux-2.6.27-rc5-git2//kernel/resource.c:448): No description found for parameter 'root' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index cf0a178c751..03d796c1b2e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -438,7 +438,7 @@ int insert_resource(struct resource *parent, struct resource *new) /** * insert_resource_expand_to_fit - Insert a resource into the resource tree - * @parent: parent of the new resource + * @root: root resource descriptor * @new: new resource to insert * * Insert a resource into the resource tree, possibly expanding it in order -- cgit v1.2.3-70-g09d2 From cbaed698f37494b30b2449b51c728ae48630cb2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 30 Aug 2008 21:08:40 +0400 Subject: softlockup: minor cleanup, don't check task->state twice The recent commit 16d9679f33caf7e683471647d1472bfe133d858 changed check_hung_task() to filter out the TASK_KILLABLE tasks. We can move this check to the caller which has to test t->state anyway. Signed-off-by: Oleg Nesterov Acked-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1a07f8ca4b9..cb838ee93a8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -180,10 +180,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if (t->flags & PF_FROZEN) return; - /* Don't check for tasks waiting on network file systems like NFS */ - if (t->state & TASK_KILLABLE) - return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { t->last_switch_count = switch_count; t->last_switch_timestamp = now; @@ -237,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu) do_each_thread(g, t) { if (!--max_count) goto unlock; - if (t->state & TASK_UNINTERRUPTIBLE) + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now); } while_each_thread(g, t); unlock: -- cgit v1.2.3-70-g09d2 From add0d4dfd660e9e4fd0af3eac3cad23583c9558f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Sep 2008 14:35:48 -0700 Subject: pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing zap_pid_ns_processes() sets pid_ns->child_reaper = NULL, this is wrong. Yes, we have already killed all tasks in this namespace, and sys_wait4() doesn't see any child. But this doesn't mean ->children list is empty, we may have EXIT_DEAD tasks which are not visible to do_wait(). In that case the subsequent forget_original_parent() will crash the kernel because it will try to re-parent these tasks to the NULL reaper. Even if there are no childs, it is not good that forget_original_parent() uses reaper == NULL. Change the code to set ->child_reaper = init_pid_ns.child_reaper instead. We could use pid_ns->parent->child_reaper as well, I think this does not really matter. These EXIT_DEAD tasks are not visible to the new ->parent after re-parenting, they will silently do release_task() eventually. Note that we must change ->child_reaper, otherwise forget_original_parent() will use reaper == father, and in that case we will hit the (correct) BUG_ON(!list_empty(&father->children)). Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Sukadev Bhattiprolu Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ea567b78d1a..598f1eec982 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,9 +179,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; acct_exit_ns(pid_ns); return; } -- cgit v1.2.3-70-g09d2 From 950bbabb5a804690a0201190de5c22837f72f83f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Sep 2008 14:35:49 -0700 Subject: pid_ns: (BUG 11391) change ->child_reaper when init->group_leader exits We don't change pid_ns->child_reaper when the main thread of the subnamespace init exits. As Robert Rex pointed out this is wrong. Yes, the re-parenting itself works correctly, but if the reparented task exits it needs ->parent->nsproxy->pid_ns in do_notify_parent(), and if the main thread is zombie its ->nsproxy was already cleared by exit_task_namespaces(). Introduce the new function, find_new_reaper(), which finds the new ->parent for the re-parenting and changes ->child_reaper if needed. Kill the now unneeded exit_child_reaper(). Also move the changing of ->child_reaper from zap_pid_ns_processes() to find_new_reaper(), this consolidates the games with ->child_reaper and makes it stable under tasklist_lock. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=11391 Reported-by: Robert Rex Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Pavel Emelyanov Acked-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 78 ++++++++++++++++++++++---------------------------- kernel/pid_namespace.c | 6 ---- 2 files changed, 34 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 75c64738763..25ed2ad986d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) * the child reaper process (ie "init") in our pid * space. */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; + struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); - + reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); - list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@ -959,39 +983,6 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 598f1eec982..fab8ea86fac 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,12 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; acct_exit_ns(pid_ns); return; } -- cgit v1.2.3-70-g09d2 From 9d3593574702ae1899e23a1535da1ac71f928042 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Tue, 2 Sep 2008 14:36:13 -0700 Subject: pm_qos_requirement might sleep Make PM_QOS and CPU_IDLE play nicer when run with the RT-Preempt kernel. The purpose of the patch is to remove the spin_lock around the read in the function pm_qos_requirement - since spinlocks can sleep in -rt and this function is called from idle. CPU_IDLE polls the target_value's of some of the pm_qos parameters from the idle loop causing sleeping locking warnings. Changing the target_value to an atomic avoids this issue. Remove the spinlock in pm_qos_requirement by making target_value an atomic type. Signed-off-by: mark gross Signed-off-by: John Kacur Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pm_qos_params.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index da9c2dda6a4..dfdec524d1b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -43,7 +43,7 @@ #include /* - * locking rule: all changes to target_value or requirements or notifiers lists + * locking rule: all changes to requirements or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ @@ -66,7 +66,7 @@ struct pm_qos_object { struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - s32 target_value; + atomic_t target_value; s32 (*comparitor)(s32, s32); }; @@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = { .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = { .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = 0, + .target_value = ATOMIC_INIT(0), .comparitor = max_compare }; @@ -150,11 +150,11 @@ static void update_target(int target) extreme_value = pm_qos_array[target]->comparitor( extreme_value, node->value); } - if (pm_qos_array[target]->target_value != extreme_value) { + if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { call_notifier = 1; - pm_qos_array[target]->target_value = extreme_value; + atomic_set(&pm_qos_array[target]->target_value, extreme_value); pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - pm_qos_array[target]->target_value); + atomic_read(&pm_qos_array[target]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); @@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_requirement(int pm_qos_class) { - int ret_val; - unsigned long flags; - - spin_lock_irqsave(&pm_qos_lock, flags); - ret_val = pm_qos_array[pm_qos_class]->target_value; - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return ret_val; + return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } EXPORT_SYMBOL_GPL(pm_qos_requirement); -- cgit v1.2.3-70-g09d2 From b380b0d4f7dffcc235c0facefa537d4655619101 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2008 17:05:57 +0100 Subject: forgotten refcount on sysctl root table We should've set refcount on the root sysctl table; otherwise we'll blow up the first time we get down to zero dynamically registered sysctl tables. Signed-off-by: Al Viro Tested-by: James Bottomley Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fe471334727..50ec0886fa3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { + .count = 1, .ctl_table = root_table, .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), .root = &sysctl_table_root, -- cgit v1.2.3-70-g09d2