From bd8fbdee6562ee526f3c2582a3b373ef195015dd Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Wed, 3 Sep 2008 17:53:07 +0200 Subject: lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set This patch fixes compilation if CONFIG_TRACE_IRQFLAGS_SUPPORT is ever disabled (which is currently not allowed by Kconfig). Alternatively we could just remove the option altogether and the associated code paths. Since the compilation error has been in the tree for at least two years and no one noticed it, I guess we don't really have the need for CONFIG_TRACE_IRQFLAGS_SUPPORT=n. Boot tested on x86 UP. Signed-off-by: Rui Sousa Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c..f2993512b3b 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -52,10 +52,10 @@ # define start_critical_timings() do { } while (0) #endif -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - #include +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + #define local_irq_enable() \ do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) #define local_irq_disable() \ @@ -84,21 +84,20 @@ * The local_irq_*() APIs are equal to the raw_local_irq*() * if !TRACE_IRQFLAGS. */ -# define raw_local_irq_disable() local_irq_disable() -# define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) \ +#define local_irq_disable() raw_local_irq_disable() +#define local_irq_enable() raw_local_irq_enable() +#define local_irq_save(flags) \ do { \ typecheck(unsigned long, flags); \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ } while (0) -# define raw_local_irq_restore(flags) \ +# define local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #define safe_halt() \ do { \ trace_hardirqs_on(); \ @@ -124,6 +123,5 @@ typecheck(unsigned long, flags); \ raw_irqs_disabled_flags(flags); \ }) -#endif /* CONFIG_X86 */ #endif -- cgit v1.2.3-18-g5258 From cdad72207d164569cb4bf647eb824a7f93e8d388 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Sep 2008 16:00:57 +0200 Subject: lockstat: documentation update Christoph noted that the documentation doesn't tell in what unit the lockstat times are reported, ammend this. Reported-by: Christoph Hellwig Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/lockstat.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 4ba4664ce5c..02f36f5c64f 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -100,6 +100,7 @@ The first lock (05-10) is a read/write lock, and shows two lines above the short separator. The contention points don't match the column descriptors, they have two: contentions and [] symbol. +The integer part of the time values is in us. View the top contending locks: -- cgit v1.2.3-18-g5258 From 8c56250f48347750c82ab18d98d647dcf99ca674 Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Thu, 4 Sep 2008 19:47:53 +0200 Subject: lockdep, UML: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set Hiroshi Shimamoto reported: > > !TRACE_IRQFLAGS_SUPPORT mode of build for future work, it can be > > restored via a simple revert. > > Hi, it seems that this patch breaks uml build. > > kernel/printk.c: In function 'vprintk': > kernel/printk.c:674: error: implicit declaration of function > 'raw_local_irq_save' kernel/printk.c:772: error: implicit declaration of > function 'raw_local_irq_restore' With the patch bellow it compiles (make ARCH=um with a x86 host), but I'm really out of my league on this one... Reported-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- include/asm-um/system-generic.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/asm-um/system-generic.h b/include/asm-um/system-generic.h index 5bcfa35e7a2..f1ea4da34fa 100644 --- a/include/asm-um/system-generic.h +++ b/include/asm-um/system-generic.h @@ -4,15 +4,15 @@ #include "asm/arch/system.h" #undef switch_to -#undef local_irq_save -#undef local_irq_restore -#undef local_irq_disable -#undef local_irq_enable -#undef local_save_flags -#undef local_irq_restore -#undef local_irq_enable -#undef local_irq_disable -#undef local_irq_save +#undef raw_local_irq_save +#undef raw_local_irq_restore +#undef raw_local_irq_disable +#undef raw_local_irq_enable +#undef raw_local_save_flags +#undef raw_local_irq_restore +#undef raw_local_irq_enable +#undef raw_local_irq_disable +#undef raw_local_irq_save #undef irqs_disabled extern void *switch_to(void *prev, void *next, void *last); @@ -23,21 +23,21 @@ extern int get_signals(void); extern void block_signals(void); extern void unblock_signals(void); -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ +#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \ (flags) = get_signals(); } while(0) -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ +#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \ set_signals(flags); } while(0) -#define local_irq_save(flags) do { local_save_flags(flags); \ - local_irq_disable(); } while(0) +#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \ + raw_local_irq_disable(); } while(0) -#define local_irq_enable() unblock_signals() -#define local_irq_disable() block_signals() +#define raw_local_irq_enable() unblock_signals() +#define raw_local_irq_disable() block_signals() #define irqs_disabled() \ ({ \ unsigned long flags; \ - local_save_flags(flags); \ + raw_local_save_flags(flags); \ (flags == 0); \ }) -- cgit v1.2.3-18-g5258 From 1b2439dbb703ae8d95a9ce7ece6b7800b80f41f0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging during some development we suspected a case where we left something in a notifier chain that was from a module that was unloaded already... and that sort of thing is rather hard to track down. This patch adds a very simple sanity check (which isn't all that expensive) to make sure the notifier we're about to call is actually from either the kernel itself of from a still-loaded module, avoiding a hard-to-chase-down crash. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/notifier.c | 16 ++++++++++++++++ lib/Kconfig.debug | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be11584e..143fdd77dbf 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -21,6 +21,10 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { + if (!kernel_text_address((unsigned long)n->notifier_call)) { + WARN(1, "Invalid notifier registered!"); + return 0; + } while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; @@ -34,6 +38,10 @@ static int notifier_chain_register(struct notifier_block **nl, static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { + if (!kernel_text_address((unsigned long)n->notifier_call)) { + WARN(1, "Invalid notifier registered!"); + return 0; + } while ((*nl) != NULL) { if ((*nl) == n) return 0; @@ -82,6 +90,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); + +#ifdef CONFIG_DEBUG_NOTIFIERS + if (!kernel_text_address((unsigned long)nb->notifier_call)) { + WARN(1, "Invalid notifier called!"); + nb = next_nb; + continue; + } +#endif ret = nb->notifier_call(nb, val, v); if (nr_calls) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8b5a7d304a5..342858fbabb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -536,6 +536,16 @@ config DEBUG_SG If unsure, say N. +config DEBUG_NOTIFIERS + bool "Debug notifier call chains" + depends on DEBUG_KERNEL + help + Enable this to turn on sanity checking for notifier call chains. + This is most useful for kernel developers to make sure that + modules properly unregister themselves from notifier chains. + This is a relatively cheap check but if you care about maximum + performance, say N. + config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ -- cgit v1.2.3-18-g5258 From fb822db465bd9fd4208eef1af4490539b236c54e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 11:17:40 +0200 Subject: softlockup: increase hung tasks check from 2 minutes to 8 minutes Andrew says: > Seems that about 100% of the reports we get of this warning triggering > are sys_sync, transaction commit, etc. increase the timeout. If it still triggers for people, we can kill it. Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492fbfc..17a05806533 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024; /* * Zero means infinite timeout - no checking done: */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; unsigned long __read_mostly sysctl_hung_task_warnings = 10; -- cgit v1.2.3-18-g5258 From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging, v2 - unbreak ia64 (and powerpc) where function pointers dont point at code but at data (reported by Tony Luck) [ mingo@elte.hu: various cleanups ] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 +++ kernel/extable.c | 16 ++++++++++++++++ kernel/notifier.c | 10 +--------- lib/vsprintf.c | 2 +- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6..4e1366b552a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,6 +187,9 @@ extern unsigned long long memparse(char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); +extern int func_ptr_is_kernel_text(void *ptr); +extern void *dereference_function_descriptor(void *ptr); + struct pid; extern struct pid *session_of_pgrp(struct pid *pgrp); diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e1702..adf0cc9c02d 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr) return 1; return module_text_address(addr) != NULL; } + +/* + * On some architectures (PPC64, IA64) function pointers + * are actually only tokens to some data that then holds the + * real function address. As a result, to find if a function + * pointer is part of the kernel text, we need to do some + * special dereferencing first. + */ +int func_ptr_is_kernel_text(void *ptr) +{ + unsigned long addr; + addr = (unsigned long) dereference_function_descriptor(ptr); + if (core_kernel_text(addr)) + return 1; + return module_text_address(addr) != NULL; +} diff --git a/kernel/notifier.c b/kernel/notifier.c index 143fdd77dbf..0f39e398ef6 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -21,10 +21,6 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { - if (!kernel_text_address((unsigned long)n->notifier_call)) { - WARN(1, "Invalid notifier registered!"); - return 0; - } while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; @@ -38,10 +34,6 @@ static int notifier_chain_register(struct notifier_block **nl, static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { - if (!kernel_text_address((unsigned long)n->notifier_call)) { - WARN(1, "Invalid notifier registered!"); - return 0; - } while ((*nl) != NULL) { if ((*nl) == n) return 0; @@ -92,7 +84,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, next_nb = rcu_dereference(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS - if (!kernel_text_address((unsigned long)nb->notifier_call)) { + if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d8d1d114224..f5e5ffb9942 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -513,7 +513,7 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio return buf; } -static inline void *dereference_function_descriptor(void *ptr) +void *dereference_function_descriptor(void *ptr) { #if defined(CONFIG_IA64) || defined(CONFIG_PPC64) void *p; -- cgit v1.2.3-18-g5258 From 76b189e91845eab3a9d52bb97f971d312d25652d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Sep 2008 09:57:35 +0200 Subject: lockdep: add might_lock() / might_lock_read() useful to establish a lock dependency in case the actual dependency is rare or hard to trigger. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 331e5f1c2d8..0aa657aa8a1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -480,4 +480,22 @@ static inline void print_irqtrace_events(struct task_struct *curr) # define lock_map_release(l) do { } while (0) #endif +#ifdef CONFIG_PROVE_LOCKING +# define might_lock(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +# define might_lock_read(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +#else +# define might_lock(lock) do { } while (0) +# define might_lock_read(lock) do { } while (0) +#endif + #endif /* __LINUX_LOCKDEP_H */ -- cgit v1.2.3-18-g5258 From c10d38dda1774ed4540380333cabd229eff37094 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths copy_to/from_user and all its variants (except the atomic ones) can take a page fault and perform non-trivial work like taking mmap_sem and entering the filesyste/pagecache. Unfortunately, this often escapes lockdep because a common pattern is to use it to read in some arguments just set up from userspace, or write data back to a hot buffer. In those cases, it will be unlikely for page reclaim to get a window in to cause copy_*_user to fault. With the new might_lock primitives, add some annotations to x86. I don't know if I caught all possible faulting points (it's a bit of a maze, and I didn't really look at 32-bit). But this is a starting point. Boots and runs OK so far. Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 7 ++++++- arch/x86/lib/usercopy_64.c | 4 ++++ include/asm-x86/uaccess.h | 14 ++++++++++++++ include/asm-x86/uaccess_32.h | 10 ++++++++-- include/asm-x86/uaccess_64.h | 12 ++++++++++++ 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 24e60944971..8eedde2a9ca 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -33,6 +33,8 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon do { \ int __d0, __d1, __d2; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -120,6 +122,8 @@ EXPORT_SYMBOL(strncpy_from_user); do { \ int __d0; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -148,7 +152,6 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; @@ -191,6 +194,8 @@ long strnlen_user(const char __user *s, long n) unsigned long res, tmp; might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index f4df6e7c718..847d1294599 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -16,6 +16,8 @@ do { \ long __d0, __d1, __d2; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -65,6 +67,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index 5f702d1d521..ad29752a171 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include @@ -157,6 +159,9 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -241,6 +246,9 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -265,6 +273,9 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -317,6 +328,9 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index 6fdef39a0bc..d725e2d703f 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -82,8 +82,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - return __copy_to_user_inatomic(to, from, n); + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); + return __copy_to_user_inatomic(to, from, n); } static __always_inline unsigned long @@ -138,6 +140,8 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (__builtin_constant_p(n)) { unsigned long ret; @@ -160,6 +164,8 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 515d4dce96b..40a7205fe57 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -28,6 +28,10 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -70,6 +74,10 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -112,6 +120,10 @@ static __always_inline __must_check int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); -- cgit v1.2.3-18-g5258 From 3ee1afa308f2a38e5d1e2ad3752ad7abcf480da1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths, v2 - introduce might_fault() - handle the atomic user copy paths correctly [ mingo@elte.hu: move might_sleep() outside of in_atomic(). ] Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 12 +++--------- arch/x86/lib/usercopy_64.c | 8 ++------ include/asm-x86/uaccess.h | 18 ++++-------------- include/asm-x86/uaccess_32.h | 12 +++--------- include/asm-x86/uaccess_64.h | 12 +++--------- include/linux/kernel.h | 9 +++++++++ mm/memory.c | 15 +++++++++++++++ 7 files changed, 39 insertions(+), 47 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 8eedde2a9ca..7393152a252 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -32,9 +32,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #define __do_strncpy_from_user(dst, src, count, res) \ do { \ int __d0, __d1, __d2; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -121,9 +119,7 @@ EXPORT_SYMBOL(strncpy_from_user); #define __do_clear_user(addr,size) \ do { \ int __d0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -193,9 +189,7 @@ long strnlen_user(const char __user *s, long n) unsigned long mask = -__addr_ok(s); unsigned long res, tmp; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 847d1294599..64d6c84e635 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -15,9 +15,7 @@ #define __do_strncpy_from_user(dst,src,count,res) \ do { \ long __d0, __d1, __d2; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -66,9 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user); unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index ad29752a171..39f8420c75d 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -8,8 +8,6 @@ #include #include #include -#include -#include #include #include @@ -159,9 +157,7 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -246,9 +242,7 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -273,9 +267,7 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -328,9 +320,7 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index d725e2d703f..d10e842ec3e 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -82,9 +82,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); return __copy_to_user_inatomic(to, from, n); } @@ -139,9 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; @@ -163,9 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 40a7205fe57..13fd56fbc3a 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -29,9 +29,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -75,9 +73,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -121,9 +117,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6..e580ec09576 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -140,6 +140,15 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void); +#else +static inline void might_fault(void) +{ + might_sleep(); +} +#endif + extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) diff --git a/mm/memory.c b/mm/memory.c index 1002f473f49..b8fdf4e5e65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3016,3 +3016,18 @@ void print_vma_addr(char *prefix, unsigned long ip) } up_read(¤t->mm->mmap_sem); } + +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void) +{ + might_sleep(); + /* + * it would be nicer only to annotate paths which are not under + * pagefault_disable, however that requires a larger audit and + * providing helpers like get_user_atomic. + */ + if (!in_atomic() && current->mm) + might_lock_read(¤t->mm->mmap_sem); +} +EXPORT_SYMBOL(might_fault); +#endif -- cgit v1.2.3-18-g5258 From 1d18ef489509314506328b9e464dd47c24c1d68f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Sep 2008 20:53:21 +0200 Subject: x86: some lock annotations for user copy paths, v3 - add annotation back to clear_user() - change probe_kernel_address() to _inatomic*() method Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 1 + include/asm-x86/uaccess.h | 2 -- include/linux/uaccess.h | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7393152a252..fab5faba1d3 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -148,6 +148,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { + might_fault(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index 39f8420c75d..dc8edb5c465 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -267,7 +267,6 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -320,7 +319,6 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index fec6decfb98..2062293e57e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ + ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3-18-g5258 From 53b9d87f41a3d8838210ad7cdef02d814817ce85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Sep 2008 17:02:58 -0700 Subject: lock debug: sit tight when we are already in a panic in: > http://bugzilla.kernel.org/show_bug.cgi?id=11543 The panic code called the kexec code which called mutex_trylock() which called spin_lock_mutex() which then stupidly went and blurted a load of debug stuff because of in_interrupt(). Keep the lock debug code from escallating an already crappy situation. Signed-off-by: Ingo Molnar --- include/linux/debug_locks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 4aaa4afb1cb..096476f1fb3 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -17,7 +17,7 @@ extern int debug_locks_off(void); ({ \ int __ret = 0; \ \ - if (unlikely(c)) { \ + if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ WARN_ON(1); \ __ret = 1; \ -- cgit v1.2.3-18-g5258 From 30742d5c2277c325fb0e9d2d817d55a19995fe8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 14:43:39 +0200 Subject: Revert "lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set" This reverts commit bd8fbdee6562ee526f3c2582a3b373ef195015dd. This broke the powerpc build - more fixes are needed before we can undo this revert. --- include/linux/irqflags.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index f2993512b3b..74bde13224c 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -52,10 +52,10 @@ # define start_critical_timings() do { } while (0) #endif -#include - #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +#include + #define local_irq_enable() \ do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) #define local_irq_disable() \ @@ -84,20 +84,21 @@ * The local_irq_*() APIs are equal to the raw_local_irq*() * if !TRACE_IRQFLAGS. */ -#define local_irq_disable() raw_local_irq_disable() -#define local_irq_enable() raw_local_irq_enable() -#define local_irq_save(flags) \ +# define raw_local_irq_disable() local_irq_disable() +# define raw_local_irq_enable() local_irq_enable() +# define raw_local_irq_save(flags) \ do { \ typecheck(unsigned long, flags); \ - raw_local_irq_save(flags); \ + local_irq_save(flags); \ } while (0) -# define local_irq_restore(flags) \ +# define raw_local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ - raw_local_irq_restore(flags); \ + local_irq_restore(flags); \ } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #define safe_halt() \ do { \ trace_hardirqs_on(); \ @@ -123,5 +124,6 @@ typecheck(unsigned long, flags); \ raw_irqs_disabled_flags(flags); \ }) +#endif /* CONFIG_X86 */ #endif -- cgit v1.2.3-18-g5258 From fb71e45338453698bd7460f7e8f171ea0304d218 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 15 Sep 2008 18:04:26 -0700 Subject: uaccess: fix parameters inversion for __copy_from_user_inatomic() The following patch changes to use __copy_from_user_inatomic(), but the passing parameters incorrect: x86: some lock annotations for user copy paths, v3 This fixes the netfilter crash reported by Steven Noonan. Reported-by: Steven Noonan Signed-off-by: Hiroshi Shimamoto Tested-by: Steven Noonan Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2062293e57e..6b58367d145 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ + ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3-18-g5258 From 6918bc5c830e890681eabb3c6cb6b8d117a52d14 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:41 +0200 Subject: lockstat: fixup signed division Some recent modification to this code made me notice the little todo mark. Now that we have more elaborate 64-bit division functions this isn't hard. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 20dbcbf9c7d..8d3a6eba8d5 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) static void snprint_time(char *buf, size_t bufsiz, s64 nr) { - unsigned long rem; + s64 div; + s32 rem; nr += 5; /* for display rounding */ - rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); + div = div_s64_rem(nr, 1000, &rem); + snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) -- cgit v1.2.3-18-g5258 From 38d47c1b7075bd7ec3881141bb3629da58f88dab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:20 +0200 Subject: futex: rely on get_user_pages() for shared futexes On the way of getting rid of the mmap_sem requirement for shared futexes, start by relying on get_user_pages(). Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- include/linux/futex.h | 2 + kernel/futex.c | 162 +++++++++++++++++++++++++------------------------- 2 files changed, 82 insertions(+), 82 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index 586ab56a3ec..8f627b9ae2b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -164,6 +164,8 @@ union futex_key { } both; }; +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } + #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e97c1..a4c39fa0a7a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -161,6 +161,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + */ +static void get_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + atomic_inc(&key->shared.inode->i_count); + break; + case FUT_OFF_MMSHARED: + atomic_inc(&key->private.mm->mm_count); + break; + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. + */ +static void drop_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + iput(key->shared.inode); + break; + case FUT_OFF_MMSHARED: + mmdrop(key->private.mm); + break; + } +} + /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex @@ -184,7 +223,6 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; struct page *page; int err; @@ -210,98 +248,47 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, key->private.address = address; return 0; } - /* - * The futex is hashed differently depending on whether - * it's in a shared or private mapping. So check vma first. - */ - vma = find_extend_vma(mm, address); - if (unlikely(!vma)) - return -EFAULT; - /* - * Permissions. - */ - if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) - return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; +again: + err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); + if (err < 0) + return err; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + put_page(page); + goto again; + } /* * Private mappings are handled in a simple way. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. Therefore we use - * VM_MAYSHARE here, not VM_SHARED which is restricted to shared - * mappings of _writable_ handles. + * the object not the particular process. */ - if (likely(!(vma->vm_flags & VM_MAYSHARE))) { - key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ + if (PageAnon(page)) { + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; - return 0; - } - - /* - * Linear file mappings are also simple. - */ - key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ - if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - + vma->vm_pgoff); - return 0; + } else { + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.inode = page->mapping->host; + key->shared.pgoff = page->index; } - /* - * We could walk the page table to read the non-linear - * pte, and get the page index without fetching the page - * from swap. But that's a lot of code to duplicate here - * for a rare case, so we simply fetch the page. - */ - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - if (err >= 0) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - put_page(page); - return 0; - } - return err; -} + get_futex_key_refs(key); -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (key->both.ptr == NULL) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); - break; - case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); - break; - } + unlock_page(page); + put_page(page); + return 0; } -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. - */ -static void drop_futex_key_refs(union futex_key *key) +static inline +void put_futex_key(struct rw_semaphore *fshared, union futex_key *key) { - if (!key->both.ptr) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } + drop_futex_key_refs(key); } static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) @@ -385,6 +372,7 @@ static int refill_pi_state_cache(void) /* pi_mutex gets initialized later */ pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); + pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -462,7 +450,7 @@ void exit_pi_state_list(struct task_struct *curr) struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; if (!futex_cmpxchg_enabled) return; @@ -725,7 +713,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, struct futex_hash_bucket *hb; struct futex_q *this, *next; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret; if (!bitset) @@ -760,6 +748,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: + put_futex_key(fshared, &key); futex_unlock_mm(fshared); return ret; } @@ -773,7 +762,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; @@ -873,6 +862,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); futex_unlock_mm(fshared); return ret; @@ -886,7 +877,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; @@ -974,6 +965,8 @@ out_unlock: drop_futex_key_refs(&key1); out: + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); futex_unlock_mm(fshared); return ret; } @@ -1220,6 +1213,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, retry: futex_lock_mm(fshared); + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1360,6 +1354,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: + put_futex_key(fshared, &q.key); futex_unlock_mm(fshared); return ret; } @@ -1411,6 +1406,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, retry: futex_lock_mm(fshared); + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1625,6 +1621,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: + put_futex_key(fshared, &q.key); futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1671,7 +1668,7 @@ static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) struct futex_q *this, *next; u32 uval; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret, attempt = 0; retry: @@ -1744,6 +1741,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: + put_futex_key(fshared, &key); futex_unlock_mm(fshared); return ret; -- cgit v1.2.3-18-g5258 From 61270708ecf1cda148e84fbf6e0703ee5aa81814 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:21 +0200 Subject: futex: reduce mmap_sem usage now that we rely on get_user_pages() for the shared key handling move all the mmap_sem stuff closely around the slow paths. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 83 +++------------------------------------------------------- 1 file changed, 4 insertions(+), 79 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index a4c39fa0a7a..6a726684217 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -122,24 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared - */ -static inline void futex_lock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - down_read(fshared); -} - -/* - * Release mm->mmap_sem, when the futex is shared - */ -static inline void futex_unlock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - up_read(fshared); -} - /* * We hash on the keys returned from get_futex_key (see below). */ @@ -250,7 +232,9 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } again: + down_read(&mm->mmap_sem); err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); + up_read(&mm->mmap_sem); if (err < 0) return err; @@ -327,8 +311,7 @@ static int futex_handle_fault(unsigned long address, if (attempt > 2) return ret; - if (!fshared) - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { @@ -348,8 +331,7 @@ static int futex_handle_fault(unsigned long address, current->min_flt++; } } - if (!fshared) - up_read(&mm->mmap_sem); + up_read(&mm->mmap_sem); return ret; } @@ -719,8 +701,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, if (!bitset) return -EINVAL; - futex_lock_mm(fshared); - ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -749,7 +729,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: put_futex_key(fshared, &key); - futex_unlock_mm(fshared); return ret; } @@ -769,8 +748,6 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, op_ret, attempt = 0; retryfull: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -821,12 +798,6 @@ retry: goto retry; } - /* - * If we would have faulted, release mmap_sem, - * fault it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(dummy, uaddr2); if (ret) return ret; @@ -864,7 +835,6 @@ retry: out: put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - futex_unlock_mm(fshared); return ret; } @@ -884,8 +854,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, drop_count = 0; retry: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -908,12 +876,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, if (hb1 != hb2) spin_unlock(&hb2->lock); - /* - * If we would have faulted, release mmap_sem, fault - * it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(curval, uaddr1); if (!ret) @@ -967,7 +929,6 @@ out_unlock: out: put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - futex_unlock_mm(fshared); return ret; } @@ -1211,8 +1172,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; q.bitset = bitset; retry: - futex_lock_mm(fshared); - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1245,12 +1204,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret)) { queue_unlock(&q, hb); - /* - * If we would have faulted, release mmap_sem, fault it in and - * start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret) @@ -1264,12 +1217,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - /* * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it @@ -1355,7 +1302,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: put_futex_key(fshared, &q.key); - futex_unlock_mm(fshared); return ret; } @@ -1404,8 +1350,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - futex_lock_mm(fshared); - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1495,7 +1439,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * exit to complete. */ queue_unlock(&q, hb); - futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1527,12 +1470,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - WARN_ON(!q.pi_state); /* * Block on the PI mutex: @@ -1545,7 +1482,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = ret ? 0 : -EWOULDBLOCK; } - futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1611,7 +1547,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* Unqueue and drop the lock */ unqueue_me_pi(&q); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1622,7 +1557,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: put_futex_key(fshared, &q.key); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); return ret; @@ -1646,8 +1580,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@ -1679,10 +1611,6 @@ retry: */ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - /* - * First take all the futex related locks: - */ - futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1742,7 +1670,6 @@ out_unlock: spin_unlock(&hb->lock); out: put_futex_key(fshared, &key); - futex_unlock_mm(fshared); return ret; @@ -1766,8 +1693,6 @@ pi_faulted: goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; -- cgit v1.2.3-18-g5258 From 734b05b10e51d4ba38c8fc3ee02e846aab09eedf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:22 +0200 Subject: futex: use fast_gup() Change the get_user_pages() call with fast_gup() which doesn't require holding the mmap_sem thereby removing the mmap_sem from all fast paths. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6a726684217..facf17d1a70 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -232,9 +232,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } again: - down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - up_read(&mm->mmap_sem); + err = get_user_pages_fast(address, 1, 0, &page); if (err < 0) return err; -- cgit v1.2.3-18-g5258 From c2f9f20154bfb137ccdf8c9159992429a40dfe20 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:23 +0200 Subject: futex: cleanup fshared fshared doesn't need to be a rw_sem pointer anymore, so clean that up. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index facf17d1a70..60b47bb9e3d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -200,8 +200,7 @@ static void drop_futex_key_refs(union futex_key *key) * For other futexes, it points to ¤t->mm->mmap_sem and * caller must have taken the reader lock. but NOT any spinlocks. */ -static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, - union futex_key *key) +static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -268,7 +267,7 @@ again: } static inline -void put_futex_key(struct rw_semaphore *fshared, union futex_key *key) +void put_futex_key(int fshared, union futex_key *key) { drop_futex_key_refs(key); } @@ -297,10 +296,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) /* * Fault handling. - * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, - struct rw_semaphore *fshared, int attempt) +static int futex_handle_fault(unsigned long address, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; @@ -687,8 +684,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake, u32 bitset) +static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -735,8 +731,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -790,7 +785,7 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + attempt); if (ret) goto out; goto retry; @@ -841,8 +836,7 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -1048,8 +1042,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, - struct rw_semaphore *fshared) + struct task_struct *newowner, int fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1128,7 +1121,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); + ret = futex_handle_fault((unsigned long)uaddr, attempt++); spin_lock(q->lock_ptr); @@ -1152,7 +1145,7 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; @@ -1307,13 +1300,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - struct rw_semaphore *fshared = NULL; + int fshared = 0; ktime_t t; t.tv64 = restart->futex.time; restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) - fshared = ¤t->mm->mmap_sem; + fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, restart->futex.bitset); } @@ -1325,7 +1318,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -1571,8 +1564,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out_release_sem; goto retry_unlocked; @@ -1592,7 +1584,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) +static int futex_unlock_pi(u32 __user *uaddr, int fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -1683,8 +1675,7 @@ pi_faulted: spin_unlock(&hb->lock); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out; uval = 0; @@ -1816,8 +1807,7 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1, - FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } @@ -1913,10 +1903,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; - struct rw_semaphore *fshared = NULL; + int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = ¤t->mm->mmap_sem; + fshared = 1; switch (cmd) { case FUTEX_WAIT: -- cgit v1.2.3-18-g5258 From 42569c39917a08e8de1e8b5685463be7b74baebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Sep 2008 12:33:07 +0200 Subject: futex: fixup get_futex_key() for private futexes With the get_user_pages_fast() patches we made get_futex_key() obtain a reference on the returned key, but failed to do so for private futexes. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/futex.c b/kernel/futex.c index 60b47bb9e3d..62cbd648e28 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -227,6 +227,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) return -EFAULT; key->private.mm = mm; key->private.address = address; + get_futex_key_refs(key); return 0; } -- cgit v1.2.3-18-g5258 From 7317d7b87edb41a9135e30be1ec3f7ef817c53dd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 30 Sep 2008 20:50:27 +1000 Subject: sched: improve preempt debugging This patch helped me out with a problem I recently had.... Basically, when the kernel lock is held, then preempt_count underflow does not get detected until it is released which may be a long time (and arbitrarily, eg at different points it may be rescheduled). If the bkl is released at schedule, the resulting output is actually fairly cryptic... Wi