From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 15 Jun 2009 17:17:47 +0200 Subject: sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, &param); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param); Signed-off-by: Lennart Poettering Acked-by: Peter Zijlstra LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8ec9d13140b..32e6ede8525 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,12 +2613,28 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Make sure we do not leak PI boosting priority to the child: + * Revert to default priority/policy on fork if requested. Make sure we + * do not leak PI boosting priority to the child. */ - p->prio = current->normal_prio; + if (current->sched_reset_on_fork && + (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) + p->policy = SCHED_NORMAL; + + if (current->sched_reset_on_fork && + (current->normal_prio < DEFAULT_PRIO)) + p->prio = DEFAULT_PRIO; + else + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + /* + * We don't need the reset flag anymore after the fork.
It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -6094,17 +6110,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6148,6 +6172,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6191,6 +6219,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6307,14 +6337,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ -- cgit v1.2.3-70-g09d2 From b9dc29e72fd3dc2a739ce8eafd958220d0745734 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:46:01 +0200 Subject: sched: Clean up SCHED_RESET_ON_FORK Make SCHED_RESET_ON_FORK sched_fork() bits a self-contained unlikely code path. Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228361.18329.6.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 32e6ede8525..50e4e3d15e8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,28 +2613,30 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Revert to default priority/policy on fork if requested. Make sure we - * do not leak PI boosting priority to the child. + * Make sure we do not leak PI boosting priority to the child. */ - if (current->sched_reset_on_fork && - (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) - p->policy = SCHED_NORMAL; + p->prio = current->normal_prio; - if (current->sched_reset_on_fork && - (current->normal_prio < DEFAULT_PRIO)) - p->prio = DEFAULT_PRIO; - else - p->prio = current->normal_prio; + /* + * Revert to default priority/policy on fork if requested. 
+ */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + p->policy = SCHED_NORMAL; + + if (p->normal_prio < DEFAULT_PRIO) + p->prio = DEFAULT_PRIO; + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); -- cgit v1.2.3-70-g09d2 From 6c697bdf08a09ce461e305a22362973036e95db3 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:48:02 +0200 Subject: sched: Add SCHED_RESET_ON_FORK functionality for nice < 0 tasks Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228482.27326.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 50e4e3d15e8..34f94240642 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2627,6 +2627,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->normal_prio < DEFAULT_PRIO) p->prio = DEFAULT_PRIO; + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + set_load_weight(p); + } + /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: -- cgit v1.2.3-70-g09d2 From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. 
Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c39..05d86407188 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..328d2f8b8c3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05d..a774c2acbe6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d6..f4a3cc62d28 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eab..7a8698b913f 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f..a48a90076d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f05..e5000343dd6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t 
align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c..602d724afa5 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..c77c6487552 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a4735..df34ceae0c6 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. 
+ */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd885..b14984566f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3-70-g09d2 From b9bf3121af348d9255f1c917830fe8c2df52efcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:47 +0900 Subject: percpu: use DEFINE_PER_CPU_SHARED_ALIGNED() There are a few places where ___cacheline_aligned* is used with DEFINE_PER_CPU(). Use DEFINE_PER_CPU_SHARED_ALIGNED() instead. DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs. While all other converted places used _in_smp variant or only get compiled for SMP, net/rds used unconditional ____cacheline_aligned. I don't see any reason these data structures should be aligned on UP and thus converted together. 
Signed-off-by: Tejun Heo Cc: Mike Frysinger Cc: Tony Luck Cc: Andy Grover --- arch/blackfin/mm/sram-alloc.c | 6 +++--- arch/ia64/kernel/smp.c | 3 ++- kernel/sched.c | 4 ++-- net/rds/ib_stats.c | 2 +- net/rds/iw_stats.c | 2 +- net/rds/page.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 0bc3c4ef0aa..99e4dbb1dfd 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -42,9 +42,9 @@ #include #include "blackfin_sram.h" -static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock); static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp; /* the data structure for L1 scratchpad and DATA SRAM */ diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 94cf78ba28f..93ebfea43c6 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,8 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS], + shadow_flush_counts); #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..34fd81d2178 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -318,12 +318,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 02e3e3d50d4..301ae51ae40 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "ib.h" -DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static char *rds_ib_stat_names[] = { "ib_connect_raced", diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c index ccc7e8f0bf0..fafea3cc92d 100644 --- a/net/rds/iw_stats.c +++ b/net/rds/iw_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "iw.h" -DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); static char *rds_iw_stat_names[] = { "iw_connect_raced", diff --git a/net/rds/page.c b/net/rds/page.c index c460743a89a..de7bb84bcd7 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -39,7 +39,7 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU(struct 
rds_page_remainder, rds_page_remainders) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. -- cgit v1.2.3-70-g09d2 From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- block/as-iosched.c | 10 +++++----- block/cfq-iosched.c | 10 +++++----- drivers/cpufreq/cpufreq_conservative.c | 12 ++++++------ drivers/cpufreq/cpufreq_ondemand.c | 15 ++++++++------- drivers/xen/events.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- mm/kmemleak-test.c | 6 +++--- mm/page-writeback.c | 5 +++-- net/ipv4/syncookies.c | 5 +++-- net/ipv6/syncookies.c | 5 +++-- 13 files changed, 58 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..cba8cd3e957 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. 
*/ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1914,7 +1914,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4946288d683..5fdf63aaaba 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -901,7 +901,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1089,7 +1089,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1561,8 +1561,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static void @@ -1709,9 +1709,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d..ce8ba57c655 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. 
*/ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa6..0f1cc7d3855 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1422,7 +1422,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1431,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1557,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2658,7 +2658,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b..a7ef465c83b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,7 +65,7 @@ struct cpu_dbs_info_s { int cpu; unsigned int enable:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -138,7 +138,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -298,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +388,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, 
wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -528,7 +528,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -548,7 +548,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d172935..36f292a7bd0 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -73,7 +73,7 @@ struct cpu_dbs_info_s { unsigned int enable:1, sample_type:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -151,7 +151,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -196,7 +197,7 @@ static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i); dbs_info->freq_table = cpufreq_frequency_get_table(i); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -391,7 +392,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -548,7 +549,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -570,7 +571,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ab581fa6268..7d2987e9b1b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. 
For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..1fd7a2e7575 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..54b1de5074b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f52..177a5169bbd 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. 
This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935..2c075dcf03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -607,6 +607,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +626,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,7 +638,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 84d90f2799b..a6e0e077ac3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 23d0d6db046..6b6ae913b5d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining -- cgit v1.2.3-70-g09d2 From c17ef45342cc033fdf7bdd5b28615e0090f8d2e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2009 17:12:47 -0700 Subject: rcu: Remove Classic RCU Remove Classic RCU, given that the combination of Tree RCU and the proposed Bloatwatch RCU do everything that Classic RCU can with fewer bugs. Tree RCU has been default in x86 builds for almost six months, and seems to be quite reliable, so there does not seem to be much justification for keeping the Classic RCU code and config complexity around anymore. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 178 ---------- include/linux/rcupdate.h | 4 +- init/Kconfig | 12 +- kernel/Makefile | 1 - kernel/rcuclassic.c | 807 --------------------------------------------- 5 files changed, 3 insertions(+), 999 deletions(-) delete mode 100644 include/linux/rcuclassic.h delete mode 100644 kernel/rcuclassic.c (limited to 'kernel') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h deleted file mode 100644 index bfd92e1e5d2..00000000000 --- a/include/linux/rcuclassic.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (classic version) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Author: Dipankar Sarma - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ - -#ifndef __LINUX_RCUCLASSIC_H -#define __LINUX_RCUCLASSIC_H - -#include -#include -#include -#include -#include - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - long pending; /* Number of the last pending batch */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - unsigned long gp_start; /* Time at which GP started in jiffies. */ - unsigned long jiffies_stall; - /* Time at which to check for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */ - /* current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* Per-CPU data for Read-Copy UPdate. */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. 
*/ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - /* - * if nxtlist is not NULL, then: - * batch: - * The batch # for the last entry of nxtlist - * [*nxttail[1], NULL = *nxttail[2]): - * Entries that batch # <= batch - * [*nxttail[0], *nxttail[1]): - * Entries that batch # <= batch - 1 - * [nxtlist, *nxttail[0]): - * Entries that batch # <= batch - 2 - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - */ - long batch; - struct rcu_head *nxtlist; - struct rcu_head **nxttail[3]; - long qlen; /* # of queued callbacks */ - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -extern void rcu_qsctr_inc(int cpu); -extern void rcu_bh_qsctr_inc(int cpu); - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() \ - lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) -#else -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -#endif - -#define __rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - rcu_read_acquire(); \ - } while (0) -#define __rcu_read_unlock() \ - do { \ - rcu_read_release(); \ - __release(RCU); \ - preempt_enable(); \ - } while (0) -#define __rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - rcu_read_acquire(); \ - } while (0) -#define __rcu_read_unlock_bh() \ - do { \ - rcu_read_release(); \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while (0) - -#define __synchronize_sched() synchronize_rcu() - -#define call_rcu_sched(head, func) call_rcu(head, func) - -extern void __rcu_init(void); -#define rcu_init_sched() do { } while (0) -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); - -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); - -#define rcu_enter_nohz() do { } while (0) -#define rcu_exit_nohz() do { } while (0) - -/* A context switch is a grace period for rcuclassic. */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - -#endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 15fbb3ca634..0cdfdb622fa 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -54,9 +54,7 @@ struct rcu_head { /* Internal to kernel, but needed by rcupreempt.h. */ extern int rcu_scheduler_active; -#if defined(CONFIG_CLASSIC_RCU) -#include -#elif defined(CONFIG_TREE_RCU) +#if defined(CONFIG_TREE_RCU) #include #elif defined(CONFIG_PREEMPT_RCU) #include diff --git a/init/Kconfig b/init/Kconfig index 1ce05a4cb5f..d10f31dfa0b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -316,21 +316,13 @@ choice prompt "RCU Implementation" default TREE_RCU -config CLASSIC_RCU - bool "Classic RCU" - help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. - - Select this option if you are unsure. 
- config TREE_RCU bool "Tree-based hierarchical RCU" help This option selects the RCU implementation that is designed for very large SMP system with hundreds or - thousands of CPUs. + thousands of CPUs. It also scales down nicely to + smaller systems. config PREEMPT_RCU bool "Preemptible RCU" diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec9..d6042bcc9e9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,7 +80,6 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c deleted file mode 100644 index 0f2b0b31130..00000000000 --- a/kernel/rcuclassic.c +++ /dev/null @@ -1,807 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); -static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. 
- */ -void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} - -void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - unsigned long flags; - - set_need_resched(); - spin_lock_irqsave(&rcp->lock, flags); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - * - * cpu_online_mask is updated by the _cpu_down() - * using __stop_machine(). Since we're in irqs disabled - * section, __stop_machine() is not exectuting, hence - * the cpu_online_mask is stable. - * - * However, a cpu might have been offlined _just_ before - * we disabled irqs while entering here. - * And rcu subsystem might not yet have handled the CPU_DEAD - * notification, leading to the offlined cpu's bit - * being set in the rcp->cpumask. - * - * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent - * sending smp_reschedule() to an offlined CPU. - */ - for_each_cpu_and(cpu, - to_cpumask(rcp->cpumask), cpu_online_mask) { - if (cpu != rdp->cpu) - smp_send_reschedule(cpu); - } - } - spin_unlock_irqrestore(&rcp->lock, flags); -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - set_need_resched(); -} -#endif - -static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - long batch; - - head->next = NULL; - smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ - - /* - * Determine the batch number of this callback. - * - * Using ACCESS_ONCE to avoid the following error when gcc eliminates - * local variable "batch" and emits codes like this: - * 1) rdp->batch = rcp->cur + 1 # gets old value - * ...... - * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value - * then [*nxttail[0], *nxttail[1]) may contain callbacks - * that batch# = rdp->batch, see the comment of struct rcu_data. - */ - batch = ACCESS_ONCE(rcp->cur) + 1; - - if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { - /* process callbacks */ - rdp->nxttail[0] = rdp->nxttail[1]; - rdp->nxttail[1] = rdp->nxttail[2]; - if (rcu_batch_after(batch - 1, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[2]; - } - - rdp->batch = batch; - *rdp->nxttail[2] = head; - rdp->nxttail[2] = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } -} - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = jiffies - rcp->jiffies_stall; - if (delta < 2 || rcp->cur != rcp->completed) { - spin_unlock_irqrestore(&rcp->lock, flags); - return; - } - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... 
*/ - - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) - printk(" %d", cpu); - } - printk(" (detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rcp->gp_start)); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", - smp_processor_id(), jiffies, - jiffies - rcp->gp_start); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(jiffies - rcp->jiffies_stall) >= 0) - rcp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - long delta; - - delta = jiffies - rcp->jiffies_stall; - if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && - delta >= 0) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rcp); - - } else if (rcp->cur != rcp->completed && delta >= 2) { - - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/* - * Return the number of RCU batches processed thus far. 
Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* Raises the softirq for processing rcu_callbacks. */ -static inline void raise_rcu_softirq(void) -{ - raise_softirq(RCU_SOFTIRQ); -} - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. - */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_save(flags); - rdp->qlen -= count; - local_irq_restore(flags); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - raise_rcu_softirq(); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ - -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->cur != rcp->pending && - rcp->completed == rcp->cur) { - rcp->cur++; - record_gp_stall_check_time(rcp); - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpumask_andnot(to_cpumask(rcp->cpumask), - cpu_online_mask, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); - if (cpumask_empty(to_cpumask(rcp->cpumask))) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. 
- */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock_irqsave(&rcp->lock, flags); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock_irqrestore(&rcp->lock, flags); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail, long batch) -{ - unsigned long flags; - - if (list) { - local_irq_save(flags); - this_rdp->batch = batch; - *this_rdp->nxttail[2] = list; - this_rdp->nxttail[2] = tail; - local_irq_restore(flags); - } -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - unsigned long flags; - - /* - * if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_irqsave(&rcp->lock, flags); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock(&rcp->lock); - - this_rdp->qlen += rdp->qlen; - local_irq_restore(flags); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from softirq context. 
- */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - long completed_snap; - - if (rdp->nxtlist) { - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * move the other grace-period-completed entries to - * [rdp->nxtlist, *rdp->nxttail[0]) temporarily - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) - rdp->nxttail[0] = rdp->nxttail[1]; - - /* - * the grace period for entries in - * [rdp->nxtlist, *rdp->nxttail[0]) has completed and - * move these entries to donelist - */ - if (rdp->nxttail[0] != &rdp->nxtlist) { - *rdp->donetail = rdp->nxtlist; - rdp->donetail = rdp->nxttail[0]; - rdp->nxtlist = *rdp->nxttail[0]; - *rdp->donetail = NULL; - - if (rdp->nxttail[1] == rdp->nxttail[0]) - rdp->nxttail[1] = &rdp->nxtlist; - if (rdp->nxttail[2] == rdp->nxttail[0]) - rdp->nxttail[2] = &rdp->nxtlist; - rdp->nxttail[0] = &rdp->nxtlist; - } - - local_irq_restore(flags); - - if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags2; - - /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags2); - if (rcu_batch_after(rdp->batch, rcp->pending)) { - rcp->pending = rdp->batch; - rcu_start_batch(rcp); - } - spin_unlock_irqrestore(&rcp->lock, flags2); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be see before any RCU - * grace-period manupulations below. - */ - - smp_mb(); /* See above block comment. */ - - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be see after any RCU - * grace-period manupulations above. - */ - - smp_mb(); /* See above block comment. */ -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp); - - if (rdp->nxtlist) { - long completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - return 1; - if (!rcu_batch_before(completed_snap, rdp->batch - 1) && - rdp->nxttail[0] != rdp->nxttail[1]) - return 1; - if (rdp->nxttail[0] != &rdp->nxtlist) - return 1; - - /* - * This cpu has pending rcu entries and the new batch - * for then hasn't been started nor scheduled start - */ - if (rcu_batch_after(rdp->batch, rcp->pending)) - return 1; - } - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. 
- */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); -} - -/* - * Top-level function driving RCU grace-period detection, normally - * invoked from the scheduler-clock interrupt. This function simply - * increments counters that are read only from softirq by this same - * CPU, so there are no memory barriers required. - */ -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. - * - * Also do a memory barrier. This is needed to handle - * the case where writes from a preempt-disable section - * of code get reordered into schedule() by this CPU's - * write buffer. The memory barrier makes sure that - * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see - * by other CPUs to happen after any such write. - */ - - smp_mb(); /* See above block comment. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. The memory barrier - * is needed for the same reason as is the above one. - */ - - smp_mb(); /* See above block comment. */ - rcu_bh_qsctr_inc(cpu); - } - raise_rcu_softirq(); -} - -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - spin_lock_irqsave(&rcp->lock, flags); - memset(rdp, 0, sizeof(*rdp)); - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void __cpuinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. 
- * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ -void __init __rcu_init(void) -{ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -- cgit v1.2.3-70-g09d2 From 9e48858f7d36a6a3849f1d1b40c3bf5624b4ee7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 May 2009 19:26:19 +1000 Subject: security: rename ptrace_may_access => ptrace_access_check The ->ptrace_may_access() methods are named confusingly - the real ptrace_may_access() returns a bool, while these security checks have a retval convention. Rename it to ptrace_access_check, to reduce the confusion factor. [ Impact: cleanup, no code changed ] Signed-off-by: Ingo Molnar Signed-off-by: James Morris --- include/linux/security.h | 14 +++++++------- kernel/ptrace.c | 2 +- security/capability.c | 2 +- security/commoncap.c | 4 ++-- security/security.c | 4 ++-- security/selinux/hooks.c | 6 +++--- security/smack/smack_lsm.c | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 5eff459b383..145909165db 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -52,7 +52,7 @@ struct audit_krule; extern int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, int audit); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); +extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset(struct cred *new, const struct cred *old, @@ -1209,7 +1209,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @alter contains the flag indicating whether changes are to be made. * Return 0 if permission is granted. * - * @ptrace_may_access: + * @ptrace_access_check: * Check permission before allowing the current process to trace the * @child process. * Security modules may also want to perform a process tracing check @@ -1224,7 +1224,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself * to the @parent process for tracing. - * The parent process will still have to undergo the ptrace_may_access + * The parent process will still have to undergo the ptrace_access_check * checks before it is allowed to trace this one. * @parent contains the task_struct structure for debugger process. * Return 0 if permission is granted. 
@@ -1336,7 +1336,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace_may_access) (struct task_struct *child, unsigned int mode); + int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); int (*ptrace_traceme) (struct task_struct *parent); int (*capget) (struct task_struct *target, kernel_cap_t *effective, @@ -1617,7 +1617,7 @@ extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); /* Security operations */ -int security_ptrace_may_access(struct task_struct *child, unsigned int mode); +int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, kernel_cap_t *effective, @@ -1798,10 +1798,10 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace_may_access(struct task_struct *child, +static inline int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { - return cap_ptrace_may_access(child, mode); + return cap_ptrace_access_check(child, mode); } static inline int security_ptrace_traceme(struct task_struct *parent) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61c78b2c07b..9a4184e04f2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace_may_access(task, mode); + return security_ptrace_access_check(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) diff --git a/security/capability.c b/security/capability.c index 21b6cead6a8..f218dd36164 100644 --- a/security/capability.c +++ b/security/capability.c @@ -863,7 +863,7 @@ struct security_operations default_security_ops = { void security_fixup_ops(struct security_operations *ops) { - set_to_cap_if_null(ops, ptrace_may_access); + set_to_cap_if_null(ops, ptrace_access_check); set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); set_to_cap_if_null(ops, capset); diff --git a/security/commoncap.c b/security/commoncap.c index 48b7e0228fa..aa97704564d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -101,7 +101,7 @@ int cap_settime(struct timespec *ts, struct timezone *tz) } /** - * cap_ptrace_may_access - Determine whether the current process may access + * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. @@ -109,7 +109,7 @@ int cap_settime(struct timespec *ts, struct timezone *tz) * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. 
*/ -int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) +int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; diff --git a/security/security.c b/security/security.c index dc7674fbfc7..4501c5e1f98 100644 --- a/security/security.c +++ b/security/security.c @@ -124,9 +124,9 @@ int register_security(struct security_operations *ops) /* Security operations */ -int security_ptrace_may_access(struct task_struct *child, unsigned int mode) +int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { - return security_ops->ptrace_may_access(child, mode); + return security_ops->ptrace_access_check(child, mode); } int security_ptrace_traceme(struct task_struct *parent) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d6f64783acd..e3b4f3083dd 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1854,12 +1854,12 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_ptrace_may_access(struct task_struct *child, +static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc; - rc = cap_ptrace_may_access(child, mode); + rc = cap_ptrace_access_check(child, mode); if (rc) return rc; @@ -5315,7 +5315,7 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) static struct security_operations selinux_ops = { .name = "selinux", - .ptrace_may_access = selinux_ptrace_may_access, + .ptrace_access_check = selinux_ptrace_access_check, .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, .capset = selinux_capset, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0023182078c..1c9bdbcbe3d 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -91,7 +91,7 @@ struct inode_smack *new_inode_smack(char *smack) */ /** - * smack_ptrace_may_access - Smack approval on PTRACE_ATTACH + * smack_ptrace_access_check - Smack approval on PTRACE_ATTACH * @ctp: child task pointer * @mode: ptrace attachment mode * @@ -99,13 +99,13 @@ struct inode_smack *new_inode_smack(char *smack) * * Do the capability checks, and require read and write. */ -static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) +static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode) { int rc; struct smk_audit_info ad; char *sp, *tsp; - rc = cap_ptrace_may_access(ctp, mode); + rc = cap_ptrace_access_check(ctp, mode); if (rc != 0) return rc; @@ -3032,7 +3032,7 @@ static void smack_release_secctx(char *secdata, u32 seclen) struct security_operations smack_ops = { .name = "smack", - .ptrace_may_access = smack_ptrace_may_access, + .ptrace_access_check = smack_ptrace_access_check, .ptrace_traceme = smack_ptrace_traceme, .syslog = smack_syslog, -- cgit v1.2.3-70-g09d2 From 54d35f29f49224d86b994acb6e5969b9ba09022d Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 29 Jun 2009 14:44:57 +0900 Subject: sched: Hide runqueues from direct reference at source code level for __raw_get_cpu_var() Hide __raw_get_cpu_var() as well - thus all the direct references to runqueues will abstracted out. 
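As a rough illustration of the conversion pattern this patch applies (a sketch only; the helper and identifiers below are the ones visible in the diff, nothing new is added), an open-coded access to the local runqueue of the form

	struct rq *rq = &__raw_get_cpu_var(runqueues);

now goes through the new accessor instead:

	#define raw_rq()	(&__raw_get_cpu_var(runqueues))

	struct rq *rq = raw_rq();

so the runqueues per-cpu variable is only touched directly by the accessor macros at the top of kernel/sched.c.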
Signed-off-by: Hitoshi Mitake LKML-Reference: <20090629.144457.886429910353660979.mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 168b2680ae2..ebc5151f88c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -692,6 +692,7 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) inline void update_rq_clock(struct rq *rq) { @@ -6669,7 +6670,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); @@ -6681,7 +6682,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); -- cgit v1.2.3-70-g09d2 From c5cb5a2d8d7dc872cf1504091ad0e59fe5ff7cb5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 30 Jun 2009 17:08:14 -0400 Subject: kprobes: Clean up insn_pages by using list instead of hlist Use struct list instead of struct hlist for managing insn_pages, because insn_pages doesn't use hash table. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli LKML-Reference: <20090630210814.17851.64651.stgit@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516..6fe9dc6d1a8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) struct kprobe_insn_page { - struct hlist_node hlist; + struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; @@ -117,7 +117,7 @@ enum kprobe_slot_state { }; static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static struct hlist_head kprobe_insn_pages; +static LIST_HEAD(kprobe_insn_pages); static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -152,10 +152,9 @@ loop_end: static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; - struct hlist_node *pos; retry: - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) kfree(kip); return NULL; } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); + INIT_LIST_HEAD(&kip->list); + list_add(&kip->list, &kprobe_insn_pages); memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); kip->slot_used[0] = SLOT_USED; kip->nused = 1; @@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. 
*/ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { + if (!list_is_singular(&kprobe_insn_pages)) { + list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); } @@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) static int __kprobes collect_garbage_slots(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos, *next; + struct kprobe_insn_page *kip, *next; /* Ensure no-one is preepmted on the garbages */ if (check_safety()) return -EAGAIN; - hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { + list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { int i; if (kip->ngarbage == 0) continue; @@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void) void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; - struct hlist_node *pos; mutex_lock(&kprobe_insn_mutex); - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; if (dirty) { kip->slot_used[i] = SLOT_DIRTY; kip->ngarbage++; - } else { + } else collect_one_slot(kip, i); - } break; } } -- cgit v1.2.3-70-g09d2 From 020e5f85cb087a40572c8b8b2dd06292a14fa212 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Jul 2009 10:47:05 +0800 Subject: tracing/events: Add trace_event boot option We already have ftrace= boot option, and this adds a similar boot option for trace events, so allow trace events to be enabled at boot, for boot debugging purpose. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4ACE29.3010407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/trace/events.txt | 9 +++++++++ kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 37 +++++++++++++++++++++++++++++++++---- 5 files changed, 52 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d3f41db3ed4..2582e7aea29 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2478,6 +2478,11 @@ and is between 256 and 4096 characters. It is defined in the file trace_buf_size=nn[KMG] [FTRACE] will set tracing buffer size. + trace_event=[event-list] + [FTRACE] Set and start specified trace events in order + to facilitate early boot debugging. + See also Documentation/trace/events.txt + trix= [HW,OSS] MediaTrix AudioTrix Pro Format: ,,,,,,,, diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index f157d7594ea..2bcc8d4dea2 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -83,6 +83,15 @@ When reading one of these enable files, there are four results: X - there is a mixture of events enabled and disabled ? - this file does not affect any event +2.3 Boot option +--------------- + +In order to facilitate early boot debugging, use boot option: + + trace_event=[event-list] + +The format of this boot option is the same as described in section 2.1. + 3. 
Defining an event-enabled tracepoint ======================================= diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa..bdb3afc8b30 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,7 +49,7 @@ unsigned long __read_mostly tracing_thresh; * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -static int ring_buffer_expanded; +int ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -63,7 +63,7 @@ static bool __read_mostly tracing_selftest_running; /* * If a tracer is running, we do not want to run SELFTEST. */ -static bool __read_mostly tracing_selftest_disabled; +bool __read_mostly tracing_selftest_disabled; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc78..52eb0d8dcd7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -517,6 +517,9 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +extern int ring_buffer_expanded; +extern bool tracing_selftest_disabled; + #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a8..fecac1314cb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -17,6 +17,8 @@ #include #include +#include + #include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" @@ -1133,6 +1135,18 @@ struct notifier_block trace_module_nb = { extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = 1; + tracing_selftest_disabled = 1; + + return 1; +} +__setup("trace_event=", setup_trace_event); + static __init int event_trace_init(void) { struct ftrace_event_call *call; @@ -1140,6 +1154,8 @@ static __init int event_trace_init(void) struct dentry *entry; struct dentry *d_events; int ret; + char *buf = bootup_event_buf; + char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1185,6 +1201,19 @@ static __init int event_trace_init(void) &ftrace_event_format_fops); } + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warning("Failed to enable trace event: %s\n", token); + } + ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); @@ -1392,10 +1421,10 @@ static __init void event_trace_self_test_with_function(void) static __init int event_trace_self_tests_init(void) { - - event_trace_self_tests(); - - event_trace_self_test_with_function(); + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } return 0; } -- cgit v1.2.3-70-g09d2 From 03b042bf1dc14a268a3d65d38b4ec2a4261e8477 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:16 -0700 Subject: rcu: Add synchronize_sched_expedited() primitive This adds the synchronize_sched_expedited() primitive that implements the "big hammer" expedited RCU grace periods. 
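How might an updater use it? As a hedged usage sketch (the list_del_rcu()/kfree() pattern and the variable p are illustrative only; just synchronize_sched_expedited() itself comes from this patch), an update-side path that currently pays the full grace-period latency:

	list_del_rcu(&p->list);
	synchronize_sched();		/* wait for pre-existing rcu-sched readers */
	kfree(p);

can trade CPU time for latency by switching to the expedited variant:

	list_del_rcu(&p->list);
	synchronize_sched_expedited();	/* same guarantee, much shorter wait */
	kfree(p);

Because the expedited path disturbs every online CPU, it is intended for rare, latency-critical updates rather than common-case code.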
This primitive is placed in kernel/sched.c rather than kernel/rcupdate.c due to its need to interact closely with the migration_thread() kthread. The idea is to wake up this kthread with req->task set to NULL, in response to which the kthread reports the quiescent state resulting from the kthread having been scheduled. Because this patch needs to fallback to the slow versions of the primitives in response to some races with CPU onlining and offlining, a new synchronize_rcu_bh() primitive is added as well. Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460982947-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcupdate.h | 25 ++++----- include/linux/rcupreempt.h | 10 ++++ include/linux/rcutree.h | 12 ++++- kernel/rcupdate.c | 25 +++++++++ kernel/sched.c | 129 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 186 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0cdfdb622fa..3c89d6a2591 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,7 +51,19 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -/* Internal to kernel, but needed by rcupreempt.h. */ +/* Exported common interfaces */ +extern void synchronize_rcu(void); +extern void synchronize_rcu_bh(void); +extern void rcu_barrier(void); +extern void rcu_barrier_bh(void); +extern void rcu_barrier_sched(void); +extern void synchronize_sched_expedited(void); +extern int sched_expedited_torture_stats(char *page); + +/* Internal to kernel */ +extern void rcu_init(void); +extern void rcu_scheduler_starting(void); +extern int rcu_needs_cpu(int cpu); extern int rcu_scheduler_active; #if defined(CONFIG_TREE_RCU) @@ -257,15 +269,4 @@ extern void call_rcu(struct rcu_head *head, extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); -/* Exported common interfaces */ -extern void synchronize_rcu(void); -extern void rcu_barrier(void); -extern void rcu_barrier_bh(void); -extern void rcu_barrier_sched(void); - -/* Internal to kernel */ -extern void rcu_init(void); -extern void rcu_scheduler_starting(void); -extern int rcu_needs_cpu(int cpu); - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index fce522782ff..f164ac9b780 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -74,6 +74,16 @@ extern int rcu_needs_cpu(int cpu); extern void __synchronize_sched(void); +static inline void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); /* Placeholder for new rcupreempt implementation. */ +} + +static inline void synchronize_rcu_bh_expedited(void) +{ + synchronize_rcu_bh(); /* Placeholder for new rcupreempt impl. 
*/ +} + extern void __rcu_init(void); extern void rcu_init_sched(void); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5a5153806c4..d4dfd248963 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -286,8 +286,14 @@ static inline void __rcu_read_unlock_bh(void) #define call_rcu_sched(head, func) call_rcu(head, func) -static inline void rcu_init_sched(void) +static inline void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} + +static inline void synchronize_rcu_bh_expedited(void) { + synchronize_sched_expedited(); } extern void __rcu_init(void); @@ -297,6 +303,10 @@ extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); +static inline void rcu_init_sched(void) +{ +} + #ifdef CONFIG_NO_HZ void rcu_enter_nohz(void); void rcu_exit_nohz(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a967c9feb90..eae29c25fb1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -98,6 +98,30 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type) static inline void wait_migrated_callbacks(void) { wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); + smp_mb(); /* In case we didn't sleep. 
*/ } /* diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..9ae80bec1c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7024,6 +7024,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -7031,6 +7036,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -7065,8 +7071,17 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); @@ -10554,3 +10569,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. 
*/ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ -- cgit v1.2.3-70-g09d2 From 0acc512cb1a29636df5e982c7d845edafe77c2d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:17 -0700 Subject: rcu: Add synchronize_sched_expedited() torture tests This patch adds rcutorture tests for the new synchronize_sched_expedited() primitive, and also does some whitespace cleanups in kernel/rcutorture.c as well. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460981342-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 202 ++++++++++++++++++++++++++++------------------------ 1 file changed, 110 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9b4a975a4b4..b33db539a8a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -257,14 +257,14 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); - void (*readdelay)(struct rcu_random_state *rrsp); + void (*read_delay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); - void (*deferredfree)(struct rcu_torture *p); + void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); int (*stats)(char *page); - int irqcapable; + int irq_capable; char *name; }; static struct rcu_torture_ops *cur_ops = NULL; @@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p) rp->rtort_mbtest = 0; rcu_torture_free(rp); } else - cur_ops->deferredfree(rp); + cur_ops->deferred_free(rp); } static void rcu_torture_deferred_free(struct rcu_torture *p) @@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = rcu_barrier, - .stats = NULL, - .irqcapable = 1, - .name = "rcu" + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, + .stats = NULL, + .irq_capable = 1, + .name = "rcu" }; static void rcu_sync_torture_deferred_free(struct rcu_torture *p) @@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_sync" }; /* @@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void) } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = rcu_barrier_bh, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh" + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" }; static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_sync" }; /* @@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page) } static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .readdelay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu" + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu" }; /* @@ -574,32 +574,49 @@ static void sched_torture_synchronize(void) } static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = rcu_barrier_sched, - .stats = NULL, - .irqcapable = 1, - .name = "sched" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, + .stats = NULL, + .irq_capable = 1, + .name = "sched" }; static struct rcu_torture_ops sched_ops_sync = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .name = "sched_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .name = "sched_sync" +}; + +extern int rcu_expedited_torture_stats(char *page); + +static struct rcu_torture_ops sched_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_sched_expedited, + .cb_barrier = NULL, + .stats = rcu_expedited_torture_stats, + .irq_capable = 1, + .name = "sched_expedited" }; /* @@ -635,7 +652,7 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - cur_ops->deferredfree(old_rp); + cur_ops->deferred_free(old_rp); } rcu_torture_current_version++; oldbatch = cur_ops->completed(); @@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused) if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); - cur_ops->readdelay(&rand); + cur_ops->read_delay(&rand); n_rcu_torture_timers++; spin_unlock(&rand_lock); preempt_disable(); @@ -738,11 +755,11 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); - if (irqreader && cur_ops->irqcapable) + if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); do { - if (irqreader && cur_ops->irqcapable) { + if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) mod_timer(&t, 1); } @@ -757,7 +774,7 @@ rcu_torture_reader(void *arg) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - cur_ops->readdelay(&rand); + cur_ops->read_delay(&rand); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -778,7 +795,7 @@ rcu_torture_reader(void *arg) } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irqcapable) + if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -1078,6 +1095,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &sched_expedited_ops, &srcu_ops, &sched_ops, &sched_ops_sync, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3-70-g09d2 From 4d09161196c9a836eacea4b36e2f217bc34894cf Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Wed, 1 Jul 2009 21:08:37 -0400 Subject: printk: Enable the use of more than one CON_BOOT (early console) Today, register_console() assumes the following usage: - The first console to register with a flag set to CON_BOOT is the one and only bootconsole. - If another register_console() is called with an additional CON_BOOT, it is silently rejected. 
- As soon as a console without the CON_BOOT set calls registers the bootconsole is automatically unregistered. - Once there is a "real" console - register_console() will silently reject any consoles with it's CON_BOOT flag set. In many systems (alpha, blackfin, microblaze, mips, powerpc, sh, & x86), there are early_printk implementations, which use the CON_BOOT which come out serial ports, vga, usb, & memory buffers. In many embedded systems, it would be nice to have two bootconsoles - in case the primary fails, you always have access to a backup memory buffer - but this requires at least two CON_BOOT consoles... This patch enables that functionality. With the change applied, on boot you get (if you try to re-enable a boot console after the "real" console has been registered): root:/> dmesg | grep console bootconsole [early_shadow0] enabled bootconsole [early_BFuart0] enabled Kernel command line: root=/dev/mtdblock0 rw earlyprintk=serial,uart0,57600 console=ttyBF0,57600 nmi_debug=regs console handover:boot [early_BFuart0] boot [early_shadow0] -> real [ttyBF0] Too late to register bootconsole early_shadow0 or: root:/> dmesg | grep console Kernel command line: root=/dev/mtdblock0 rw console=ttyBF0,57600 console [ttyBF0] enabled Signed-off-by: Robin Getz Cc: "Linus Torvalds" Cc: "Andrew Morton" Cc: "Mike Frysinger" Cc: "Paul Mundt" LKML-Reference: <200907012108.38030.rgetz@blackfin.uclinux.org> Signed-off-by: Ingo Molnar --- kernel/printk.c | 146 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b4d97b54c1e..41fe6099553 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -36,6 +36,12 @@ #include +/* + * for_each_console() allows you to iterate on each console + */ +#define for_each_console(con) \ + for (con = console_drivers; con != NULL; con = con->next) + /* * Architectures can override it: */ @@ -412,7 +418,7 @@ static void __call_console_drivers(unsigned start, unsigned end) { struct console *con; - for (con = console_drivers; con; con = con->next) { + for_each_console(con) { if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -544,7 +550,7 @@ static int have_callable_console(void) { struct console *con; - for (con = console_drivers; con; con = con->next) + for_each_console(con) if (con->flags & CON_ANYTIME) return 1; @@ -1082,7 +1088,7 @@ void console_unblank(void) console_locked = 1; console_may_schedule = 0; - for (c = console_drivers; c != NULL; c = c->next) + for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); release_console_sem(); @@ -1097,7 +1103,7 @@ struct tty_driver *console_device(int *index) struct tty_driver *driver = NULL; acquire_console_sem(); - for (c = console_drivers; c != NULL; c = c->next) { + for_each_console(c) { if (!c->device) continue; driver = c->device(c, index); @@ -1134,25 +1140,49 @@ EXPORT_SYMBOL(console_start); * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. + * + * This can happen pretty early during the boot process (because of + * early_printk) - sometimes before setup_arch() completes - be careful + * of what kernel features are used - they may not be initialised yet. 
+ * + * There are two types of consoles - bootconsoles (early_printk) and + * "real" consoles (everything which is not a bootconsole) which are + * handled differently. + * - Any number of bootconsoles can be registered at any time. + * - As soon as a "real" console is registered, all bootconsoles + * will be unregistered automatically. + * - Once a "real" console is registered, any attempt to register a + * bootconsoles will be rejected */ -void register_console(struct console *console) +void register_console(struct console *newcon) { int i; unsigned long flags; - struct console *bootconsole = NULL; + struct console *bcon = NULL; - if (console_drivers) { - if (console->flags & CON_BOOT) - return; - if (console_drivers->flags & CON_BOOT) - bootconsole = console_drivers; + /* + * before we register a new CON_BOOT console, make sure we don't + * already have a valid console + */ + if (console_drivers && newcon->flags & CON_BOOT) { + /* find the last or real console */ + for_each_console(bcon) { + if (!(bcon->flags & CON_BOOT)) { + printk(KERN_INFO "Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } + } } - if (preferred_console < 0 || bootconsole || !console_drivers) + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + + if (preferred_console < 0 || bcon || !console_drivers) preferred_console = selected_console; - if (console->early_setup) - console->early_setup(); + if (newcon->early_setup) + newcon->early_setup(); /* * See if we want to use this console driver. If we @@ -1160,13 +1190,13 @@ void register_console(struct console *console) * that registers here. */ if (preferred_console < 0) { - if (console->index < 0) - console->index = 0; - if (console->setup == NULL || - console->setup(console, NULL) == 0) { - console->flags |= CON_ENABLED; - if (console->device) { - console->flags |= CON_CONSDEV; + if (newcon->index < 0) + newcon->index = 0; + if (newcon->setup == NULL || + newcon->setup(newcon, NULL) == 0) { + newcon->flags |= CON_ENABLED; + if (newcon->device) { + newcon->flags |= CON_CONSDEV; preferred_console = 0; } } @@ -1178,47 +1208,53 @@ void register_console(struct console *console) */ for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { - if (strcmp(console_cmdline[i].name, console->name) != 0) + if (strcmp(console_cmdline[i].name, newcon->name) != 0) continue; - if (console->index >= 0 && - console->index != console_cmdline[i].index) + if (newcon->index >= 0 && + newcon->index != console_cmdline[i].index) continue; - if (console->index < 0) - console->index = console_cmdline[i].index; + if (newcon->index < 0) + newcon->index = console_cmdline[i].index; #ifdef CONFIG_A11Y_BRAILLE_CONSOLE if (console_cmdline[i].brl_options) { - console->flags |= CON_BRL; - braille_register_console(console, + newcon->flags |= CON_BRL; + braille_register_console(newcon, console_cmdline[i].index, console_cmdline[i].options, console_cmdline[i].brl_options); return; } #endif - if (console->setup && - console->setup(console, console_cmdline[i].options) != 0) + if (newcon->setup && + newcon->setup(newcon, console_cmdline[i].options) != 0) break; - console->flags |= CON_ENABLED; - console->index = console_cmdline[i].index; + newcon->flags |= CON_ENABLED; + newcon->index = console_cmdline[i].index; if (i == selected_console) { - console->flags |= CON_CONSDEV; + newcon->flags |= CON_CONSDEV; preferred_console = selected_console; } break; } - if (!(console->flags & CON_ENABLED)) + if (!(newcon->flags & CON_ENABLED)) 
return; - if (bootconsole && (console->flags & CON_CONSDEV)) { - printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", - bootconsole->name, bootconsole->index, - console->name, console->index); - unregister_console(bootconsole); - console->flags &= ~CON_PRINTBUFFER; + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console handover:"); + for_each_console(bcon) + printk("boot [%s%d] ", bcon->name, bcon->index); + printk(" -> real [%s%d]\n", newcon->name, newcon->index); + for_each_console(bcon) + unregister_console(bcon); + newcon->flags &= ~CON_PRINTBUFFER; } else { - printk(KERN_INFO "console [%s%d] enabled\n", - console->name, console->index); + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); } /* @@ -1226,16 +1262,16 @@ void register_console(struct console *console) * preferred driver at the head of the list. */ acquire_console_sem(); - if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { - console->next = console_drivers; - console_drivers = console; - if (console->next) - console->next->flags &= ~CON_CONSDEV; + if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { + newcon->next = console_drivers; + console_drivers = newcon; + if (newcon->next) + newcon->next->flags &= ~CON_CONSDEV; } else { - console->next = console_drivers->next; - console_drivers->next = console; + newcon->next = console_drivers->next; + console_drivers->next = newcon; } - if (console->flags & CON_PRINTBUFFER) { + if (newcon->flags & CON_PRINTBUFFER) { /* * release_console_sem() will print out the buffered messages * for us. @@ -1287,11 +1323,13 @@ EXPORT_SYMBOL(unregister_console); static int __init disable_boot_consoles(void) { - if (console_drivers != NULL) { - if (console_drivers->flags & CON_BOOT) { + struct console *con; + + for_each_console(con) { + if (con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", - console_drivers->name, console_drivers->index); - return unregister_console(console_drivers); + con->name, con->index); + return unregister_console(con); } } return 0; -- cgit v1.2.3-70-g09d2 From ddc1637af217dbd8bc51f30e6d24e84476a869a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 3 Jul 2009 17:34:24 +0800 Subject: kmemtrace: Print binary output only if 'bin' option is set Currently by default the output of kmemtrace is binary format instead of human-readable output. 
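(For illustration only: with the text formatters added below, an alloc event is emitted using the format string "type_id %d call_site %pF ptr %lu bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", so a line read from trace_pipe would look roughly like

    type_id 0 call_site kmem_cache_alloc+0x62/0x140 ptr 18446612132571185664 bytes_req 32 bytes_alloc 32 gfp_flags 208 node -1

where the values are made up and only the layout follows this patch.)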
This patch makes the following changes: - We'll see human-readable output by default - We'll see binary output if 'bin' option is set Note: you may probably need to explicitly disable context-info binary output: # echo 0 > options/context-info # echo 1 > options/bin # cat trace_pipe v2: - use %pF to print call_site Signed-off-by: Li Zefan Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4DD0A0.5060500@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 120 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 1edaa9516e8..74903b62bcb 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -239,12 +239,52 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, + (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, + (unsigned long)entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { - struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", + entry->type_id, (void *)entry->call_site, + (unsigned long)entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; struct kmemtrace_user_event *ev; + struct kmemtrace_user_event_alloc *ev_alloc; + + trace_assign_type(entry, iter->ent); ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) @@ -271,12 +311,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user_bin(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; struct kmemtrace_user_event *ev; + trace_assign_type(entry, iter->ent); + ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) return TRACE_TYPE_PARTIAL_LINE; @@ -294,12 +336,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter, /* The two other following provide a more minimalistic output */ static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_compress(struct trace_iterator *iter) { + struct kmemtrace_alloc_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Alloc entry */ ret = trace_seq_printf(s, " + "); if (!ret) @@ -362,12 +406,14 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, } 
static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_compress(struct trace_iterator *iter) { + struct kmemtrace_free_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Free entry */ ret = trace_seq_printf(s, " - "); if (!ret) @@ -421,32 +467,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - switch (entry->type) { - case TRACE_KMEM_ALLOC: { - struct kmemtrace_alloc_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_alloc_compress(iter, field); - else - return kmemtrace_print_alloc_user(iter, field); - } - - case TRACE_KMEM_FREE: { - struct kmemtrace_free_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_free_compress(iter, field); - else - return kmemtrace_print_free_user(iter, field); - } + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return TRACE_TYPE_UNHANDLED; + switch (entry->type) { + case TRACE_KMEM_ALLOC: + return kmemtrace_print_alloc_compress(iter); + case TRACE_KMEM_FREE: + return kmemtrace_print_free_compress(iter); default: return TRACE_TYPE_UNHANDLED; } } +static struct trace_event kmem_trace_alloc = { + .type = TRACE_KMEM_ALLOC, + .trace = kmemtrace_print_alloc_user, + .binary = kmemtrace_print_alloc_user_bin, +}; + +static struct trace_event kmem_trace_free = { + .type = TRACE_KMEM_FREE, + .trace = kmemtrace_print_free_user, + .binary = kmemtrace_print_free_user_bin, +}; + static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, @@ -463,6 +508,21 @@ void kmemtrace_init(void) static int __init init_kmem_tracer(void) { - return register_tracer(&kmem_tracer); + if (!register_ftrace_event(&kmem_trace_alloc)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_ftrace_event(&kmem_trace_free)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_tracer(&kmem_tracer)) { + pr_warning("Warning: could not register the kmem tracer\n"); + return 1; + } + + return 0; } device_initcall(init_kmem_tracer); -- cgit v1.2.3-70-g09d2 From c71320d0c445e01555a86fa6523609db47eeaef6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 5 Jul 2009 00:22:34 +0200 Subject: genirq: Fix comment describing suspend_device_irqs() The kerneldoc comment describing suspend_device_irqs() is currently misleading, because generally the function doesn't really disable interrupt lines at the chip level. Replace it with a more accurate one. Signed-off-by: Rafael J. Wysocki LKML-Reference: <200907050022.35117.rjw@sisk.pl> Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 638d8bedec1..a0bb09e7986 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -15,10 +15,10 @@ /** * suspend_device_irqs - disable all currently enabled interrupt lines * - * During system-wide suspend or hibernation device interrupts need to be - * disabled at the chip level and this function is provided for this purpose. - * It disables all interrupt lines that are enabled at the moment and sets the - * IRQ_SUSPENDED flag for them. 
+ * During system-wide suspend or hibernation device drivers need to be prevented + * from receiving interrupts and this function is provided for this purpose. + * It marks all interrupt lines in use, except for the timer ones, as disabled + * and sets the IRQ_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { -- cgit v1.2.3-70-g09d2 From 951ed4d36b77ba9fe1ea08fc3c59d8bb6c9bda32 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 7 Jul 2009 11:27:28 +0200 Subject: timekeeping: optimized ktime_get[_ts] for GENERIC_TIME=y The generic ktime_get function defined in kernel/hrtimer.c is suboptimial for GENERIC_TIME=y: 0) | ktime_get() { 0) | ktime_get_ts() { 0) | getnstimeofday() { 0) | read_tod_clock() { 0) 0.601 us | } 0) 1.938 us | } 0) | set_normalized_timespec() { 0) 0.602 us | } 0) 4.375 us | } 0) 5.523 us | } Overall there are two read_seqbegin/read_seqretry loops and a lot of unnecessary struct timespec calculations. ktime_get returns a nano second value which is the sum of xtime, wall_to_monotonic and the nano second delta from the clock source. ktime_get can be optimized for GENERIC_TIME=y. The new version only calls clocksource_read: 0) | ktime_get() { 0) | read_tod_clock() { 0) 0.610 us | } 0) 1.977 us | } It uses a single read_seqbegin/readseqretry loop and just adds everthing to a nano second value. ktime_get_ts is optimized in a similar fashion. [ tglx: added WARN_ON(timekeeping_suspended) as in getnstimeofday() ] Signed-off-by: Martin Schwidefsky Acked-by: john stultz LKML-Reference: <20090707112728.3005244d@skybase> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 4 +++ kernel/time/timekeeping.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9002958a96e..829e0664b72 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,6 +48,7 @@ #include +#ifndef CONFIG_GENERIC_TIME /** * ktime_get - get the monotonic time in ktime_t format * @@ -62,6 +63,7 @@ ktime_t ktime_get(void) return timespec_to_ktime(now); } EXPORT_SYMBOL_GPL(ktime_get); +#endif /** * ktime_get_real - get the real (wall-) time in ktime_t format @@ -106,6 +108,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; +#ifndef CONFIG_GENERIC_TIME /** * ktime_get_ts - get the monotonic clock in timespec format * @ts: pointer to timespec variable @@ -130,6 +133,7 @@ void ktime_get_ts(struct timespec *ts) ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* * Get the coarse grained time at the softirq based on xtime and diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9c633..7a248135c6f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -125,6 +125,75 @@ void getnstimeofday(struct timespec *ts) EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + cycle_t cycle_now, cycle_delta; + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs += cyc2ns(clock, cycle_delta); + + } while (read_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit 
architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + cycle_t cycle_now, cycle_delta; + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.2.3-70-g09d2 From a40f262cc21fbfd781bbddcc40b16b83a75f5f34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2009 13:00:31 +0200 Subject: timekeeping: Move ktime_get() functions to timekeeping.c The ktime_get() functions for GENERIC_TIME=n are still located in hrtimer.c. Move them to time/timekeeping.c where they belong. LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 60 ----------------------------------------------- kernel/time/timekeeping.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 829e0664b72..43d151f185b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,39 +48,6 @@ #include -#ifndef CONFIG_GENERIC_TIME -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); -#endif - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); - /* * The timer bases: * @@ -108,33 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -#ifndef CONFIG_GENERIC_TIME -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); -#endif - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. 
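 */

[ Editorial aside, not part of either diff: as context for the two ktime patches above, a typical in-kernel caller that benefits from a cheaper ktime_get() looks roughly like the sketch below; measure_latency() and do_work() are hypothetical names used only for illustration.

	static void measure_latency(void)
	{
		ktime_t start, end;
		s64 delta_ns;

		start = ktime_get();	/* monotonic clock, unaffected by settimeofday() */
		do_work();
		end = ktime_get();

		delta_ns = ktime_to_ns(ktime_sub(end, start));
		printk(KERN_DEBUG "operation took %lld ns\n", delta_ns);
	}

Each such pair of calls previously went through two read_seqbegin/read_seqretry loops plus timespec conversions; the GENERIC_TIME=y fast path introduced above does a single seqlock loop per call. ]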
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7a248135c6f..02c0b2c9c67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -290,10 +290,65 @@ static void change_clocksource(void) clock->name); */ } -#else +#else /* GENERIC_TIME */ static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -#endif + +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; + + ktime_get_ts(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec -- cgit v1.2.3-70-g09d2 From 3adc54fa82a68be1cd1ac82ad786ee362796e50a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Mar 2009 15:32:01 -0400 Subject: ring-buffer: make the buffer a true circular link list This patch changes the ring buffer data pages from using a link list head pointer, to making each buffer page point to another buffer page and never back to a "head". This makes the handling of the ring buffer less complex, since the traversing of the ring buffer pages no longer needs to account for the head pointer. This change also is needed to make the ring buffer lockless. [ Changes in version 2: - Added change that Lai Jiangshan mentioned. From: Lai Jiangshan Date: Thu, 11 Jun 2009 11:25:48 +0800 LKML-Reference: <4A30793C.6090208@cn.fujitsu.com> I'm not sure whether these 4 lines: bpage = list_entry(pages.next, struct buffer_page, list); list_del_init(&bpage->list); cpu_buffer->pages = &bpage->list; list_splice(&pages, cpu_buffer->pages); equal to these 2 lines: cpu_buffer->pages = pages.next; list_del(&pages); If there are equivalent, I think the second one are simpler. It may be not a really necessarily cleanup. 
What I asked is: if there are equivalent, could you use these two line: cpu_buffer->pages = pages.next; list_del(&pages); ] [ Impact: simplify the ring buffer to help make it lockless ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e..7c0168ad6d5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -406,7 +406,7 @@ struct ring_buffer_per_cpu { spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; - struct list_head pages; + struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ @@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -521,12 +521,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; + WARN_ON(!nr_pages); + for (i = 0; i < nr_pages; i++) { bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); @@ -541,7 +542,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } - list_splice(&pages, head); + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. 
+ */ + cpu_buffer->pages = pages.next; + list_del(&pages); rb_check_pages(cpu_buffer); @@ -573,7 +580,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&cpu_buffer->pages); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -594,7 +600,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) goto fail_free_reader; cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; return cpu_buffer; @@ -609,15 +615,20 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } + kfree(cpu_buffer); } @@ -760,14 +771,14 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; - p = cpu_buffer->pages.next; + p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; rb_reset_cpu(cpu_buffer); @@ -795,7 +806,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); - list_add_tail(&bpage->list, &cpu_buffer->pages); + list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -992,9 +1003,6 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, { struct list_head *p = (*bpage)->list.next; - if (p == &cpu_buffer->pages) - p = p->next; - *bpage = list_entry(p, struct buffer_page, list); } @@ -2247,6 +2255,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidently swap it. 
+ */ + cpu_buffer->pages = reader->list.prev; + local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -2719,7 +2734,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); -- cgit v1.2.3-70-g09d2 From 77ae365eca895061c8bf2b2e3ae1d9ea62869739 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Mar 2009 11:00:29 -0400 Subject: ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. 
- Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 1 - kernel/trace/ring_buffer.c | 886 ++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.c | 3 - 3 files changed, 738 insertions(+), 152 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 29f8599e6be..7fca71693ae 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -170,7 +170,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c0168ad6d5..e648ba4f70e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -322,6 +322,14 @@ struct buffer_data_page { unsigned char data[]; /* data of buffer page */ }; +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ @@ -330,6 +338,21 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. 
+ */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); @@ -403,7 +426,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -411,13 +434,12 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; - unsigned long nmi_dropped; - unsigned long commit_overrun; - unsigned long overrun; - unsigned long read; + local_t commit_overrun; + local_t overrun; local_t entries; local_t committing; local_t commits; + unsigned long read; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -489,6 +511,385 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. 
But only + * temporarially. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the give page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static int inline +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_dactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. 
*/ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = (unsigned long)cmpxchg(&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, &new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. 
+ */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + */ + local_cmpxchg(&next_page->write, old_write, val); + local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -501,11 +902,16 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + rb_head_page_deactivate(cpu_buffer); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; + if (rb_check_list(cpu_buffer, head)) + return -1; + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, bpage->list.next->prev != &bpage->list)) @@ -513,8 +919,12 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, bpage->list.prev->next != &bpage->list)) return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; } + rb_head_page_activate(cpu_buffer); + return 0; } @@ -533,6 +943,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); @@ -586,6 +999,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!bpage) goto fail_free_buffer; + rb_check_bpage(cpu_buffer, bpage); + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -603,6 +1018,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_head_page_activate(cpu_buffer); + return cpu_buffer; fail_free_reader: @@ -620,6 +1037,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 
free_buffer_page(cpu_buffer->reader_page); + rb_head_page_deactivate(cpu_buffer); + if (head) { list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); @@ -770,6 +1189,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; @@ -800,6 +1221,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; @@ -809,6 +1233,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -958,22 +1383,15 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->head_page, - cpu_buffer->head_page->read); -} - static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_write(struct buffer_page *bpage) +static inline unsigned long rb_page_write(struct buffer_page *bpage) { - return local_read(&bpage->write); + return local_read(&bpage->write) & RB_WRITE_MASK; } static inline unsigned rb_page_commit(struct buffer_page *bpage) @@ -981,6 +1399,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) return local_read(&bpage->page->commit); } +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + /* Size is determined by what has been commited */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -993,19 +1416,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->head_page); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = (*bpage)->list.next; - - *bpage = list_entry(p, struct buffer_page, list); -} - static inline unsigned rb_event_index(struct ring_buffer_event *event) { @@ -1031,6 +1441,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { + unsigned long max_count; + /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit @@ -1040,9 +1452,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. 
*/ again: + max_count = cpu_buffer->buffer->pages * 100; + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; @@ -1051,8 +1470,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); barrier(); } @@ -1085,7 +1508,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = cpu_buffer->head_page; + iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(cpu_buffer, &iter->head_page); @@ -1129,6 +1552,163 @@ rb_update_event(struct ring_buffer_event *event, } } +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupt the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. 
+ * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved passed next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1207,96 +1787,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - bool lock_taken = false; - unsigned long flags; + struct buffer_page *next_page; + int ret; next_page = tail_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - rb_inc_page(cpu_buffer, &next_page); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; + local_inc(&cpu_buffer->commit_overrun); goto out_reset; } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? 
*/ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also the on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } } } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ *ts = rb_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + next_page->page->time_stamp = *ts; } - rb_reset_tail(cpu_buffer, tail_page, tail, length); + out_again: - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -1305,9 +1882,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* reset write */ rb_reset_tail(cpu_buffer, tail_page, tail, length); - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); return NULL; } @@ -1324,6 +1898,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; tail = write - length; /* See if we shot pass the end of this buffer page */ @@ -1368,12 +1945,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = cpu_buffer->tail_page; if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. 
That is fine * because we just shorten what is on this page. */ + old_index += write_mask; + new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) return 1; @@ -1882,9 +2463,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + return reader->read == rb_page_commit(reader) && (commit == reader || (commit == head && @@ -1975,7 +2560,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; return ret; @@ -1996,32 +2581,12 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->overrun; + ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); -/** - * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->nmi_dropped; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); - /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits * @buffer: The ring buffer @@ -2037,7 +2602,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->commit_overrun; + ret = local_read(&cpu_buffer->commit_overrun); return ret; } @@ -2060,7 +2625,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += (local_read(&cpu_buffer->entries) - - cpu_buffer->overrun) - cpu_buffer->read; + local_read(&cpu_buffer->overrun)) - cpu_buffer->read; } return entries; @@ -2083,7 +2648,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - overruns += cpu_buffer->overrun; + overruns += local_read(&cpu_buffer->overrun); } return overruns; @@ -2096,8 +2661,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head_page->read; + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -2214,6 +2781,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; int nr_loops = 0; + int ret; local_irq_save(flags); 
__raw_spin_lock(&cpu_buffer->lock); @@ -2247,11 +2815,17 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* - * Splice the empty reader page into the list around the head. * Reset the reader page to size zero. */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); - reader = cpu_buffer->head_page; + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -2262,22 +2836,35 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ cpu_buffer->pages = reader->list.prev; - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); - /* Make the reader page now replace the head */ - reader->list.prev->next = &cpu_buffer->reader_page->list; - reader->list.next->prev = &cpu_buffer->reader_page->list; + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* - * If the tail is on the reader, then we must set the head - * to the inserted page, otherwise we set it one before. + * If we did not convert it, then we must try again. */ - cpu_buffer->head_page = cpu_buffer->reader_page; + if (!ret) + goto spin; - if (cpu_buffer->commit_page != reader) - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. 
+ */ + reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; @@ -2733,6 +3320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + rb_head_page_deactivate(cpu_buffer); + cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); @@ -2750,16 +3339,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - cpu_buffer->nmi_dropped = 0; - cpu_buffer->commit_overrun = 0; - cpu_buffer->overrun = 0; - cpu_buffer->read = 0; + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + + rb_head_page_activate(cpu_buffer); } /** @@ -3107,7 +3697,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = 0; } else { /* update the entry counter */ - cpu_buffer->read += local_read(&reader->entries); + cpu_buffer->read += rb_page_entries(reader); /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb3afc8b30..b591f7a1bd7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3630,9 +3630,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); - trace_seq_printf(s, "nmi dropped: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3-70-g09d2 From c5cb183608167c744cb28bbd85884be5a4ce875d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 9 Jul 2009 16:20:12 +0800 Subject: tracing/filter: Remove preds from struct event_subsystem No need to save preds to event_subsystem, because it's not used. 
Signed-off-by: Xiao Guangrong Acked-by: Tom Zanussi Reviewed-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A55A83C.1030005@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 39 ++++++-------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf4..b9aae72d13d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -420,17 +420,7 @@ EXPORT_SYMBOL_GPL(init_preds); static void filter_free_subsystem_preds(struct event_subsystem *system) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; - int i; - - if (filter->n_preds) { - for (i = 0; i < filter->n_preds; i++) - filter_free_pred(filter->preds[i]); - kfree(filter->preds); - filter->preds = NULL; - filter->n_preds = 0; - } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -607,26 +597,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct filter_pred *pred, char *filter_string) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; int err = 0; - if (!filter->preds) { - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - } - - if (filter->n_preds == MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -1029,12 +1002,12 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) { + if (call) err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else + else err = filter_add_subsystem_pred(ps, system, pred, filter_string); + filter_free_pred(pred); if (err) return err; @@ -1048,12 +1021,12 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); - if (call) { + if (call) err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else + else err = filter_add_subsystem_pred(ps, system, pred, filter_string); + filter_free_pred(pred); if (err) return err; -- cgit v1.2.3-70-g09d2 From dc82ec98a4727fd51b77e92d05fe7d2db3dcc11c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 9 Jul 2009 16:22:22 +0800 Subject: tracing/filter: Remove empty subsystem and its directory Remove empty subsystem and its directory when module unload. 
Before patch: # rmmod trace-events-sample.ko # ls sample enable filter After patch: # rmmod trace-events-sample.ko # ls sample ls: cannot access sample: No such file or directory Signed-off-by: Xiao Guangrong Acked-by: Tom Zanussi Reviewed-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A55A8BE.9010707@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 52eb0d8dcd7..94305c7bc11 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,6 +757,7 @@ struct event_subsystem { const char *name; struct dentry *entry; void *filter; + int nr_events; }; struct filter_pred; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fecac1314cb..90cf9360e14 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -851,8 +851,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + if (strcmp(system->name, name) == 0) { + system->nr_events++; return system->entry; + } } /* need to create new entry */ @@ -871,6 +873,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } + system->nr_events = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -905,6 +908,32 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return system->entry; } +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *id, @@ -1079,6 +1108,7 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); + remove_subsystem_dir(call->system); } } -- cgit v1.2.3-70-g09d2 From 68baafcfc46074c4bb4e4c3115c2c76a8a85f37d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 9 Jul 2009 04:46:29 +0200 Subject: tracing/function-graph-tracer: Use the %pf format Remove the obsolete seq_print_ip_sym() usage and replace it by the %pf format in order to print function symbols. 
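(Illustration only, not an extra change; the function name is just an example.) With %pf the symbol lookup happens inside vsnprintf(), so a single call such as

    ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);

prints e.g. "do_IRQ();" directly, where the old code needed seq_print_ip_sym() plus a second trace_seq_printf() just for the "();" suffix.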
Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Cc: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1247107590-6428-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb5..abf7c4ae2c8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -565,11 +565,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "();\n"); + ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -612,11 +608,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-70-g09d2 From 6a167c655858cbec4175532fd00417661c87f149 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 9 Jul 2009 04:46:30 +0200 Subject: tracing/kmemtrace: Use the %pf format Remove the obsolete seq_print_ip_sym() usage and replace it by the %pf format in order to print function symbols. Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Cc: Steven Rostedt Cc: Lai Jiangshan Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu LKML-Reference: <1247107590-6428-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 74903b62bcb..2f6fa47d410 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -389,19 +389,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Node */ - ret = trace_seq_printf(s, "%4d ", entry->node); + /* Node and call site*/ + ret = trace_seq_printf(s, "%4d %pf\n", entry->node, + (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } @@ -447,19 +440,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Skip node */ - ret = trace_seq_printf(s, " "); + /* Skip node and print call site*/ + ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } -- cgit v1.2.3-70-g09d2 From 80098c200e2ee3b4c86a9d1e156dbcd05380e08f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 6 Jul 2009 16:15:04 +0800 Subject: kmemtrace: Rename some functions So we have: - kmemtrace_print_alloc/free() for kmemtrace default output - kmemtrace_print_alloc/free_user() for binary output used by kmemtrace-user. 
Suggested-by: Eduard - Gabriel Munteanu Signed-off-by: Li Zefan Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A51B288.70505@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 2f6fa47d410..dda53ccf749 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -239,7 +239,7 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -259,7 +259,7 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags) +kmemtrace_print_free(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -277,7 +277,7 @@ kmemtrace_print_free_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -311,7 +311,7 @@ kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user_bin(struct trace_iterator *iter, int flags) +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -467,14 +467,14 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) static struct trace_event kmem_trace_alloc = { .type = TRACE_KMEM_ALLOC, - .trace = kmemtrace_print_alloc_user, - .binary = kmemtrace_print_alloc_user_bin, + .trace = kmemtrace_print_alloc, + .binary = kmemtrace_print_alloc_user, }; static struct trace_event kmem_trace_free = { .type = TRACE_KMEM_FREE, - .trace = kmemtrace_print_free_user, - .binary = kmemtrace_print_free_user_bin, + .trace = kmemtrace_print_free, + .binary = kmemtrace_print_free_user, }; static struct tracer kmem_tracer __read_mostly = { -- cgit v1.2.3-70-g09d2 From d8ea37d5de58d35a39d0b4e7d209751aaa1b8174 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 6 Jul 2009 16:10:18 +0800 Subject: tracing/stat: Add stat_release() callback Add stat_release() callback to struct tracer_stat, so a stat tracer can release it's entries after the stat file has been read out. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A51B16A.6020708@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 7 +++++-- kernel/trace/trace_stat.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e49334..f069461f10b 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -49,7 +49,8 @@ static struct dentry *stat_dir; * but it will at least advance closer to the next one * to be released. 
*/ -static struct rb_node *release_next(struct rb_node *node) +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) { struct stat_node *snode; struct rb_node *parent = rb_parent(node); @@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node) parent->rb_right = NULL; snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); kfree(snode); return parent; @@ -78,7 +81,7 @@ static void reset_stat_session(struct stat_session *session) struct rb_node *node = session->stat_root.rb_node; while (node) - node = release_next(node); + node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index f3546a2cd82..8f03914b9a6 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -18,6 +18,8 @@ struct tracer_stat { int (*stat_cmp)(void *p1, void *p2); /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); /* Print the headers of your stat entries */ int (*stat_headers)(struct seq_file *s); }; -- cgit v1.2.3-70-g09d2 From a35780005eb256eb5ec83ffcc802967295887a45 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 6 Jul 2009 16:10:23 +0800 Subject: tracing/workqueues: Add refcnt to struct cpu_workqueue_stats The stat entries can be freed when the stat file is being read. The worse is, the ptr can be freed immediately after it's returned from workqueue_stat_start/next(). Add a refcnt to struct cpu_workqueue_stats to avoid use-after-free. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A51B16F.6010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 97fcea4acce..40cafb07dff 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace_stat.h" #include "trace.h" @@ -16,6 +17,7 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; + struct kref kref; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -39,6 +41,11 @@ struct workqueue_global_stats { static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) +static void cpu_workqueue_stat_free(struct kref *kref) +{ + kfree(container_of(kref, struct cpu_workqueue_stats, kref)); +} + /* Insertion of a work */ static void probe_workqueue_insertion(struct task_struct *wq_thread, @@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) return; } INIT_LIST_HEAD(&cws->list); + kref_init(&cws->kref); cws->cpu = cpu; - cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); @@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) list) { if (node->pid == wq_thread->pid) { list_del(&node->list); - kfree(node); + kref_put(&node->kref, cpu_workqueue_stat_free); goto found; } } @@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if 
(!list_empty(&workqueue_cpu_stat(cpu)->list)) + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); + kref_get(&ret->kref); + } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); @@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace) static void *workqueue_stat_next(void *prev, int idx) { struct cpu_workqueue_stats *prev_cws = prev; + struct cpu_workqueue_stats *ret; int cpu = prev_cws->cpu; unsigned long flags; - void *ret = NULL; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { @@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx) return NULL; } while (!(ret = workqueue_stat_start_cpu(cpu))); return ret; + } else { + ret = list_entry(prev_cws->list.next, + struct cpu_workqueue_stats, list); + kref_get(&ret->kref); } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, - list); + return ret; } static int workqueue_stat_show(struct seq_file *s, void *p) @@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) return 0; } +static void workqueue_stat_release(void *stat) +{ + struct cpu_workqueue_stats *node = stat; + + kref_put(&node->kref, cpu_workqueue_stat_free); +} + static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); @@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = { .stat_start = workqueue_stat_start, .stat_next = workqueue_stat_next, .stat_show = workqueue_stat_show, + .stat_release = workqueue_stat_release, .stat_headers = workqueue_stat_headers }; -- cgit v1.2.3-70-g09d2 From 8259cf4342029aad37660e524178c8858f48b0ab Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Thu, 9 Jul 2009 13:08:37 -0400 Subject: printk: Ensure that "console enabled" messages are printed on the console Today, when a console is registered without CON_PRINTBUFFER, end users never see the announcement of it being added, and never know if they missed something, if the console is really at the start or not, and just leads to general confusion. This re-orders existing code, to make sure the console is added, before the "console [%s%d] enabled" is printed out - ensuring that this message is _always_ seen. This has the desired/intended side effect of making sure that "console enabled:" messages are printed on the bootconsole, and the real console. This does cause the same line is printed twice if the bootconsole and real console are the same device, but if they are on different devices, the message is printed to both consoles. 
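With a serial bootconsole handing over to the real console on the same port, the intended result is that the log now contains something like the following on both of them (device names are illustrative, the format strings come from the patch):

    bootconsole [earlyser0] enabled
    ...
    console [ttyS0] enabled, bootconsole disabled

so it is always clear from the log when and where the handover happened.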
Signed-off-by : Robin Getz Cc: "Andrew Morton" Cc: "Linus Torvalds" LKML-Reference: <200907091308.37370.rgetz@blackfin.uclinux.org> Signed-off-by: Ingo Molnar --- kernel/printk.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 41fe6099553..668df351e6a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1240,22 +1240,14 @@ void register_console(struct console *newcon) if (!(newcon->flags & CON_ENABLED)) return; - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { - /* we need to iterate through twice, to make sure we print - * everything out, before we unregister the console(s) - */ - printk(KERN_INFO "console handover:"); - for_each_console(bcon) - printk("boot [%s%d] ", bcon->name, bcon->index); - printk(" -> real [%s%d]\n", newcon->name, newcon->index); - for_each_console(bcon) - unregister_console(bcon); + /* + * If we have a bootconsole, and are switching to a real console, + * don't print everything out again, since when the boot console, and + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) newcon->flags &= ~CON_PRINTBUFFER; - } else { - printk(KERN_INFO "%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); - } /* * Put this console in the list - keep the @@ -1281,6 +1273,28 @@ void register_console(struct console *newcon) spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); + + /* + * By unregistering the bootconsoles after we enable the real console + * we get the "console xxx enabled" message on all the consoles - + * boot consoles, real consoles, etc - this is to ensure that end + * users know there might be something in the kernel's log buffer that + * went to the bootconsole (that they do not see on the real console) + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", + newcon->name, newcon->index); + for_each_console(bcon) + if (bcon->flags & CON_BOOT) + unregister_console(bcon); + } else { + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); + } } EXPORT_SYMBOL(register_console); -- cgit v1.2.3-70-g09d2 From 1aaad49e856ce41adc07d8ae0c8ef35fc4483245 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Mon, 6 Jul 2009 13:31:48 +0200 Subject: printk: Restore previous console_loglevel when re-enabling logging When logging to console is disabled from userspace using klogctl() and later re-enabled, console_loglevel gets set to the default log level instead to the previous value. This means that if the kernel was booted with 'quiet', the boot is suddenly no longer quiet after logging to console gets re-enabled. Save the current console_loglevel when logging is disabled and restore to that value. If the log level is set to a specific value while disabled, this is interpreted as an implicit re-enabling of the logging. 
The problem that prompted this patch is described in: http://lkml.org/lkml/2009/6/28/234 There are two variations possible on the patch below: 1) If klogctl(7) is called while logging is not disabled, then set level to default (partially preserving current functionality): case 7: /* Enable logging to console */ - console_loglevel = default_console_loglevel; + if (saved_console_loglevel == -1) + console_loglevel = default_console_loglevel; + else { + console_loglevel = saved_console_loglevel; + saved_console_loglevel = -1; + } 2) If klogctl(8) is called while logging is disabled, then don't enable logging, but remember the requested value for when logging does get enabled again: case 8: /* Set level of messages printed to console */ [...] - console_loglevel = len; + if (saved_console_loglevel == -1) + console_loglevel = len; + else + saved_console_loglevel = len; Yet another option would be to ignore the request. Signed-off-by: Frans Pop Cc: cryptsetup@packages.debian.org Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <200907061331.49930.elendil@planet.nl> Signed-off-by: Ingo Molnar --- kernel/printk.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 668df351e6a..e0daaf57985 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -67,6 +67,8 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +static int saved_console_loglevel = -1; + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -378,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len) logged_chars = 0; break; case 6: /* Disable logging to console */ + if (saved_console_loglevel == -1) + saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; case 7: /* Enable logging to console */ - console_loglevel = default_console_loglevel; + if (saved_console_loglevel != -1) { + console_loglevel = saved_console_loglevel; + saved_console_loglevel = -1; + } break; case 8: /* Set level of messages printed to console */ error = -EINVAL; @@ -390,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len) if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; + /* Implicitly re-enable logging to console */ + saved_console_loglevel = -1; error = 0; break; case 9: /* Number of chars in the log buffer */ -- cgit v1.2.3-70-g09d2 From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jul 2009 09:51:34 +0000 Subject: genetlink: make netns aware This makes generic netlink network namespace aware. No generic netlink families except for the controller family are made namespace aware, they need to be checked one by one and then set the family->netnsok member to true. A new function genlmsg_multicast_netns() is introduced to allow sending a multicast message in a given namespace, for example when it applies to an object that lives in that namespace, a new function genlmsg_multicast_allns() to send a message to all network namespaces (for objects that do not have an associated netns). The function genlmsg_multicast() is changed to multicast the message in just init_net, which is currently correct for all generic netlink families since they only work in init_net right now. 
Some will later want to work in all net namespaces because they do not care about the netns at all -- those will have to be converted to use one of the new functions genlmsg_multicast_allns() or genlmsg_multicast_netns() whenever they are made netns aware in some way. After this patch families can easily decide whether or not they should be available in all net namespaces. Many genl families us it for objects not related to networking and should therefore be available in all namespaces, but that will have to be done on a per family basis. Note that this doesn't touch on the checkpoint/restart problem where network namespaces could be used, genl families and multicast groups are numbered globally and I see no easy way of changing that, especially since it must be possible to multicast to all network namespaces for those families that do not care about netns. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- fs/dlm/netlink.c | 2 +- include/net/genetlink.h | 66 +++++++++++++-- include/net/net_namespace.h | 2 + kernel/taskstats.c | 10 +-- net/irda/irnetlink.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/netlink/genetlink.c | 186 ++++++++++++++++++++++++++++++++--------- net/tipc/netlink.c | 2 +- net/wireless/nl80211.c | 14 ++-- 9 files changed, 223 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62c462..55ea369f43a 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlpid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 1b0e3ee4ddd..2a1c06874c4 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -3,6 +3,7 @@ #include #include +#include /** * struct genl_multicast_group - generic netlink multicast group @@ -27,6 +28,8 @@ struct genl_multicast_group * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported + * @netnsok: set to true if the family can handle network + * namespaces and should be presented in all of them * @attrbuf: buffer to store parsed attributes * @ops_list: list of all assigned operations * @family_list: family list @@ -39,6 +42,7 @@ struct genl_family char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; + bool netnsok; struct nlattr ** attrbuf; /* private */ struct list_head ops_list; /* private */ struct list_head family_list; /* private */ @@ -62,8 +66,32 @@ struct genl_info struct genlmsghdr * genlhdr; void * userhdr; struct nlattr ** attrs; +#ifdef CONFIG_NET_NS + struct net * _net; +#endif }; +#ifdef CONFIG_NET_NS +static inline struct net *genl_info_net(struct genl_info *info) +{ + return info->_net; +} + +static inline void genl_info_net_set(struct genl_info *info, struct net *net) +{ + info->_net = net; +} +#else +static inline struct net *genl_info_net(struct genl_info *info) +{ + return &init_net; +} + +static inline void genl_info_net_set(struct genl_info *info, struct net *net) +{ +} +#endif + /** * struct genl_ops - generic netlink operations * @cmd: command identifier @@ -98,8 +126,6 @@ extern int genl_register_mc_group(struct genl_family *family, extern void genl_unregister_mc_group(struct genl_family *family, struct genl_multicast_group *grp); -extern struct sock *genl_sock; - /** * genlmsg_put - Add generic netlink header to netlink message * 
@skb: socket buffer holding the message @@ -170,7 +196,21 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) } /** - * genlmsg_multicast - multicast a netlink message + * genlmsg_multicast_netns - multicast a netlink message to a specific netns + * @net: the net namespace + * @skb: netlink message as socket buffer + * @pid: own netlink pid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int genlmsg_multicast_netns(struct net *net, struct sk_buff *skb, + u32 pid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(net->genl_sock, skb, pid, group, flags); +} + +/** + * genlmsg_multicast - multicast a netlink message to the default netns * @skb: netlink message as socket buffer * @pid: own netlink pid to avoid sending to yourself * @group: multicast group id @@ -179,17 +219,29 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) static inline int genlmsg_multicast(struct sk_buff *skb, u32 pid, unsigned int group, gfp_t flags) { - return nlmsg_multicast(genl_sock, skb, pid, group, flags); + return genlmsg_multicast_netns(&init_net, skb, pid, group, flags); } +/** + * genlmsg_multicast_allns - multicast a netlink message to all net namespaces + * @skb: netlink message as socket buffer + * @pid: own netlink pid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + * + * This function must hold the RTNL or rcu_read_lock(). + */ +int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, + unsigned int group, gfp_t flags); + /** * genlmsg_unicast - unicast a netlink message * @skb: netlink message as socket buffer * @pid: netlink pid of the destination socket */ -static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid) +static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 pid) { - return nlmsg_unicast(genl_sock, skb, pid); + return nlmsg_unicast(net->genl_sock, skb, pid); } /** @@ -199,7 +251,7 @@ static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid) */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { - return genlmsg_unicast(skb, info->snd_pid); + return genlmsg_unicast(genl_info_net(info), skb, info->snd_pid); } /** diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b34a6ee7375..3173d12d946 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -26,6 +26,7 @@ struct net_device; struct sock; struct ctl_table_header; struct net_generic; +struct sock; struct net { atomic_t count; /* To decided when the network @@ -57,6 +58,7 @@ struct net { spinlock_t rules_mod_lock; struct sock *rtnl; /* rtnetlink socket */ + struct sock *genl_sock; struct netns_core core; struct netns_mib mib; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 888adbcca30..ea8384d3caa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * Send taskstats data in @skb to listener with nl_pid @pid */ -static int send_reply(struct sk_buff *skb, pid_t pid) +static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); @@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid) return rc; } - return genlmsg_unicast(skb, pid); + return genlmsg_reply(skb, info); } /* @@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb, if (!skb_next) 
break; } - rc = genlmsg_unicast(skb_cur, s->pid); + rc = genlmsg_unicast(&init_net, skb_cur, s->pid); if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; @@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) goto err; } - rc = send_reply(rep_skb, info->snd_pid); + rc = send_reply(rep_skb, info); err: fput_light(file, fput_needed); @@ -487,7 +487,7 @@ free_return_rc: } else goto err; - return send_reply(rep_skb, info->snd_pid); + return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index 8dd7ed7e7c1..476b307bd80 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -115,7 +115,7 @@ static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info) genlmsg_end(msg, hdr); - return genlmsg_unicast(msg, info->snd_pid); + return genlmsg_reply(msg, info); err_out: nlmsg_free(msg); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7c1333c67ff..2d24d81474c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) } genlmsg_end(msg, reply); - ret = genlmsg_unicast(msg, info->snd_pid); + ret = genlmsg_reply(msg, info); goto out; nla_put_failure: diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index eed4c6a8afc..575c6434150 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -18,8 +18,6 @@ #include #include -struct sock *genl_sock = NULL; - static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static inline void genl_lock(void) @@ -175,10 +173,31 @@ int genl_register_mc_group(struct genl_family *family, mc_groups_longs++; } - err = netlink_change_ngroups(genl_sock, - mc_groups_longs * BITS_PER_LONG); - if (err) - goto out; + if (family->netnsok) { + struct net *net; + + rcu_read_lock(); + for_each_net_rcu(net) { + err = netlink_change_ngroups(net->genl_sock, + mc_groups_longs * BITS_PER_LONG); + if (err) { + /* + * No need to roll back, can only fail if + * memory allocation fails and then the + * number of _possible_ groups has been + * increased on some sockets which is ok. 
+ */ + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } else { + err = netlink_change_ngroups(init_net.genl_sock, + mc_groups_longs * BITS_PER_LONG); + if (err) + goto out; + } grp->id = id; set_bit(id, mc_groups); @@ -195,8 +214,14 @@ EXPORT_SYMBOL(genl_register_mc_group); static void __genl_unregister_mc_group(struct genl_family *family, struct genl_multicast_group *grp) { + struct net *net; BUG_ON(grp->family != family); - netlink_clear_multicast_users(genl_sock, grp->id); + + rcu_read_lock(); + for_each_net_rcu(net) + netlink_clear_multicast_users(net->genl_sock, grp->id); + rcu_read_unlock(); + clear_bit(grp->id, mc_groups); list_del(&grp->list); genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp); @@ -467,6 +492,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct genl_ops *ops; struct genl_family *family; + struct net *net = sock_net(skb->sk); struct genl_info info; struct genlmsghdr *hdr = nlmsg_data(nlh); int hdrlen, err; @@ -475,6 +501,10 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (family == NULL) return -ENOENT; + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; @@ -492,7 +522,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; genl_unlock(); - err = netlink_dump_start(genl_sock, skb, nlh, + err = netlink_dump_start(net->genl_sock, skb, nlh, ops->dumpit, ops->done); genl_lock(); return err; @@ -514,6 +544,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = family->attrbuf; + genl_info_net_set(&info, net); return ops->doit(skb, &info); } @@ -534,6 +565,7 @@ static struct genl_family genl_ctrl = { .name = "nlctrl", .version = 0x2, .maxattr = CTRL_ATTR_MAX, + .netnsok = true, }; static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, @@ -650,6 +682,7 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) int i, n = 0; struct genl_family *rt; + struct net *net = sock_net(skb->sk); int chains_to_skip = cb->args[0]; int fams_to_skip = cb->args[1]; @@ -658,6 +691,8 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) continue; n = 0; list_for_each_entry(rt, genl_family_chain(i), family_list) { + if (!rt->netnsok && !net_eq(net, &init_net)) + continue; if (++n < fams_to_skip) continue; if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, @@ -729,6 +764,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) if (info->attrs[CTRL_ATTR_FAMILY_ID]) { u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); res = genl_family_find_byid(id); + err = -ENOENT; } if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { @@ -736,49 +772,61 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); + err = -ENOENT; } - if (res == NULL) { - err = -ENOENT; - goto errout; + if (res == NULL) + return err; + + if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { + /* family doesn't exist here */ + return -ENOENT; } msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq, CTRL_CMD_NEWFAMILY); - if (IS_ERR(msg)) { - err = PTR_ERR(msg); - goto errout; - } + if (IS_ERR(msg)) + return PTR_ERR(msg); - err = genlmsg_reply(msg, info); -errout: - 
return err; + return genlmsg_reply(msg, info); } static int genl_ctrl_event(int event, void *data) { struct sk_buff *msg; + struct genl_family *family; + struct genl_multicast_group *grp; - if (genl_sock == NULL) + /* genl is still initialising */ + if (!init_net.genl_sock) return 0; switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: - msg = ctrl_build_family_msg(data, 0, 0, event); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL); + family = data; + msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: + grp = data; + family = grp->family; msg = ctrl_build_mcgrp_msg(data, 0, 0, event); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL); break; + default: + return -EINVAL; + } + + if (IS_ERR(msg)) + return PTR_ERR(msg); + + if (!family->netnsok) { + genlmsg_multicast_netns(&init_net, msg, 0, + GENL_ID_CTRL, GFP_KERNEL); + } else { + rcu_read_lock(); + genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC); + rcu_read_unlock(); } return 0; @@ -795,6 +843,33 @@ static struct genl_multicast_group notify_grp = { .name = "notify", }; +static int __net_init genl_pernet_init(struct net *net) +{ + /* we'll bump the group number right afterwards */ + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, + genl_rcv, &genl_mutex, + THIS_MODULE); + + if (!net->genl_sock && net_eq(net, &init_net)) + panic("GENL: Cannot initialize generic netlink\n"); + + if (!net->genl_sock) + return -ENOMEM; + + return 0; +} + +static void __net_exit genl_pernet_exit(struct net *net) +{ + netlink_kernel_release(net->genl_sock); + net->genl_sock = NULL; +} + +static struct pernet_operations genl_pernet_ops = { + .init = genl_pernet_init, + .exit = genl_pernet_exit, +}; + static int __init genl_init(void) { int i, err; @@ -804,36 +879,67 @@ static int __init genl_init(void) err = genl_register_family(&genl_ctrl); if (err < 0) - goto errout; + goto problem; err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops); if (err < 0) - goto errout_register; + goto problem; netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); - /* we'll bump the group number right afterwards */ - genl_sock = netlink_kernel_create(&init_net, NETLINK_GENERIC, 0, - genl_rcv, &genl_mutex, THIS_MODULE); - if (genl_sock == NULL) - panic("GENL: Cannot initialize generic netlink\n"); + err = register_pernet_subsys(&genl_pernet_ops); + if (err) + goto problem; err = genl_register_mc_group(&genl_ctrl, ¬ify_grp); if (err < 0) - goto errout_register; + goto problem; return 0; -errout_register: - genl_unregister_family(&genl_ctrl); -errout: +problem: panic("GENL: Cannot register controller: %d\n", err); } subsys_initcall(genl_init); -EXPORT_SYMBOL(genl_sock); EXPORT_SYMBOL(genl_register_ops); EXPORT_SYMBOL(genl_unregister_ops); EXPORT_SYMBOL(genl_register_family); EXPORT_SYMBOL(genl_unregister_family); + +static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group, + gfp_t flags) +{ + struct sk_buff *tmp; + struct net *net, *prev = NULL; + int err; + + for_each_net_rcu(net) { + if (prev) { + tmp = skb_clone(skb, flags); + if (!tmp) { + err = -ENOMEM; + goto error; + } + err = nlmsg_multicast(prev->genl_sock, tmp, + pid, group, flags); + if (err) + goto error; + } + + prev = net; + } + + return nlmsg_multicast(prev->genl_sock, skb, pid, group, flags); + error: + kfree_skb(skb); + return err; +} + +int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int 
group, + gfp_t flags) +{ + return genlmsg_mcast(skb, pid, group, flags); +} +EXPORT_SYMBOL(genlmsg_multicast_allns); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 3c57005e44d..7bda8e3d139 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -62,7 +62,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) rep_nlh = nlmsg_hdr(rep_buf); memcpy(rep_nlh, req_nlh, hdr_space); rep_nlh->nlmsg_len = rep_buf->len; - genlmsg_unicast(rep_buf, NETLINK_CB(skb).pid); + genlmsg_unicast(&init_net, rep_buf, NETLINK_CB(skb).pid); } return 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9deb12f73c4..2a04beba436 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -413,7 +413,7 @@ static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) cfg80211_unlock_rdev(dev); - return genlmsg_unicast(msg, info->snd_pid); + return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); @@ -739,7 +739,7 @@ static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) dev_put(netdev); cfg80211_unlock_rdev(dev); - return genlmsg_unicast(msg, info->snd_pid); + return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); @@ -1030,7 +1030,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) goto nla_put_failure; genlmsg_end(msg, hdr); - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; nla_put_failure: @@ -1618,7 +1618,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) dev, mac_addr, &sinfo) < 0) goto out_free; - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; out_free: @@ -2087,7 +2087,7 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info) dev, dst, next_hop, &pinfo) < 0) goto out_free; - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; out_free: @@ -2436,7 +2436,7 @@ static int nl80211_get_mesh_params(struct sk_buff *skb, cur_params.dot11MeshHWMPnetDiameterTraversalTime); nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; nla_put_failure: @@ -2624,7 +2624,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) nla_nest_end(msg, nl_reg_rules); genlmsg_end(msg, hdr); - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; nla_put_failure: -- cgit v1.2.3-70-g09d2 From da706d8bc833e7153622435560422e653bdb2e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Jul 2009 16:27:30 +0800 Subject: ring_buffer: Fix warning while ignoring cmpxchg return value kernel/trace/ring_buffer.c: In function 'rb_tail_page_update': kernel/trace/ring_buffer.c:849: warning: value computed is not used kernel/trace/ring_buffer.c:850: warning: value computed is not used Add "(void)"s to fix this warning, because we don't need here to handle the fail case of cmpxchg, it's fine if an interrupt already did the job. Changed from V1: Add a comment(which is written by Steven) for it. 
Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e648ba4f70e..51633d74a21 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -845,9 +845,14 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. */ - local_cmpxchg(&next_page->write, old_write, val); - local_cmpxchg(&next_page->entries, old_entries, eval); + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. -- cgit v1.2.3-70-g09d2 From 64fbcd162819bddaf0d99e78b16371b655aa5dee Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 15 Jul 2009 12:32:15 +0800 Subject: tracing/function: Simplify __ftrace_replace_code() Rewrite the __ftrace_replace_code() function, simplify it, but don't change the code's logic. First, we get the state we want to set, if the record has the same state, then do nothing, otherwise enable/disable it. Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 72 +++++++++++++-------------------------------------- 1 file changed, 18 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bce9e01a29c..217caeca71c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1017,71 +1017,35 @@ static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; - unsigned long ip, fl; + unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; - ip = rec->ip; - /* - * If this record is not to be traced and - * it is not enabled then do nothing. + * If this record is not to be traced or we want to disable it, + * then disable it. * - * If this record is not to be traced and - * it is enabled then disable it. + * If we want to enable it and filtering is off, then enable it. 
* + * If we want to enable it and filtering is on, enable it only if + * it's filtered */ - if (rec->flags & FTRACE_FL_NOTRACE) { - if (rec->flags & FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - return 0; - - } else if (ftrace_filtered && enable) { - /* - * Filtering is on: - */ - - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - - /* Record is filtered and enabled, do nothing */ - if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) - return 0; - - /* Record is not filtered or enabled, do nothing */ - if (!fl) - return 0; - - /* Record is not filtered but enabled, disable it */ - if (fl == FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - /* Otherwise record is filtered but not enabled, enable it */ - rec->flags |= FTRACE_FL_ENABLED; - } else { - /* Disable or not filtered */ - - if (enable) { - /* if record is enabled, do nothing */ - if (rec->flags & FTRACE_FL_ENABLED) - return 0; - - rec->flags |= FTRACE_FL_ENABLED; - - } else { + if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + flag = FTRACE_FL_ENABLED; + } - /* if record is not enabled, do nothing */ - if (!(rec->flags & FTRACE_FL_ENABLED)) - return 0; + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return 0; - rec->flags &= ~FTRACE_FL_ENABLED; - } + if (flag) { + rec->flags |= FTRACE_FL_ENABLED; + return ftrace_make_call(rec, ftrace_addr); } - if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, ftrace_addr); - else - return ftrace_make_nop(NULL, rec, ftrace_addr); + rec->flags &= ~FTRACE_FL_ENABLED; + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) -- cgit v1.2.3-70-g09d2 From 79173bf556417a737e9d2e096e0788452ec30a61 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2009 14:17:11 +0800 Subject: tracing/trace_stack: Cleanup for trace_lookup_stack() We can directly use %pF input format instead of sprint_symbol() and %s input format. 
Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stack.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af91012..a4dc8d9ad1b 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -234,15 +234,8 @@ static void t_stop(struct seq_file *m, void *p) static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - sprint_symbol(str, addr); - - return seq_printf(m, "%s\n", str); -#else - return seq_printf(m, "%p\n", (void*)addr); -#endif + return seq_printf(m, "%pF\n", (void *)addr); } static void print_disabled(struct seq_file *m) -- cgit v1.2.3-70-g09d2 From 6f2f3cf00ee32f75ba007a46bab88a54d68a5deb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2009 14:21:08 +0800 Subject: tracing/function: Cleanup for function tracer We can directly use %pf input format instead of kallsyms_lookup() and %s input format Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 17 +++-------------- kernel/trace/trace_functions.c | 4 +--- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 217caeca71c..80a97a51442 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1403,18 +1403,13 @@ static int t_hash_show(struct seq_file *m, void *v) { struct ftrace_func_probe *rec; struct hlist_node *hnd = v; - char str[KSYM_SYMBOL_LEN]; rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); - - kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); - seq_printf(m, "%s", str); + seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1512,7 +1507,6 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; if (iter->flags & FTRACE_ITER_HASH) return t_hash_show(m, v); @@ -1525,9 +1519,7 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", (void *)rec->ip); return 0; } @@ -2508,7 +2500,6 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { unsigned long *ptr = v; - char str[KSYM_SYMBOL_LEN]; if (!ptr) return 0; @@ -2518,9 +2509,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } - kallsyms_lookup(*ptr, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", v); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7402144bff2..b53dc994dfb 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -288,11 +288,9 @@ static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - char str[KSYM_SYMBOL_LEN]; long count = (long)data; - kallsyms_lookup(ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); + seq_printf(m, "%pf:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); -- cgit v1.2.3-70-g09d2 From 
566b0aaf798a0dddfc455d1a5b05c424c6686c65 Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Thu, 16 Jul 2009 21:44:26 +0200 Subject: tracing: Remove unused fields/variables Signed-off-by: Jiri Olsa Cc: rostedt@goodmis.org LKML-Reference: <1247773468-11594-2-git-send-email-jolsa@redhat.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 --- kernel/trace/trace.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6227dc80637..24e3ff53b24 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1339,7 +1339,6 @@ struct ftrace_iterator { unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; unsigned buffer_idx; - unsigned filtered; }; static void * @@ -2268,7 +2267,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } if (isspace(ch)) { - iter->filtered++; iter->buffer[iter->buffer_idx] = 0; ret = ftrace_process_regex(iter->buffer, iter->buffer_idx, enable); @@ -2399,7 +2397,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) iter = file->private_data; if (iter->buffer_idx) { - iter->filtered++; iter->buffer[iter->buffer_idx] = 0; ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60a49488b79..e30e6b1dbd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4265,7 +4265,6 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { - struct trace_array_cpu *data; int ring_buf_size; int i; int ret = -ENOMEM; @@ -4315,7 +4314,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); } -- cgit v1.2.3-70-g09d2 From e5d490b252423605a77c54b2e35b10ea663763df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Jul 2009 12:23:11 +0100 Subject: profile: Suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. 
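The underlying pattern is a quiet first attempt with a fallback: __GFP_NOWARN tells the page allocator not to warn when a request cannot be satisfied, because the caller handles the failure itself. A minimal sketch of that pattern, assuming a made-up alloc_prof_buffer() helper rather than the real profile_init() code:

/* Sketch of the "fail quietly, then fall back" pattern; names are illustrative. */
#include <linux/slab.h>
#include <linux/gfp.h>

static void *alloc_prof_buffer(size_t bytes)
{
	void *buf;

	/* __GFP_NOWARN: a too-large request fails without the page
	 * allocator's warning, because we handle the failure ourselves. */
	buf = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN);
	if (buf)
		return buf;

	/* Fall back to a physically contiguous allocation of the exact size. */
	return alloc_pages_exact(bytes, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
}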
Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Linux Memory Management List Cc: Heinz Diehl Cc: David Miller Cc: Arnaldo Carvalho de Melo Cc: Mel Gorman Cc: Andrew Morton LKML-Reference: <1247656992-19846-3-git-send-email-mel@csn.ul.ie> Signed-off-by: Ingo Molnar --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745e..419250ebec4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.2.3-70-g09d2 From e7aaaa6934636d7a6cadd9e2a05250fbb6a34f65 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Drop the need_resched() loop from cond_resched() The schedule() function is a loop that reschedules the current task while the TIF_NEED_RESCHED flag is set: void schedule(void) { need_resched: /* schedule code */ if (need_resched()) goto need_resched; } And cond_resched() repeat this loop: do { add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); } while(need_resched()); This loop is needless because schedule() already did the check and nothing can set TIF_NEED_RESCHED between schedule() exit and the loop check in need_resched(). Then remove this needless loop. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 03f7e3fd80b..4c5ee843d57 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6618,11 +6618,9 @@ static void __cond_resched(void) * PREEMPT_ACTIVE, which could trigger a second * cond_resched() call. */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) -- cgit v1.2.3-70-g09d2 From 4b2155678d7cc7b5f45d6b36049091376c3408a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Remove obsolete comment in __cond_resched() Remove the outdated comment from __cond_resched() related to the now removed Big Kernel Semaphore. Reported-by: Arnd Bergmann Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4c5ee843d57..4d39e96044b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6613,11 +6613,6 @@ static void __cond_resched(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); #endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. 
- */ add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); -- cgit v1.2.3-70-g09d2 From e09758fae8ccde97e026c704319eaa18d488dc86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Cover the CONFIG_DEBUG_SPINLOCK_SLEEP off-case for __might_sleep() Cover the off case for __might_sleep(), so that we avoid #ifdefs in files that make use of it. Especially, this prepares for the __might_sleep() pull up on cond_resched(). Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 1 + kernel/sched.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3e8de..b804f694ae6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -139,6 +139,7 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) #else + static inline void __might_sleep(char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index 4d39e96044b..370a6c31c5e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,9 +6610,8 @@ static inline int should_resched(void) static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); -#endif + add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); -- cgit v1.2.3-70-g09d2 From e4aafea2d4bde8b44d6500c4ee7195bbfc51269e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Add a preempt count base offset to __might_sleep() Add a preempt count base offset to compare against the current preempt level count. It prepares to pull up the might_sleep check from cond_resched() to cond_resched_lock() and cond_resched_bh(). For these two helpers, we need to respectively ensure that once we'll unlock the given spinlock / reenable local softirqs, we will reach a sleepable state. Signed-off-by: Frederic Weisbecker [ Move and rename preempt_count_equals() ] Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 6 +++--- kernel/sched.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b804f694ae6..2b5b1e0899a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -125,7 +125,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line); + void __might_sleep(char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -137,9 +137,9 @@ extern int _cond_resched(void); * supposed to. 
*/ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line) { } + static inline void __might_sleep(char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index 370a6c31c5e..3ff4d004bd9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,7 +6610,7 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__); + __might_sleep(__FILE__, __LINE__, 0); add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -9429,13 +9429,20 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() & ~PREEMPT_ACTIVE; + + return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; -- cgit v1.2.3-70-g09d2 From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Pull up the might_sleep() check into cond_resched() might_sleep() is called late-ish in cond_resched(), after the need_resched()/preempt enabled/system running tests are checked. It's better to check the sleeps while atomic earlier and not depend on some environment datas that reduce the chances to detect a problem. Also define cond_resched_*() helpers as macros, so that the FILE/LINE reported in the sleeping while atomic warning displays the real origin and not sched.h Changes in v2: - Call __might_sleep() directly instead of might_sleep() which may call cond_resched() - Turn cond_resched() into a macro so that the file:line couple reported refers to the caller of cond_resched() and not __cond_resched() itself. 
Changes in v3: - Also propagate this __might_sleep() pull up to cond_resched_lock() and cond_resched_softirq() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/dcache.c | 1 + include/linux/sched.h | 29 +++++++++++++++++++---------- kernel/sched.c | 12 +++++------- 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6b..a100fa35a48 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/include/linux/sched.h b/include/linux/sched.h index e2bdf18e05c..c41d424db88 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2286,17 +2286,26 @@ static inline int need_resched(void) */ extern int _cond_resched(void); -static inline int cond_resched(void) -{ - return _cond_resched(); -} +#define cond_resched() ({ \ + __might_sleep(__FILE__, __LINE__, 0); \ + _cond_resched(); \ +}) -extern int cond_resched_lock(spinlock_t * lock); -extern int cond_resched_softirq(void); -static inline int cond_resched_bkl(void) -{ - return _cond_resched(); -} +extern int __cond_resched_lock(spinlock_t *lock); + +#define cond_resched_lock(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __cond_resched_lock(lock); \ +}) + +extern int __cond_resched_softirq(void); + +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ + __cond_resched_softirq(); \ +}) + +#define cond_resched_bkl() cond_resched() /* * Does a critical section need to be broken due to another diff --git a/kernel/sched.c b/kernel/sched.c index 3ff4d004bd9..1f7919add8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,6 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__, 0); - add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); @@ -6628,14 +6626,14 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; @@ -6651,9 +6649,9 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -6665,7 +6663,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. 
-- cgit v1.2.3-70-g09d2 From 7d536cb3fb9993bdcd5a2fbaa6b0670ded4e101c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 16 Jul 2009 10:54:02 +0800 Subject: tracing/events: record the size of dynamic arrays When a dynamic array is defined, we add __data_loc_foo in trace_entry to record the offset of the array, but the size of the array is not recorded, which causes 2 problems: - the event filter just compares the first 2 chars of the strings. - parsers can't parse dynamic arrays. So we encode the size of each dynamic array in the higher 16 bits of __data_loc_foo, while the offset is in lower 16 bits. Signed-off-by: Li Zefan LKML-Reference: <4A5E964A.9000403@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 14 ++++++++------ kernel/trace/trace_events_filter.c | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index cc78943e003..3cbb96ef34f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -25,7 +25,7 @@ #define __array(type, item, len) type item[len]; #undef __dynamic_array -#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; +#define __dynamic_array(type, item, len) u32 __data_loc_##item; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -51,13 +51,14 @@ * Include the following: * * struct ftrace_data_offsets_ { - * int ; - * int ; + * u32 ; + * u32 ; * [...] * }; * - * The __dynamic_array() macro will create each int , this is + * The __dynamic_array() macro will create each u32 , this is * to keep the offset of each array from the beginning of the event. + * The size of an array is also encoded, in the higher 16 bits of . */ #undef __field @@ -67,7 +68,7 @@ #define __array(type, item, len) #undef __dynamic_array -#define __dynamic_array(type, item, len) int item; +#define __dynamic_array(type, item, len) u32 item; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -207,7 +208,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __get_dynamic_array #define __get_dynamic_array(field) \ - ((void *)__entry + __entry->__data_loc_##field) + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) @@ -325,6 +326,7 @@ ftrace_define_fields_##call(void) \ #define __dynamic_array(type, item, len) \ __data_offsets->item = __data_size + \ offsetof(typeof(*entry), __data); \ + __data_offsets->item |= (len * sizeof(type)) << 16; \ __data_size += (len) * sizeof(type); #undef __string diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b9aae72d13d..1c80ef702b8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -176,11 +176,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - unsigned short str_loc = *(unsigned short *)(event + pred->offset); + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = strncmp(addr, pred->str_val, str_len); match = (!cmp) ^ pred->not; -- cgit v1.2.3-70-g09d2 From ff4e9da2330beb8d64498a513d3f9694e941b01a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 22 Jun 2009 10:33:07 +0800 
Subject: tracing: cleanup for tracing_trace_options_read() '\n' is already appended, and what we need is just an extra space for the '\0'. Signed-off-by: Xiao Guangrong LKML-Reference: <4A3EED63.3090908@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e30e6b1dbd4..38a4a3ee749 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2256,8 +2256,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and newline */ } - /* +2 for \n and \0 */ - buf = kmalloc(len + 2, GFP_KERNEL); + /* +1 for \0 */ + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) { mutex_unlock(&trace_types_lock); return -ENOMEM; } @@ -2280,7 +2280,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 2); + WARN_ON(r >= len + 1); r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3-70-g09d2 From 1f9963cbb0280e0cd554161e00f1a0eeddbf1ae1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 20 Jul 2009 10:20:53 +0800 Subject: tracing/filters: improve subsystem filter Currently a subsystem filter must be applicable to all events under the subsystem; if it is not, all the event filters are cleared. This behavior makes the subsystem filter much less useful:

# echo 'vec == 1' > irq/softirq_entry/filter
# echo 'irq == 5' > irq/filter
bash: echo: write error: Invalid argument
# cat irq/softirq_entry/filter
none

I'd expect it to set the filter for irq_handler_entry/exit, and not touch softirq_entry/exit. The basic idea is: check which events the filter can be applied to, and then apply it only to those events:

# echo 'vec == 1' > softirq_entry/filter
# echo 'irq == 5' > filter
# cat irq_handler_entry/filter
irq == 5
# cat softirq_entry/filter
vec == 1

Changelog for v2: - do some cleanups to address Frederic's comments. 
Inspired-by: Steven Rostedt Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker LKML-Reference: <4A63D485.7030703@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 +- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 124 ++++++++++++++++++++++++------------- 3 files changed, 87 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5c093ffc655..26d3673d514 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -101,6 +101,8 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); +struct event_filter; + struct ftrace_event_call { struct list_head list; char *name; @@ -116,7 +118,7 @@ struct ftrace_event_call { int (*define_fields)(void); struct list_head fields; int filter_active; - void *filter; + struct event_filter *filter; void *mod; #ifdef CONFIG_EVENT_PROFILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 94305c7bc11..758b0dbed55 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -750,13 +750,14 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; + bool no_reset; }; struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - void *filter; + struct event_filter *filter; int nr_events; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1c80ef702b8..27c2dbea371 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -420,7 +420,14 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void filter_free_subsystem_preds(struct event_subsystem *system) +enum { + FILTER_DISABLE_ALL, + FILTER_INIT_NO_RESET, + FILTER_SKIP_NO_RESET, +}; + +static void filter_free_subsystem_preds(struct event_subsystem *system, + int flag) { struct ftrace_event_call *call; @@ -428,6 +435,14 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) if (!call->define_fields) continue; + if (flag == FILTER_INIT_NO_RESET) { + call->filter->no_reset = false; + continue; + } + + if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) + continue; + if (!strcmp(call->system, system->name)) { filter_disable_preds(call); remove_filter_string(call->filter); @@ -529,7 +544,8 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + bool dry_run) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -541,10 +557,12 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_AND) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_and); + fn = filter_pred_and; + goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_or); + fn = filter_pred_or; + goto add_pred_fn; } field = find_event_field(call, pred->field_name); @@ -567,9 +585,6 @@ static int filter_add_pred(struct filter_parse_state *ps, else fn = filter_pred_strloc; pred->str_len = field->size; - if (pred->op == OP_NE) - pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); } else { if (field->is_signed) ret = strict_strtoll(pred->str_val, 0, &val); @@ -580,27 +595,33 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } pred->val = val; - } - fn = 
select_comparison_fn(pred->op, field->size, field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } } if (pred->op == OP_NE) pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); +add_pred_fn: + if (!dry_run) + return filter_add_pred_fn(ps, call, pred, fn); + return 0; } static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct event_subsystem *system, struct filter_pred *pred, - char *filter_string) + char *filter_string, + bool dry_run) { struct ftrace_event_call *call; int err = 0; + bool fail = true; list_for_each_entry(call, &ftrace_events, list) { @@ -610,16 +631,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (strcmp(call->system, system->name)) continue; - err = filter_add_pred(ps, call, pred); - if (err) { - filter_free_subsystem_preds(system); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - goto out; - } - replace_filter_string(call->filter, filter_string); + if (call->filter->no_reset) + continue; + + err = filter_add_pred(ps, call, pred, dry_run); + if (err) + call->filter->no_reset = true; + else + fail = false; + + if (!dry_run) + replace_filter_string(call->filter, filter_string); } -out: - return err; + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return err; + } + return 0; } static void parse_init(struct filter_parse_state *ps, @@ -978,12 +1007,14 @@ static int check_preds(struct filter_parse_state *ps) static int replace_preds(struct event_subsystem *system, struct ftrace_event_call *call, struct filter_parse_state *ps, - char *filter_string) + char *filter_string, + bool dry_run) { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; int err; + int n_preds = 0; err = check_preds(ps); if (err) @@ -1002,19 +1033,14 @@ static int replace_preds(struct event_subsystem *system, continue; } + if (n_preds++ == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) - err = filter_add_pred(ps, call, pred); - else - err = filter_add_subsystem_pred(ps, system, - pred, filter_string); - filter_free_pred(pred); - if (err) - return err; - - operand1 = operand2 = NULL; - continue; + goto add_pred; } if (!operand1 || !operand2) { @@ -1023,11 +1049,12 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); +add_pred: if (call) - err = filter_add_pred(ps, call, pred); + err = filter_add_pred(ps, call, pred, false); else err = filter_add_subsystem_pred(ps, system, pred, - filter_string); + filter_string, dry_run); filter_free_pred(pred); if (err) return err; @@ -1068,7 +1095,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string); + err = replace_preds(NULL, call, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); @@ -1092,7 +1119,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); remove_filter_string(system->filter); mutex_unlock(&event_mutex); return 0; @@ -1103,7 +1130,6 @@ int 
apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); parse_init(ps, filter_ops, filter_string); @@ -1113,9 +1139,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - err = replace_preds(system, NULL, ps, filter_string); - if (err) + filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); + + /* try to see the filter can be applied to which events */ + err = replace_preds(system, NULL, ps, filter_string, true); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + + /* really apply the filter to the events */ + err = replace_preds(system, NULL, ps, filter_string, false); + if (err) { append_filter_err(ps, system->filter); + filter_free_subsystem_preds(system, 2); + } out: filter_opstack_clear(ps); -- cgit v1.2.3-70-g09d2 From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch. Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..61cc40887c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1317,6 +1318,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index db307a356f0..8c01dd3724e 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -45,6 +45,7 @@ #include #include "accommon.h" #include "actables.h" +#include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -342,6 +343,8 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..ff071e022a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.3-70-g09d2 From fbd90375d7531927d312766b548376d909811b4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 13:40:14 +0200 Subject: hrtimer: Remove cb_entry from struct hrtimer It's unused, remove it. 
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: --- include/linux/hrtimer.h | 2 -- kernel/hrtimer.c | 1 - 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 54648e625ef..40e7d54fc42 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -91,7 +91,6 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started * @start_comm: timer statistics field to store the name of the process which @@ -108,7 +107,6 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - struct list_head cb_entry; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 43d151f185b..052a0f53e4e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1092,7 +1092,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3-70-g09d2 From d8cc1ab793993c886c62abf77c93287df33ffd8b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:28:40 +0800 Subject: trace_stack: Fix seqfile memory leak Every time we cat stack_trace, we leak memory allocated by seq_open(). Signed-off-by: Li Zefan LKML-Reference: <4A67D8E8.3020500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af91012..6a2a9d484cd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &stack_trace_seq_ops); - - return ret; + return seq_open(file, &stack_trace_seq_ops); } static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; int -- cgit v1.2.3-70-g09d2 From 87827111a5538633b18e5c641ced673c4c2bb6ce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:11 +0800 Subject: function-graph: Fix seqfile memory leak Every time we cat set_graph_function, we leak memory allocated by seq_open(). 
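Both leaks come from the same seq_file contract: seq_open() allocates a struct seq_file and stores it in file->private_data, and it is only freed when the ->release handler calls seq_release(). A minimal sketch of the correct pairing, with made-up my_* names rather than the tracing code:

/* Sketch only; my_seq_ops would normally carry .start/.next/.stop/.show. */
#include <linux/fs.h>
#include <linux/seq_file.h>

static const struct seq_operations my_seq_ops;

static int my_open(struct inode *inode, struct file *file)
{
	/* seq_open() allocates a struct seq_file into file->private_data ... */
	return seq_open(file, &my_seq_ops);
}

static const struct file_operations my_fops = {
	.open	 = my_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	/* ... so ->release must call seq_release() to free it again. */
	.release = seq_release,
};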
Signed-off-by: Li Zefan LKML-Reference: <4A67D907.2010500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4521c77d1a1..1f3ec2afa51 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2595,6 +2595,14 @@ ftrace_graph_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { @@ -2724,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } static const struct file_operations ftrace_graph_fops = { - .open = ftrace_graph_open, - .read = seq_read, - .write = ftrace_graph_write, + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3-70-g09d2 From 636eacee3b0c76915151db37203cc624becb6d7b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:47 +0800 Subject: tracing/stat: Fix seqfile memory leak Every time we cat a trace_stat file, we leak memory allocated by seq_open(). Also fix memory leak in a failure path in tracing_stat_open(). Signed-off-by: Li Zefan LKML-Reference: <4A67D92B.4060704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e49334..aea321c82fa 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node) } } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session) { struct rb_node *node = session->stat_root.rb_node; @@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session) session->stat_root = RB_ROOT; } +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); - reset_stat_session(session); + __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); } @@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session) int i; mutex_lock(&session->stat_mutex); - reset_stat_session(session); + __reset_stat_session(session); if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; @@ -183,7 +190,7 @@ exit: return ret; exit_free_rbtree: - reset_stat_session(session); + __reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; } @@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = { static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - + struct seq_file *m; struct stat_session *session = inode->i_private; + ret = stat_seq_init(session); + if (ret) + return ret; + ret = seq_open(file, &trace_stat_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = session; - ret = stat_seq_init(session); + if (ret) { + reset_stat_session(session); + return ret; } + m = file->private_data; + m->private = session; return 
ret; } @@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f) { struct stat_session *session = i->i_private; - mutex_lock(&session->stat_mutex); reset_stat_session(session); - mutex_unlock(&session->stat_mutex); - return 0; + return seq_release(i, f); } static const struct file_operations tracing_stat_fops = { -- cgit v1.2.3-70-g09d2 From 4c739ff043e5787d97c9691d62cabf7a29e75a9d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:11:03 -0400 Subject: tracing: show proper address for trace-printk format Since the trace_printk may use pointers to the format fields in the buffer, they are exported via debugfs/tracing/printk_formats. This is used by utilities that read the ring buffer in binary format. It helps the utilities map the address of the format in the binary buffer to what the printf format looks like. Unfortunately, the way the output code works, it exports the address of the pointer to the format address, and not the format address itself. This makes the file totally useless in trying to figure out what format string a binary address belongs to. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7b627811082..687699d365a 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; - seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* * Tabs and new lines need to be converted. -- cgit v1.2.3-70-g09d2 From 8650ae32ef7045e763825dee6256dde7f331bb85 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:29:30 -0400 Subject: tracing: only truncate ftrace files when O_TRUNC is set The current code will truncate the ftrace files contents if O_APPEND is not set and the file is opened in write mode. This is incorrect. It should only truncate the file if O_TRUNC is set. Otherwise if one of these files is opened by a C program with fopen "r+", it will incorrectly truncate the file. 
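The reason "r+" matters is the way the C library maps fopen() modes to open(2) flags: "r+" becomes plain O_RDWR (neither O_APPEND nor O_TRUNC is set), while "w" becomes O_WRONLY|O_CREAT|O_TRUNC, so only the latter should clear state. A small userspace illustration (the path is illustrative):

#include <stdio.h>

int main(void)
{
	/* fopen(path, "r+") maps to open(path, O_RDWR): no O_APPEND, no
	 * O_TRUNC, so the kernel must not clear the existing contents.
	 * Only fopen(path, "w"), which maps to O_WRONLY|O_CREAT|O_TRUNC,
	 * should reset the ftrace state. */
	FILE *f = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "r+");

	if (f)
		fclose(f);
	return 0;
}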
Reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1f3ec2afa51..1e1d23c2630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1662,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { @@ -2577,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6..d8ef28574aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2031,7 +2031,7 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { long cpu = (long) inode->i_private; if (cpu == TRACE_PIPE_ALL_CPU) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a8..23d2972b22d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_clear_events(); seq_ops = inode->i_private; -- cgit v1.2.3-70-g09d2 From a004cd42181409eda70804ded240a791f4564d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 09:54:05 +0200 Subject: sched: Fix return value of migration_init() migration_init() returns the return value of the hotplug notifier. In the success case this is NOTIFY_OK, which is 1. initcall_debug evaluates that as an error code because init calls are expected to return 0 on success. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1f7919add8a..953f037dc05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7654,7 +7654,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif -- cgit v1.2.3-70-g09d2 From c94aa5ca3088018d2a7a9bd3258aefffe29df265 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Print the shortest dependency chain if finding a circle Currently lockdep prints only the first circle it detects, if one exists, when acquiring a new (next) lock. This patch prints the shortest path from the next lock to be acquired to the previously held lock if a circle is found. The patch still uses the current method to check for a circle; once the circle is found, a breadth-first search algorithm is used to compute the shortest path from the next lock to the previous lock in the forward lock dependency graph. Printing the shortest path shortens the dependency chain and makes troubleshooting possible circular locking problems easier. 
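The core of the approach is ordinary breadth-first search with parent pointers: walk the forward dependency edges level by level, remember the parent of each visited node, and once the target is reached replay the parent chain to obtain the shortest circle. A generic sketch, using a made-up struct node rather than the lockdep structures in the diff below:

/* Generic illustration only; 'struct node' and bfs_find() are not lockdep code. */
struct node {
	struct node **succ;	/* forward dependency edges */
	int nr_succ;
	struct node *parent;	/* set during the search */
	int seen;
};

static struct node *bfs_find(struct node *src, struct node *target)
{
	struct node *queue[4096];	/* bounded queue; no overflow check in this sketch */
	unsigned int head = 0, tail = 0;

	src->seen = 1;
	queue[tail++] = src;

	while (head < tail) {
		struct node *n = queue[head++];
		int i;

		if (n == target)
			return n;	/* walk ->parent to print the shortest chain */

		for (i = 0; i < n->nr_succ; i++) {
			struct node *s = n->succ[i];

			if (s->seen)
				continue;
			s->seen = 1;
			s->parent = n;
			queue[tail++] = s;
		}
	}
	return NULL;
}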
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-2-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 6 +++ kernel/lockdep.c | 115 +++++++++++++++++++++++++++++++++++++++++---- kernel/lockdep_internals.h | 83 ++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b25d1b53df0..9ec026f8d09 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -149,6 +149,12 @@ struct lock_list { struct lock_class *class; struct stack_trace trace; int distance; + + /*The parent field is used to implement breadth-first search,and + *the bit 0 is reused to indicate if the lock has been accessed + *in BFS. + */ + struct lock_list *parent; }; /* diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8bbeef996c7..93dc70d18cd 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -897,6 +897,79 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } +static struct circular_queue lock_cq; +static int __search_shortest_path(struct lock_list *source_entry, + struct lock_class *target, + struct lock_list **target_entry, + int forward) +{ + struct lock_list *entry; + struct circular_queue *cq = &lock_cq; + int ret = 1; + + __cq_init(cq); + + mark_lock_accessed(source_entry, NULL); + if (source_entry->class == target) { + *target_entry = source_entry; + ret = 0; + goto exit; + } + + __cq_enqueue(cq, (unsigned long)source_entry); + + while (!__cq_empty(cq)) { + struct lock_list *lock; + struct list_head *head; + + __cq_dequeue(cq, (unsigned long *)&lock); + + if (!lock->class) { + ret = -2; + goto exit; + } + + if (forward) + head = &lock->class->locks_after; + else + head = &lock->class->locks_before; + + list_for_each_entry(entry, head, entry) { + if (!lock_accessed(entry)) { + mark_lock_accessed(entry, lock); + if (entry->class == target) { + *target_entry = entry; + ret = 0; + goto exit; + } + + if (__cq_enqueue(cq, (unsigned long)entry)) { + ret = -1; + goto exit; + } + } + } + } +exit: + return ret; +} + +static inline int __search_forward_shortest_path(struct lock_list *src_entry, + struct lock_class *target, + struct lock_list **target_entry) +{ + return __search_shortest_path(src_entry, target, target_entry, 1); + +} + +static inline int __search_backward_shortest_path(struct lock_list *src_entry, + struct lock_class *target, + struct lock_list **target_entry) +{ + return __search_shortest_path(src_entry, target, target_entry, 0); + +} + /* * Recursive, forwards-direction lock-dependency checking, used for * both noncyclic checking and for hardirq-unsafe/softirq-unsafe @@ -934,7 +1007,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) { struct task_struct *curr = current; - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (debug_locks_silent) return 0; printk("\n=======================================================\n"); @@ -954,19 +1027,41 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) return 0; } -static noinline int print_circular_bug_tail(void) +static noinline int print_circular_bug(void) { struct task_struct *curr = current; struct lock_list this; + struct lock_list *target; + struct lock_list *parent; + int result; + unsigned long depth; - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; this.class = hlock_class(check_source); if 
(!save_trace(&this.trace)) return 0; - print_circular_bug_entry(&this, 0); + result = __search_forward_shortest_path(&this, + hlock_class(check_target), + &target); + if (result) { + printk("\n%s:search shortest path failed:%d\n", __func__, + result); + return 0; + } + + depth = get_lock_depth(target); + + print_circular_bug_header(target, depth); + + parent = get_lock_parent(target); + + while (parent) { + print_circular_bug_entry(parent, --depth); + parent = get_lock_parent(parent); + } printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); @@ -1072,14 +1167,15 @@ check_noncircular(struct lock_class *source, unsigned int depth) */ list_for_each_entry(entry, &source->locks_after, entry) { if (entry->class == hlock_class(check_target)) - return print_circular_bug_header(entry, depth+1); + return 2; debug_atomic_inc(&nr_cyclic_checks); - if (!check_noncircular(entry->class, depth+1)) - return print_circular_bug_entry(entry, depth+1); + if (check_noncircular(entry->class, depth+1) == 2) + return 2; } return 1; } + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of @@ -1484,8 +1580,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; - if (!(check_noncircular(hlock_class(next), 0))) - return print_circular_bug_tail(); + if (check_noncircular(hlock_class(next), 0) == 2) + return print_circular_bug(); + if (!check_prev_add_irq(curr, prev, next)) return 0; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 699a2ac3a0d..6f48d37d5be 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -136,3 +136,86 @@ extern atomic_t nr_find_usage_backwards_recursions; # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 #endif + +/* The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. 
+ * */ +#define MAX_CIRCULAR_QUE_SIZE 4096UL +struct circular_queue{ + unsigned long element[MAX_CIRCULAR_QUE_SIZE]; + unsigned int front, rear; +}; + +#define LOCK_ACCESSED 1UL +#define LOCK_ACCESSED_MASK (~LOCK_ACCESSED) + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1)%MAX_CIRCULAR_QUE_SIZE; + return 0; +} + +static inline int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front)%MAX_CIRCULAR_QUE_SIZE; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + lock->parent = (void *) parent + LOCK_ACCESSED; +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ + return (unsigned long)lock->parent & LOCK_ACCESSED; +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return (struct lock_list *) + ((unsigned long)child->parent & LOCK_ACCESSED_MASK); +} + +static inline unsigned long get_lock_depth(struct lock_list *child) +{ + unsigned long depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} -- cgit v1.2.3-70-g09d2 From d588e46155e9c51217b9840db1e94a0f594c1af2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Improve implementation of BFS 1,replace %MAX_CIRCULAR_QUE_SIZE with &(MAX_CIRCULAR_QUE_SIZE-1) since we define MAX_CIRCULAR_QUE_SIZE as power of 2; 2,use bitmap to mark if a lock is accessed in BFS in order to clear it quickly, because we may search a graph many times. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-3-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 23 ++++++++++++++++------- kernel/lockdep_internals.h | 35 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 93dc70d18cd..5dcca26a826 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,7 +42,7 @@ #include #include #include - +#include #include #include "lockdep_internals.h" @@ -118,7 +118,7 @@ static inline int debug_locks_off_graph_unlock(void) static int lockdep_initialized; unsigned long nr_list_entries; -static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; /* * All data structures here are protected by the global debug_lock. 
@@ -897,30 +897,38 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } +unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; static struct circular_queue lock_cq; + static int __search_shortest_path(struct lock_list *source_entry, struct lock_class *target, struct lock_list **target_entry, int forward) { struct lock_list *entry; + struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; - __cq_init(cq); - - mark_lock_accessed(source_entry, NULL); if (source_entry->class == target) { *target_entry = source_entry; ret = 0; goto exit; } + if (forward) + head = &source_entry->class->locks_after; + else + head = &source_entry->class->locks_before; + + if (list_empty(head)) + goto exit; + + __cq_init(cq); __cq_enqueue(cq, (unsigned long)source_entry); while (!__cq_empty(cq)) { struct lock_list *lock; - struct list_head *head; __cq_dequeue(cq, (unsigned long *)&lock); @@ -1040,6 +1048,7 @@ static noinline int print_circular_bug(void) return 0; this.class = hlock_class(check_source); + this.parent = NULL; if (!save_trace(&this.trace)) return 0; @@ -1580,10 +1589,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; + if (check_noncircular(hlock_class(next), 0) == 2) return print_circular_bug(); - if (!check_prev_add_irq(curr, prev, next)) return 0; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 6f48d37d5be..c2f6594966f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -137,23 +137,28 @@ extern atomic_t nr_find_usage_backwards_recursions; # define debug_atomic_read(ptr) 0 #endif + +extern unsigned long nr_list_entries; +extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +extern unsigned long bfs_accessed[]; + +/*For good efficiency of modular, we use power of 2*/ +#define MAX_CIRCULAR_QUE_SIZE 4096UL + /* The circular_queue and helpers is used to implement the * breadth-first search(BFS)algorithem, by which we can build * the shortest path from the next lock to be acquired to the * previous held lock if there is a circular between them. 
* */ -#define MAX_CIRCULAR_QUE_SIZE 4096UL struct circular_queue{ unsigned long element[MAX_CIRCULAR_QUE_SIZE]; unsigned int front, rear; }; -#define LOCK_ACCESSED 1UL -#define LOCK_ACCESSED_MASK (~LOCK_ACCESSED) - static inline void __cq_init(struct circular_queue *cq) { cq->front = cq->rear = 0; + bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); } static inline int __cq_empty(struct circular_queue *cq) @@ -163,7 +168,7 @@ static inline int __cq_empty(struct circular_queue *cq) static inline int __cq_full(struct circular_queue *cq) { - return ((cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE) == cq->front; + return ((cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1)) == cq->front; } static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) @@ -172,7 +177,7 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return -1; cq->element[cq->rear] = elem; - cq->rear = (cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE; + cq->rear = (cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1); return 0; } @@ -182,30 +187,36 @@ static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) return -1; *elem = cq->element[cq->front]; - cq->front = (cq->front + 1)%MAX_CIRCULAR_QUE_SIZE; + cq->front = (cq->front + 1)&(MAX_CIRCULAR_QUE_SIZE-1); return 0; } static inline int __cq_get_elem_count(struct circular_queue *cq) { - return (cq->rear - cq->front)%MAX_CIRCULAR_QUE_SIZE; + return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1); } static inline void mark_lock_accessed(struct lock_list *lock, struct lock_list *parent) { - lock->parent = (void *) parent + LOCK_ACCESSED; + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + lock->parent = parent; + set_bit(nr, bfs_accessed); } static inline unsigned long lock_accessed(struct lock_list *lock) { - return (unsigned long)lock->parent & LOCK_ACCESSED; + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + return test_bit(nr, bfs_accessed); } static inline struct lock_list *get_lock_parent(struct lock_list *child) { - return (struct lock_list *) - ((unsigned long)child->parent & LOCK_ACCESSED_MASK); + return child->parent; } static inline unsigned long get_lock_depth(struct lock_list *child) -- cgit v1.2.3-70-g09d2 From 9e2d551ea0d767c0d624965f0c273e942f4be536 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Introduce match function to BFS 1,introduce match() to BFS in order to make it usable to match different pattern; 2,also rename some functions to make them more suitable. 
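For illustration only (this sketch is not from any of the patches): a minimal user-space C model combining the two ideas above -- a power-of-two circular queue whose index arithmetic uses "& (SIZE - 1)" in place of "%", and a breadth-first search driven by a caller-supplied match() callback. The toy graph, sizes and names are invented; only the masking trick and the callback shape mirror the lockdep code.

/*
 * User-space sketch, not kernel code: power-of-two circular queue plus a
 * BFS parameterised by match().  Toy data; compile with any C compiler.
 */
#include <stdio.h>

#define CQ_SIZE 8UL                     /* must be a power of two */
#define CQ_MASK (CQ_SIZE - 1)

struct cq {
    unsigned long element[CQ_SIZE];
    unsigned int front, rear;
};

static void cq_init(struct cq *q)  { q->front = q->rear = 0; }
static int cq_empty(struct cq *q)  { return q->front == q->rear; }
static int cq_full(struct cq *q)   { return ((q->rear + 1) & CQ_MASK) == q->front; }

static int cq_enqueue(struct cq *q, unsigned long elem)
{
    if (cq_full(q))
        return -1;
    q->element[q->rear] = elem;
    q->rear = (q->rear + 1) & CQ_MASK;  /* mask replaces modulo */
    return 0;
}

static int cq_dequeue(struct cq *q, unsigned long *elem)
{
    if (cq_empty(q))
        return -1;
    *elem = q->element[q->front];
    q->front = (q->front + 1) & CQ_MASK;
    return 0;
}

/* Toy DAG: node i has nr_edges[i] successors listed in edges[i]. */
#define NR_NODES 6
static const int nr_edges[NR_NODES] = { 2, 1, 2, 1, 1, 0 };
static const int edges[NR_NODES][2] = { {1, 2}, {3, 0}, {3, 4}, {5, 0}, {5, 0}, {0, 0} };

/* BFS with the same callback shape as __bfs()/__bfs_forward(). */
static int bfs(int src, void *data,
               int (*match)(int node, void *data), int *target)
{
    struct cq q;
    unsigned long cur;
    int seen[NR_NODES] = { 0 };
    int i;

    if (match(src, data)) {
        *target = src;
        return 0;
    }

    cq_init(&q);
    seen[src] = 1;                      /* like mark_lock_accessed() */
    cq_enqueue(&q, (unsigned long)src);

    while (cq_dequeue(&q, &cur) == 0) {
        for (i = 0; i < nr_edges[cur]; i++) {
            int next = edges[cur][i];

            if (seen[next])
                continue;
            seen[next] = 1;
            if (match(next, data)) {
                *target = next;
                return 0;               /* found */
            }
            if (cq_enqueue(&q, (unsigned long)next) < 0)
                return -1;              /* queue overflow */
        }
    }
    return 1;                           /* not found */
}

static int node_equal(int node, void *data)
{
    return node == *(int *)data;
}

int main(void)
{
    int want = 5, found = -1;
    int ret = bfs(0, &want, node_equal, &found);

    printf("bfs returned %d, target node %d\n", ret, found);
    return 0;
}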
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-4-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5dcca26a826..ce6d09e65ad 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -900,17 +900,18 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; static struct circular_queue lock_cq; -static int __search_shortest_path(struct lock_list *source_entry, - struct lock_class *target, - struct lock_list **target_entry, - int forward) +static int __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) { struct lock_list *entry; struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; - if (source_entry->class == target) { + if (match(source_entry, data)) { *target_entry = source_entry; ret = 0; goto exit; @@ -945,7 +946,7 @@ static int __search_shortest_path(struct lock_list *source_entry, list_for_each_entry(entry, head, entry) { if (!lock_accessed(entry)) { mark_lock_accessed(entry, lock); - if (entry->class == target) { + if (match(entry, data)) { *target_entry = entry; ret = 0; goto exit; @@ -962,19 +963,21 @@ exit: return ret; } -static inline int __search_forward_shortest_path(struct lock_list *src_entry, - struct lock_class *target, - struct lock_list **target_entry) +static inline int __bfs_forward(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __search_shortest_path(src_entry, target, target_entry, 1); + return __bfs(src_entry, data, match, target_entry, 1); } -static inline int __search_backward_shortest_path(struct lock_list *src_entry, - struct lock_class *target, - struct lock_list **target_entry) +static inline int __bfs_backward(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __search_shortest_path(src_entry, target, target_entry, 0); + return __bfs(src_entry, data, match, target_entry, 0); } @@ -1035,6 +1038,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) return 0; } +static inline int class_equal(struct lock_list *entry, void *data) +{ + return entry->class == data; +} + static noinline int print_circular_bug(void) { struct task_struct *curr = current; @@ -1052,9 +1060,10 @@ static noinline int print_circular_bug(void) if (!save_trace(&this.trace)) return 0; - result = __search_forward_shortest_path(&this, - hlock_class(check_target), - &target); + result = __bfs_forward(&this, + hlock_class(check_target), + class_equal, + &target); if (result) { printk("\n%s:search shortest path failed:%d\n", __func__, result); -- cgit v1.2.3-70-g09d2 From db0002a32f31060ca900b533d93a074ddf7d5b61 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Implement check_noncircular() by BFS This patch uses BFS to implement check_noncircular() and prints the generated shortest circle if exists. 
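For illustration only: once the BFS records, for every visited node, the node it was reached from (what the patches store in lock_list->parent via mark_lock_accessed()), the shortest circle can be recovered by walking those parent links back from the matching entry. A user-space sketch with an invented, pre-filled parent[] array:

/*
 * User-space sketch: recover and print a BFS path from parent links.
 * The parent[] contents are invented, as if a BFS from node 0 had just
 * matched node 5; -1 marks the root (parent == NULL in lockdep).
 */
#include <stdio.h>

static const int parent[] = { -1, 0, 0, 1, 2, 3 };

static int get_depth(int node)
{
    int depth = 0;

    while (parent[node] >= 0) {         /* like get_lock_depth() */
        node = parent[node];
        depth++;
    }
    return depth;
}

static void print_chain(int target)
{
    int depth = get_depth(target);
    int node = target;

    /* Deepest entry first, root last - the reverse order used when
     * the circle is printed. */
    while (depth >= 0 && node >= 0) {
        printf("#%d: node %d\n", depth, node);
        node = parent[node];
        depth--;
    }
}

int main(void)
{
    print_chain(5);                     /* prints nodes 5, 3, 1, 0 */
    return 0;
}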
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-5-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 89 +++++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ce6d09e65ad..5609d309d56 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -985,12 +985,7 @@ static inline int __bfs_backward(struct lock_list *src_entry, * Recursive, forwards-direction lock-dependency checking, used for * both noncyclic checking and for hardirq-unsafe/softirq-unsafe * checking. - * - * (to keep the stackframe of the recursive functions small we - * use these global variables, and we also mark various helper - * functions as noinline.) */ -static struct held_lock *check_source, *check_target; /* * Print a dependency chain entry (this is only done when a deadlock @@ -1014,7 +1009,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) * header first: */ static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth) +print_circular_bug_header(struct lock_list *entry, unsigned int depth, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; @@ -1027,9 +1024,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); - print_lock(check_source); + print_lock(check_src); printk("\nbut task is already holding lock:\n"); - print_lock(check_target); + print_lock(check_tgt); printk("\nwhich lock already depends on the new lock.\n\n"); printk("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1043,36 +1040,24 @@ static inline int class_equal(struct lock_list *entry, void *data) return entry->class == data; } -static noinline int print_circular_bug(void) +static noinline int print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; - struct lock_list this; - struct lock_list *target; struct lock_list *parent; - int result; unsigned long depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - this.class = hlock_class(check_source); - this.parent = NULL; - if (!save_trace(&this.trace)) + if (!save_trace(&this->trace)) return 0; - result = __bfs_forward(&this, - hlock_class(check_target), - class_equal, - &target); - if (result) { - printk("\n%s:search shortest path failed:%d\n", __func__, - result); - return 0; - } - depth = get_lock_depth(target); - print_circular_bug_header(target, depth); + print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); @@ -1090,6 +1075,16 @@ static noinline int print_circular_bug(void) return 0; } +static noinline int print_bfs_bug(int ret) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN(1, "lockdep bfs error:%d\n", ret); + + return 0; +} + #define RECURSION_LIMIT 40 static int noinline print_infinite_recursion_bug(void) @@ -1168,31 +1163,17 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) * lead to . Print an error and return 0 if it does. 
*/ static noinline int -check_noncircular(struct lock_class *source, unsigned int depth) +check_noncircular(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) { - struct lock_list *entry; + int result; - if (lockdep_dependency_visit(source, depth)) - return 1; + debug_atomic_inc(&nr_cyclic_checks); - debug_atomic_inc(&nr_cyclic_check_recursions); - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == hlock_class(check_target)) - return 2; - debug_atomic_inc(&nr_cyclic_checks); - if (check_noncircular(entry->class, depth+1) == 2) - return 2; - } - return 1; -} + result = __bfs_forward(root, target, class_equal, target_entry); + return result; +} #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* @@ -1586,6 +1567,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, { struct lock_list *entry; int ret; + struct lock_list this; + struct lock_list *uninitialized_var(target_entry); /* * Prove that the new -> dependency would not @@ -1596,11 +1579,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * We are using global variables to control the recursion, to * keep the stackframe size of the recursive functions low: */ - check_source = next; - check_target = prev; - - if (check_noncircular(hlock_class(next), 0) == 2) - return print_circular_bug(); + this.class = hlock_class(next); + this.parent = NULL; + ret = check_noncircular(&this, hlock_class(prev), &target_entry); + if (unlikely(!ret)) + return print_circular_bug(&this, target_entry, next, prev); + else if (unlikely(ret < 0)) + return print_bfs_bug(ret); if (!check_prev_add_irq(curr, prev, next)) return 0; -- cgit v1.2.3-70-g09d2 From d7aaba140a09b7a2295aec061629c880f088de3d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Implement find_usage_*wards by BFS This patch uses BFS to implement find_usage_*wards(),which was originally writen by DFS. 
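For illustration only: the match function these new helpers hand to the BFS just tests one usage bit in a per-class mask, with the bit number smuggled through the generic void *data argument. The usage-state names below are invented; only the bit test mirrors usage_match():

/*
 * User-space sketch of usage_match(): test one usage bit in a mask.
 * The enum values are hypothetical; the kernel has its own lock_usage_bit.
 */
#include <stdio.h>

enum usage_bit {
    USED_IN_HARDIRQ,
    USED_IN_SOFTIRQ,
    ENABLED_HARDIRQS,
};

struct lock_class_model {
    const char *name;
    unsigned long usage_mask;           /* one bit per usage state */
};

/* Same shape as the callback passed to __bfs_forwards()/__bfs_backwards();
 * the kernel casts data back to its lock_usage_bit enum. */
static int usage_match(struct lock_class_model *class, void *bit)
{
    return class->usage_mask & (1UL << (unsigned long)bit);
}

int main(void)
{
    struct lock_class_model c = {
        .name = "example_lock",
        .usage_mask = (1UL << USED_IN_HARDIRQ) | (1UL << ENABLED_HARDIRQS),
    };

    printf("%s used in hardirq: %d\n", c.name,
           !!usage_match(&c, (void *)(unsigned long)USED_IN_HARDIRQ));
    printf("%s used in softirq: %d\n", c.name,
           !!usage_match(&c, (void *)(unsigned long)USED_IN_SOFTIRQ));
    return 0;
}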
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-6-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 180 ++++++++++++++++++++++--------------------------------- 1 file changed, 72 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5609d309d56..b3ade507c6c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -963,7 +963,7 @@ exit: return ret; } -static inline int __bfs_forward(struct lock_list *src_entry, +static inline int __bfs_forwards(struct lock_list *src_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) @@ -972,7 +972,7 @@ static inline int __bfs_forward(struct lock_list *src_entry, } -static inline int __bfs_backward(struct lock_list *src_entry, +static inline int __bfs_backwards(struct lock_list *src_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) @@ -1085,18 +1085,6 @@ static noinline int print_bfs_bug(int ret) return 0; } -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - unsigned long __lockdep_count_forward_deps(struct lock_class *class, unsigned int depth) { @@ -1170,7 +1158,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target, debug_atomic_inc(&nr_cyclic_checks); - result = __bfs_forward(root, target, class_equal, target_entry); + result = __bfs_forwards(root, target, class_equal, target_entry); return result; } @@ -1181,101 +1169,70 @@ check_noncircular(struct lock_list *root, struct lock_class *target, * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static enum lock_usage_bit find_usage_bit; static struct lock_class *forwards_match, *backwards_match; + +#define BFS_PROCESS_RET(ret) do { \ + if (ret < 0) \ + return print_bfs_bug(ret); \ + if (ret == 1) \ + return 1; \ + } while (0) + +static inline int usage_match(struct lock_list *entry, void *bit) +{ + return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +} + + + /* * Find a node in the forwards-direction dependency sub-graph starting - * at that matches . + * at @root->class that matches @bit. * - * Return 2 if such a node exists in the subgraph, and put that node - * into . + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. * - * Return 1 otherwise and keep unchanged. - * Return 0 on error. + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. 
*/ -static noinline int -find_usage_forwards(struct lock_class *source, unsigned int depth) +static int +find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) { - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); + int result; debug_atomic_inc(&nr_find_usage_forwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - forwards_match = source; - return 2; - } - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - debug_atomic_inc(&nr_find_usage_forwards_recursions); - ret = find_usage_forwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; + result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + + return result; } /* * Find a node in the backwards-direction dependency sub-graph starting - * at that matches . + * at @root->class that matches @bit. * - * Return 2 if such a node exists in the subgraph, and put that node - * into . + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. * - * Return 1 otherwise and keep unchanged. - * Return 0 on error. + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. */ -static noinline int -find_usage_backwards(struct lock_class *source, unsigned int depth) +static int +find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) { - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (!__raw_spin_is_locked(&lockdep_lock)) - return DEBUG_LOCKS_WARN_ON(1); - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); + int result; debug_atomic_inc(&nr_find_usage_backwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - backwards_match = source; - return 2; - } - if (!source && debug_locks_off_graph_unlock()) { - WARN_ON(1); - return 0; - } + result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_before, entry) { - debug_atomic_inc(&nr_find_usage_backwards_recursions); - ret = find_usage_backwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; + return result; } + static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, @@ -1343,18 +1300,21 @@ check_usage(struct task_struct *curr, struct held_lock *prev, enum lock_usage_bit bit_forwards, const char *irqclass) { int ret; + struct lock_list this; + struct lock_list *uninitialized_var(target_entry); + + this.parent = NULL; + + this.class = hlock_class(prev); + ret = find_usage_backwards(&this, bit_backwards, &target_entry); + BFS_PROCESS_RET(ret); + backwards_match = target_entry->class; + + this.class = hlock_class(next); + ret = find_usage_forwards(&this, bit_forwards, &target_entry); + BFS_PROCESS_RET(ret); + forwards_match = target_entry->class; - find_usage_bit = bit_backwards; - /* fills in */ - ret = find_usage_backwards(hlock_class(prev), 0); - if (!ret || ret == 1) - return ret; - - find_usage_bit = bit_forwards; - ret = find_usage_forwards(hlock_class(next), 0); - if (!ret || ret == 1) - return ret; - /* ret == 2 */ return 
print_bad_irq_dependency(curr, prev, next, bit_backwards, bit_forwards, irqclass); } @@ -2029,14 +1989,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); - find_usage_bit = bit; - /* fills in */ - ret = find_usage_forwards(hlock_class(this), 0); - if (!ret || ret == 1) - return ret; + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_forwards(&root, bit, &target_entry); + BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); + return print_irq_inversion_bug(curr, target_entry->class, + this, 1, irqclass); } /* @@ -2048,14 +2010,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); - find_usage_bit = bit; - /* fills in */ - ret = find_usage_backwards(hlock_class(this), 0); - if (!ret || ret == 1) - return ret; + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_backwards(&root, bit, &target_entry); + BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); + return print_irq_inversion_bug(curr, target_entry->class, + this, 1, irqclass); } void print_irqtrace_events(struct task_struct *curr) -- cgit v1.2.3-70-g09d2 From 24208ca76707581a097c01a73fd63781e73d3404 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Introduce print_shortest_lock_dependencies Since the shortest lock dependencies' path may be obtained by BFS, we print the shortest one by print_shortest_lock_dependencies(). Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-7-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 93 +++++++++++++++++++++++++++++++++------------- kernel/lockdep_internals.h | 4 +- 2 files changed, 69 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b3ade507c6c..938dc500f1a 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -575,6 +575,36 @@ static void print_lock_class_header(struct lock_class *class, int depth) print_ip_sym((unsigned long)class->key); } +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... 
acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad BFS generated tree\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; +} /* * printk all lock dependencies starting at : */ @@ -992,7 +1022,7 @@ static inline int __bfs_backwards(struct lock_list *src_entry, * has been detected): */ static noinline int -print_circular_bug_entry(struct lock_list *target, unsigned int depth) +print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) return 0; @@ -1047,7 +1077,7 @@ static noinline int print_circular_bug(struct lock_list *this, { struct task_struct *curr = current; struct lock_list *parent; - unsigned long depth; + int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; @@ -1169,7 +1199,6 @@ check_noncircular(struct lock_list *root, struct lock_class *target, * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static struct lock_class *forwards_match, *backwards_match; #define BFS_PROCESS_RET(ret) do { \ @@ -1235,6 +1264,10 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, static int print_bad_irq_dependency(struct task_struct *curr, + struct lock_list *prev_root, + struct lock_list *next_root, + struct lock_list *backwards_entry, + struct lock_list *forwards_entry, struct held_lock *prev, struct held_lock *next, enum lock_usage_bit bit1, @@ -1267,26 +1300,32 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_match); + print_lock_name(backwards_entry->class); printk("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_match->usage_traces + bit1, 1); + print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); printk("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_match); + print_lock_name(forwards_entry->class); printk("\n... 
which became %s-irq-unsafe at:\n", irqclass); printk("..."); - print_stack_trace(forwards_match->usage_traces + bit2, 1); + print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); - printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); - print_lock_dependencies(backwards_match, 0); + printk("\nthe dependencies between %s-irq-safe lock", irqclass); + printk(" and the holding lock:\n"); + if (!save_trace(&prev_root->trace)) + return 0; + print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); - print_lock_dependencies(forwards_match, 0); + printk("\nthe dependencies between the lock to be acquired"); + printk(" and %s-irq-unsafe lock:\n", irqclass); + if (!save_trace(&next_root->trace)) + return 0; + print_shortest_lock_dependencies(forwards_entry, next_root); printk("\nstack backtrace:\n"); dump_stack(); @@ -1300,22 +1339,24 @@ check_usage(struct task_struct *curr, struct held_lock *prev, enum lock_usage_bit bit_forwards, const char *irqclass) { int ret; - struct lock_list this; + struct lock_list this, that; struct lock_list *uninitialized_var(target_entry); + struct lock_list *uninitialized_var(target_entry1); this.parent = NULL; this.class = hlock_class(prev); ret = find_usage_backwards(&this, bit_backwards, &target_entry); BFS_PROCESS_RET(ret); - backwards_match = target_entry->class; - this.class = hlock_class(next); - ret = find_usage_forwards(&this, bit_forwards, &target_entry); + that.parent = NULL; + that.class = hlock_class(next); + ret = find_usage_forwards(&that, bit_forwards, &target_entry1); BFS_PROCESS_RET(ret); - forwards_match = target_entry->class; - return print_bad_irq_dependency(curr, prev, next, + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, bit_backwards, bit_forwards, irqclass); } @@ -1944,7 +1985,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * print irq inversion bug: */ static int -print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, +print_irq_inversion_bug(struct task_struct *curr, + struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, const char *irqclass) { @@ -1962,17 +2004,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other); + print_lock_name(other->class); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(hlock_class(this), 0); - - printk("\nthe second lock's dependencies:\n"); - print_lock_dependencies(other, 0); + printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + if (!save_trace(&root->trace)) + return 0; + print_shortest_lock_dependencies(other, root); printk("\nstack backtrace:\n"); dump_stack(); @@ -1997,7 +2038,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, ret = find_usage_forwards(&root, bit, &target_entry); BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, target_entry->class, + return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); } @@ 
-2018,7 +2059,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, ret = find_usage_backwards(&root, bit, &target_entry); BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, target_entry->class, + return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); } diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index c2f6594966f..b115aaa0bf3 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -219,9 +219,9 @@ static inline struct lock_list *get_lock_parent(struct lock_list *child) return child->parent; } -static inline unsigned long get_lock_depth(struct lock_list *child) +static inline int get_lock_depth(struct lock_list *child) { - unsigned long depth = 0; + int depth = 0; struct lock_list *parent; while ((parent = get_lock_parent(child))) { -- cgit v1.2.3-70-g09d2 From ef681026ff85c49b7399ceec3eb62bbbcce605e8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Implement lockdep_count_*ward_deps by BFS Implement lockdep_count_{for,back}ward using BFS. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-8-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 938dc500f1a..b896f23e00c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1115,61 +1115,59 @@ static noinline int print_bfs_bug(int ret) return 0; } -unsigned long __lockdep_count_forward_deps(struct lock_class *class, - unsigned int depth) +static int noop_count(struct lock_list *entry, void *data) { - struct lock_list *entry; - unsigned long ret = 1; + (*(unsigned long *)data)++; + return 0; +} - if (lockdep_dependency_visit(class, depth)) - return 0; +unsigned long __lockdep_count_forward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += __lockdep_count_forward_deps(entry->class, depth + 1); + __bfs_forwards(this, (void *)&count, noop_count, &target_entry); - return ret; + return count; } - unsigned long lockdep_count_forward_deps(struct lock_class *class) { unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; local_irq_save(flags); __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_forward_deps(class, 0); + ret = __lockdep_count_forward_deps(&this); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; } -unsigned long __lockdep_count_backward_deps(struct lock_class *class, - unsigned int depth) +unsigned long __lockdep_count_backward_deps(struct lock_list *this) { - struct lock_list *entry; - unsigned long ret = 1; + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); - if (lockdep_dependency_visit(class, depth)) - return 0; - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += __lockdep_count_backward_deps(entry->class, depth + 1); + __bfs_backwards(this, (void *)&count, noop_count, &target_entry); - return ret; + return count; } unsigned long lockdep_count_backward_deps(struct lock_class *class) { unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; 
local_irq_save(flags); __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_backward_deps(class, 0); + ret = __lockdep_count_backward_deps(&this); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); -- cgit v1.2.3-70-g09d2 From 4dd861d6467007681991d8ec079d928db2018cbb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Update memory usage introduced by BFS Also account the BFS memory usage. Signed-off-by: Ming Lei [ fix build for !PROVE_LOCKING ] Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-9-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b896f23e00c..6358cf7e84b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3429,7 +3429,11 @@ void __init lockdep_info(void) sizeof(struct list_head) * CLASSHASH_SIZE + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024 +#ifdef CONFIG_PROVE_LOCKING + + sizeof(struct circular_queue) + sizeof(bfs_accessed) +#endif + ); printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); -- cgit v1.2.3-70-g09d2 From 12f3dfd022d7e616757a94f0538d3d525d806a16 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Add statistics info for max bfs queue depth Add BFS statistics to the existing lockdep stats. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-10-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 6 +++++- kernel/lockdep_internals.h | 3 ++- kernel/lockdep_proc.c | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6358cf7e84b..744da6265d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -929,7 +929,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; static struct circular_queue lock_cq; - +unsigned int max_bfs_queue_depth; static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), @@ -975,6 +975,7 @@ static int __bfs(struct lock_list *source_entry, list_for_each_entry(entry, head, entry) { if (!lock_accessed(entry)) { + unsigned int cq_depth; mark_lock_accessed(entry, lock); if (match(entry, data)) { *target_entry = entry; @@ -986,6 +987,9 @@ static int __bfs(struct lock_list *source_entry, ret = -1; goto exit; } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } } diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index b115aaa0bf3..6baa8807efd 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -138,6 +138,7 @@ extern atomic_t nr_find_usage_backwards_recursions; #endif +extern unsigned int max_bfs_queue_depth; extern unsigned long nr_list_entries; extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; extern unsigned long bfs_accessed[]; @@ -191,7 +192,7 @@ static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) return 0; } -static inline int __cq_get_elem_count(struct circular_queue *cq) +static inline unsigned int __cq_get_elem_count(struct 
circular_queue *cq) { return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1); } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa2d2c..9a1bf34d2ff 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -411,6 +411,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) max_lockdep_depth); seq_printf(m, " max recursion depth: %11u\n", max_recursion_depth); + seq_printf(m, " max bfs queue depth: %11u\n", + max_bfs_queue_depth); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); -- cgit v1.2.3-70-g09d2 From af012961450949ea297b209e091bd1a3805b8a0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: BFS cleanup Some cleanups of the lockdep code after the BFS series: - Remove the last traces of the generation id - Fixup comment style - Move the bfs routines into lockdep.c - Cleanup the bfs routines [ tom.leiming@gmail.com: Fix crash ] Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-11-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 7 +- kernel/lockdep.c | 284 +++++++++++++++++++++++++++------------------ kernel/lockdep_internals.h | 97 +--------------- 3 files changed, 175 insertions(+), 213 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 9ec026f8d09..12aabfcb45f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -58,7 +58,6 @@ struct lock_class { struct lockdep_subclass_key *key; unsigned int subclass; - unsigned int dep_gen_id; /* * IRQ/softirq usage tracking bits: @@ -150,9 +149,9 @@ struct lock_list { struct stack_trace trace; int distance; - /*The parent field is used to implement breadth-first search,and - *the bit 0 is reused to indicate if the lock has been accessed - *in BFS. + /* + * The parent field is used to implement breadth-first search, and the + * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 744da6265d9..1cedb00e3e7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,6 +43,7 @@ #include #include #include + #include #include "lockdep_internals.h" @@ -118,7 +119,7 @@ static inline int debug_locks_off_graph_unlock(void) static int lockdep_initialized; unsigned long nr_list_entries; -struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; /* * All data structures here are protected by the global debug_lock. @@ -390,19 +391,6 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; unsigned int max_recursion_depth; -static unsigned int lockdep_dependency_gen_id; - -static bool lockdep_dependency_visit(struct lock_class *source, - unsigned int depth) -{ - if (!depth) - lockdep_dependency_gen_id++; - if (source->dep_gen_id == lockdep_dependency_gen_id) - return true; - source->dep_gen_id = lockdep_dependency_gen_id; - return false; -} - #ifdef CONFIG_DEBUG_LOCKDEP /* * We cannot printk in early bootup code. 
Not even early_printk() @@ -551,88 +539,6 @@ static void lockdep_print_held_locks(struct task_struct *curr) } } -static void print_lock_class_header(struct lock_class *class, int depth) -{ - int bit; - - printk("%*s->", depth, ""); - print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); - - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { - if (class->usage_mask & (1 << bit)) { - int len = depth; - - len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); - print_stack_trace(class->usage_traces + bit, len); - } - } - printk("%*s }\n", depth, ""); - - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); -} - -/* - * printk the shortest lock dependencies from @start to @end in reverse order: - */ -static void __used -print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) -{ - struct lock_list *entry = leaf; - int depth; - - /*compute depth from generated tree by BFS*/ - depth = get_lock_depth(leaf); - - do { - print_lock_class_header(entry->class, depth); - printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - - if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad BFS generated tree\n", __func__); - break; - } - - entry = get_lock_parent(entry); - depth--; - } while (entry && (depth >= 0)); - - return; -} -/* - * printk all lock dependencies starting at : - */ -static void __used -print_lock_dependencies(struct lock_class *class, int depth) -{ - struct lock_list *entry; - - if (lockdep_dependency_visit(class, depth)) - return; - - if (DEBUG_LOCKS_WARN_ON(depth >= 20)) - return; - - print_lock_class_header(class, depth); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (DEBUG_LOCKS_WARN_ON(!entry->class)) - return; - - print_lock_dependencies(entry->class, depth + 1); - - printk("%*s ... acquired at:\n",depth,""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - } -} - static void print_kernel_version(void) { printk("%s %.*s\n", init_utsname()->release, @@ -927,14 +833,106 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } -unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; -static struct circular_queue lock_cq; +/*For good efficiency of modular, we use power of 2*/ +#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) + +/* The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. 
+ * */ +struct circular_queue { + unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + unsigned int front, rear; +}; + +static struct circular_queue lock_cq; +static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; + unsigned int max_bfs_queue_depth; + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; + bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1) & CQ_MASK) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1) & CQ_MASK; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1) & CQ_MASK; + return 0; +} + +static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front) & CQ_MASK; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + lock->parent = parent; + set_bit(nr, bfs_accessed); +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + return test_bit(nr, bfs_accessed); +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return child->parent; +} + +static inline int get_lock_depth(struct lock_list *child) +{ + int depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} + static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int forward) + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) { struct lock_list *entry; struct list_head *head; @@ -1202,14 +1200,6 @@ check_noncircular(struct lock_list *root, struct lock_class *target, * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ - -#define BFS_PROCESS_RET(ret) do { \ - if (ret < 0) \ - return print_bfs_bug(ret); \ - if (ret == 1) \ - return 1; \ - } while (0) - static inline int usage_match(struct lock_list *entry, void *bit) { return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); @@ -1263,6 +1253,60 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, return result; } +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; + + printk("%*s->", depth, ""); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); + + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + len += printk("%*s %s", depth, "", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } + } + printk("%*s }\n", depth, ""); + + printk("%*s ... 
key at: ",depth,""); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad BFS generated tree\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; +} static int print_bad_irq_dependency(struct task_struct *curr, @@ -1349,12 +1393,18 @@ check_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = find_usage_backwards(&this, bit_backwards, &target_entry); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; that.parent = NULL; that.class = hlock_class(next); ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; return print_bad_irq_dependency(curr, &this, &that, target_entry, target_entry1, @@ -2038,7 +2088,10 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, bit, &target_entry); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); @@ -2059,7 +2112,10 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, bit, &target_entry); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 6baa8807efd..a2ee95ad131 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -91,6 +91,8 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +extern unsigned int max_bfs_queue_depth; + #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); @@ -136,98 +138,3 @@ extern atomic_t nr_find_usage_backwards_recursions; # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 #endif - - -extern unsigned int max_bfs_queue_depth; -extern unsigned long nr_list_entries; -extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; -extern unsigned long bfs_accessed[]; - -/*For good efficiency of modular, we use power of 2*/ -#define MAX_CIRCULAR_QUE_SIZE 4096UL - -/* The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. 
- * */ -struct circular_queue{ - unsigned long element[MAX_CIRCULAR_QUE_SIZE]; - unsigned int front, rear; -}; - -static inline void __cq_init(struct circular_queue *cq) -{ - cq->front = cq->rear = 0; - bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); -} - -static inline int __cq_empty(struct circular_queue *cq) -{ - return (cq->front == cq->rear); -} - -static inline int __cq_full(struct circular_queue *cq) -{ - return ((cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1)) == cq->front; -} - -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) -{ - if (__cq_full(cq)) - return -1; - - cq->element[cq->rear] = elem; - cq->rear = (cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1); - return 0; -} - -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) -{ - if (__cq_empty(cq)) - return -1; - - *elem = cq->element[cq->front]; - cq->front = (cq->front + 1)&(MAX_CIRCULAR_QUE_SIZE-1); - return 0; -} - -static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) -{ - return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1); -} - -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) -{ - unsigned long nr; - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); - lock->parent = parent; - set_bit(nr, bfs_accessed); -} - -static inline unsigned long lock_accessed(struct lock_list *lock) -{ - unsigned long nr; - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); - return test_bit(nr, bfs_accessed); -} - -static inline struct lock_list *get_lock_parent(struct lock_list *child) -{ - return child->parent; -} - -static inline int get_lock_depth(struct lock_list *child) -{ - int depth = 0; - struct lock_list *parent; - - while ((parent = get_lock_parent(child))) { - child = parent; - depth++; - } - return depth; -} -- cgit v1.2.3-70-g09d2 From bdff78707f3ce47e891f3201c9666122a70556ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 15:30:45 -0400 Subject: trace: stop tracer in oops_enter() If trace_printk_on_oops is set we lose interesting trace information when the tracer is enabled across oops handling and printing. We want the trace which might give us information _WHY_ we oopsed. Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 984b3ecbd72..512ab73b0ca 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -301,6 +301,7 @@ int oops_may_print(void) */ void oops_enter(void) { + tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); -- cgit v1.2.3-70-g09d2 From 6560dc160f3a96b8f1f43e2c6b51aa6eb9898b90 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 23 Jul 2009 23:42:08 +0930 Subject: module: use MODULE_SYMBOL_PREFIX with module_layout The check_modstruct_version() needs to look up the symbol "module_layout" in the kernel, but it does so literally and not by a C identifier. The trouble is that it does not include a symbol prefix for those ports that need it (like the Blackfin and H8300 port). So make sure we tack on the MODULE_SYMBOL_PREFIX define to the front of it. 
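For illustration only: the module_layout fix described above relies on nothing more than C's adjacent string-literal concatenation -- when the prefix macro expands to a string, placing it in front of another literal yields the prefixed symbol name at compile time. A user-space sketch with a hypothetical prefix value:

/*
 * User-space sketch of symbol-name prefixing via string concatenation.
 * The prefix value is hypothetical; ports that need no prefix define "".
 */
#include <stdio.h>

#define SYMBOL_PREFIX "_"               /* e.g. "" on x86, "_" on Blackfin/H8300 */

static int find_symbol(const char *name)
{
    /* Stand-in for the kernel's lookup: just show what was asked for. */
    printf("looking up \"%s\"\n", name);
    return 0;
}

int main(void)
{
    /* "_" "module_layout" collapses to "_module_layout" at compile time. */
    return find_symbol(SYMBOL_PREFIX "module_layout");
}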
Signed-off-by: Mike Frysinger Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0a049837008..fd141140355 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (!find_symbol("module_layout", NULL, &crc, true, false)) + if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "module_layout", mod, crc); } -- cgit v1.2.3-70-g09d2 From 9ae260270c90643156cda73427aa1f04c923e627 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jun 2009 02:51:13 +0200 Subject: update the comment in kthread_stop() Commit 63706172f332fd3f6e7458ebfb35fa6de9c21dc5 ("kthreads: rework kthread_stop()") removed the limitation that the thread function mysr not call do_exit() itself, but forgot to update the comment. Since that commit it is OK to use kthread_stop() even if kthread can exit itself. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/kthread.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de2697..eb8751aa041 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind); * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. -- cgit v1.2.3-70-g09d2 From 38ceb592fcac9110c6b3c87ea0a27bff68c43486 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:11:24 +0800 Subject: tracing: Fix invalid function_graph entry When print_graph_entry() computes a function call entry event, it needs to also check the next entry to guess if it matches the return event of the current function entry. In order to look at this next event, it needs to consume the current entry before going ahead in the ring buffer. However, if the current event that gets consumed is the last one in the ring buffer head page, the ring_buffer may reuse the page for writers. The consumed entry will then become invalid because of possible racy overwriting. Me must then handle this entry by making a copy of it. 
The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEAEC.3050508@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb5..420ec348757 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter) switch (entry->type) { case TRACE_GRAPH_ENT: { - struct ftrace_graph_ent_entry *field; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *field, saved; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter); + saved = *field; + return print_graph_entry(&saved, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; -- cgit v1.2.3-70-g09d2 From 74e7ff8c50b6b022e6ffaa736b16a4dc161d3eaf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:17:22 +0800 Subject: tracing: Fix missing function_graph events when we splice_read from trace_pipe About a half events are missing when we splice_read from trace_pipe. They are unexpectedly consumed because we ignore the TRACE_TYPE_NO_CONSUME return value used by the function graph tracer when it needs to consume the events by itself to walk on the ring buffer. The same problem appears with ftrace_dump() Example of an output before this patch: 1) | ktime_get_real() { 1) 2.846 us | read_hpet(); 1) 4.558 us | } 1) 6.195 us | } After this patch: 0) | ktime_get_real() { 0) | getnstimeofday() { 0) 1.960 us | read_hpet(); 0) 3.597 us | } 0) 5.196 us | } The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEC52.90704@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6..da984ad065a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3085,7 +3085,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) break; } - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); rem -= count; if (!find_next_entry_inc(iter)) { rem = 0; @@ -4233,8 +4234,11 @@ static void __ftrace_dump(bool disable_tracing) iter.pos = -1; if (find_next_entry_inc(&iter) != NULL) { - print_trace_line(&iter); - trace_consume(&iter); + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); } trace_printk_seq(&iter.seq); -- cgit v1.2.3-70-g09d2 From 933b787b57ca8bdc0fc8fb2cbf67b5e6d21beb84 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 29 Jul 2009 15:02:07 -0700 Subject: mm: copy over oom_adj value at fork time Fix a post-2.6.31 regression which was introduced by 2ff05b2b4eac2e63d345fc731ea151a060247f53 ("oom: move oom_adj value from task_struct to mm_struct"). After moving the oom_adj value from the task struct to the mm_struct, the oom_adj value was no longer properly inherited by child processes. Copying over the oom_adj value at fork time fixes that bug. 
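For illustration only: the one-line fix that follows inherits the parent's value while guarding against a missing mm -- kernel threads fork with current->mm == NULL, which is what the bracketed note below addresses. A user-space model of that inherit-or-default pattern (names and the default value are invented):

/*
 * User-space model of inheriting a per-mm setting at fork time.
 * Structure and field names are invented for the sketch.
 */
#include <stdio.h>
#include <stddef.h>

struct mm_model { int oom_adj; };
struct task_model { struct mm_model *mm; };

#define DEFAULT_OOM_ADJ 0

static void mm_init_model(struct mm_model *new_mm, const struct task_model *parent)
{
    /* Guard against parents without an mm (kernel threads). */
    new_mm->oom_adj = parent->mm ? parent->mm->oom_adj : DEFAULT_OOM_ADJ;
}

int main(void)
{
    struct mm_model parent_mm = { .oom_adj = 7 };
    struct task_model with_mm = { .mm = &parent_mm };
    struct task_model kthread = { .mm = NULL };
    struct mm_model child;

    mm_init_model(&child, &with_mm);
    printf("child of user task: oom_adj=%d\n", child.oom_adj);

    mm_init_model(&child, &kthread);
    printf("child of kernel thread: oom_adj=%d\n", child.oom_adj);
    return 0;
}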
[kosaki.motohiro@jp.fujitsu.com: test for current->mm before dereferencing it] Signed-off-by: Rik van Riel Reported-by: Paul Menage Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9b42695f0d1..29b532e718f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.2.3-70-g09d2 From 11c7da4b0ca76a57f51c996c883c480e203cf5a9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 29 Jul 2009 15:02:08 -0700 Subject: kexec: fix omitting offset in extended crashkernel syntax Setting "crashkernel=512M-2G:64M,2G-:128M" does not work but it turns to work if it has a trailing-whitespace, like "crashkernel=512M-2G:64M,2G-:128M ". It was because of a bug in the parser, running over the cmdline. This patch adds a check of the termination. Reported-by: Jin Dongming Signed-off-by: Hidetoshi Seto Tested-by: Jin Dongming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ae1c35201cc..f336e2107f9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline, } while (*cur++ == ','); if (*crash_size > 0) { - while (*cur != ' ' && *cur != '@') + while (*cur && *cur != ' ' && *cur != '@') cur++; if (*cur == '@') { cur++; -- cgit v1.2.3-70-g09d2 From 096b7fe012d66ed55e98bc8022405ede0cc80e96 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Jul 2009 15:04:04 -0700 Subject: cgroups: fix pid namespace bug The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844 ("cgroups: convert tasks file to use a seq_file with shared pid array"). We cache a pid array for all threads that are opening the same "tasks" file, but the pids in the array are always from the namespace of the last process that opened the file, so all other threads will read pids from that namespace instead of their own namespaces. To fix it, we maintain a list of pid arrays, which is keyed by pid_ns. The list will be of length 1 at most time. Reported-by: Paul Menage Idea-by: Paul Menage Signed-off-by: Li Zefan Reviewed-by: Serge Hallyn Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 11 +++--- kernel/cgroup.c | 96 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 76 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 665fa70e409..20411d2876f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -179,14 +179,11 @@ struct cgroup { */ struct list_head release_list; - /* pids_mutex protects the fields below */ + /* pids_mutex protects pids_list and cached pid arrays. 
*/ struct rw_semaphore pids_mutex; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the current tasks_pids array */ - int pids_use_count; - /* Length of the current tasks_pids array */ - int pids_length; + + /* Linked list of struct cgroup_pids */ + struct list_head pids_list; /* For RCU-protected deletion */ struct rcu_head rcu_head; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf..250dac05680 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pids_list); init_rwsem(&cgrp->pids_mutex); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2201,12 +2203,30 @@ err: return ret; } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { + /* The node in cgrp->pids_list */ + struct list_head list; + /* The cgroup those pids belong to */ + struct cgroup *cgrp; + /* The namepsace those pids belong to */ + struct pid_namespace *ns; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the this tasks_pids array */ + int use_count; + /* Length of the current tasks_pids array */ + int length; +}; + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } - /* * seq_file methods for the "tasks" file. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; int index = 0, pid = *pos; int *iter; down_read(&cgrp->pids_mutex); if (pid) { - int end = cgrp->pids_length; + int end = cp->length; while (index < end) { int mid = (index + end) / 2; - if (cgrp->tasks_pids[mid] == pid) { + if (cp->tasks_pids[mid] == pid) { index = mid; break; - } else if (cgrp->tasks_pids[mid] <= pid) + } else if (cp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cgrp->pids_length) + if (index >= cp->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cgrp->tasks_pids + index; + iter = cp->tasks_pids + index; *pos = *iter; return iter; } static void cgroup_tasks_stop(struct seq_file *s, void *v) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; up_read(&cgrp->pids_mutex); } static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; int *p = v; - int *end = cgrp->tasks_pids + cgrp->pids_length; + int *end = cp->tasks_pids + cp->length; /* * Advance to the next pid in the array. 
If this goes off the @@ -2286,26 +2308,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { .show = cgroup_tasks_show, }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp) { + struct cgroup *cgrp = cp->cgrp; + down_write(&cgrp->pids_mutex); - BUG_ON(!cgrp->pids_use_count); - if (!--cgrp->pids_use_count) { - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = NULL; - cgrp->pids_length = 0; + BUG_ON(!cp->use_count); + if (!--cp->use_count) { + list_del(&cp->list); + put_pid_ns(cp->ns); + kfree(cp->tasks_pids); + kfree(cp); } up_write(&cgrp->pids_mutex); } static int cgroup_tasks_release(struct inode *inode, struct file *file) { - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct seq_file *seq; + struct cgroup_pids *cp; if (!(file->f_mode & FMODE_READ)) return 0; - release_cgroup_pid_array(cgrp); + seq = file->private_data; + cp = seq->private; + + release_cgroup_pid_array(cp); return seq_release(inode, file); } @@ -2324,6 +2353,8 @@ static struct file_operations cgroup_tasks_operations = { static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct pid_namespace *ns = current->nsproxy->pid_ns; + struct cgroup_pids *cp; pid_t *pidarray; int npids; int retval; @@ -2350,20 +2381,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * array if necessary */ down_write(&cgrp->pids_mutex); - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = pidarray; - cgrp->pids_length = npids; - cgrp->pids_use_count++; + + list_for_each_entry(cp, &cgrp->pids_list, list) { + if (ns == cp->ns) + goto found; + } + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + up_write(&cgrp->pids_mutex); + kfree(pidarray); + return -ENOMEM; + } + cp->cgrp = cgrp; + cp->ns = ns; + get_pid_ns(ns); + list_add(&cp->list, &cgrp->pids_list); +found: + kfree(cp->tasks_pids); + cp->tasks_pids = pidarray; + cp->length = npids; + cp->use_count++; up_write(&cgrp->pids_mutex); file->f_op = &cgroup_tasks_operations; retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { - release_cgroup_pid_array(cgrp); + release_cgroup_pid_array(cp); return retval; } - ((struct seq_file *)file->private_data)->private = cgrp; + ((struct seq_file *)file->private_data)->private = cp; return 0; } -- cgit v1.2.3-70-g09d2 From 887032670d47366a8c8f25396ea7c14b7b2cc620 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Jul 2009 15:04:06 -0700 Subject: cgroup avoid permanent sleep at rmdir After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. 
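A minimal usage sketch of the new helper pair (this example is not part of the commit; the function charge_empty_cgroup() is a hypothetical stand-in for real subsystem code such as memcg's commit-charge paths):

/*
 * Sketch only: bracket work that may pin an otherwise-empty cgroup so
 * that a concurrent rmdir() waits for us instead of failing.
 */
static void charge_empty_cgroup(struct cgroup_subsys_state *css)
{
	/* takes a css reference; rmdir() of this cgroup will now sleep */
	cgroup_exclude_rmdir(css);

	/* ... work that may charge or reference the empty cgroup ... */

	/* drops the reference and wakes any rmdir() sleeping on the flag */
	cgroup_release_and_wakeup_rmdir(css);
}

With this pattern the racing rmdir() sleeps on cgroup_rmdir_waitq while CGRP_WAIT_ON_RMDIR is set and is woken by the release call, rather than returning -EBUSY too often.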
Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Reviewed-by: Paul Menage Acked-by: Balbir Sigh Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 17 ++++++++++++++++ kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++---------------- mm/memcontrol.c | 23 ++++++++++++++++++--- 3 files changed, 75 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 20411d2876f..90bba9e6228 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,6 +362,23 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if cgrp is a descendant of the task's cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); +/* + * When the subsys has to access css and may add permanent refcnt to css, + * it should take care of racy conditions with rmdir(). Following set of + * functions, is for stop/restart rmdir if necessary. + * Because these will call css_get/put, "css" should be alive css. + * + * cgroup_exclude_rmdir(); + * ...do some jobs which may access arbitrary empty cgroup + * cgroup_release_and_wakeup_rmdir(); + * + * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, + * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. + */ + +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); + /* * Control Group subsystem type. * See Documentation/cgroups/cgroups.txt for details diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac05680..b6eadfe30e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * reference to css->refcnt. In general, this refcnt is expected to goes down * to zero, soon. * - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) wake_up_all(&cgroup_rmdir_waitq); } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ + css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ + cgroup_wakeup_rmdir_waiter(css->cgroup); + css_put(css); +} + + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * wake up rmdir() waiter. the rmdir should fail since the cgroup * is no longer empty. */ - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); return 0; } @@ -2743,34 +2755,43 @@ again: } mutex_unlock(&cgroup_mutex); + /* + * In general, subsystem has no css->refcnt after pre_destroy(). But + * in racy cases, subsystem may have to get css->refcnt after + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir + * and subsystem's reference count handling. Please see css_get/put + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. 
+ */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + /* * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. */ ret = cgroup_call_pre_destroy(cgrp); - if (ret) + if (ret) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); return ret; + } mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - /* - * css_put/get is provided for subsys to grab refcnt to css. In typical - * case, subsystem has no reference after pre_destroy(). But, under - * hierarchy management, some *temporal* refcnt can be hold. - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys - * is really busy, it should return -EBUSY at pre_destroy(). wake_up - * is called when css_put() is called and refcnt goes down to 0. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); - schedule(); + /* + * Because someone may call cgroup_wakeup_rmdir_waiter() before + * prepare_to_wait(), we need to check this flag. + */ + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) + schedule(); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); if (signal_pending(current)) @@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e717964cb5a..fd4529d86de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. 
+ * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* -- cgit v1.2.3-70-g09d2 From b62f495dad04fa94b5083aec638ff3072bccaaca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:04:09 -0700 Subject: profile: suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. Signed-off-by: Mel Gorman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745e..419250ebec4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.2.3-70-g09d2 From ec30c5f3a18722f8fcf8c83146a10b03ac4d9ff1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2009 19:47:23 -0400 Subject: kprobes: Use kernel_text_address() for checking probe address Use kernel_text_address() for checking probe address instead of __kernel_text_address(), because __kernel_text_address() returns true for init functions even after relaseing those functions. That will hit a BUG() in text_poke(). Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516..0540948e29a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -694,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p) p->addr = addr; preempt_disable(); - if (!__kernel_text_address((unsigned long) p->addr) || + if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { preempt_enable(); return -EINVAL; -- cgit v1.2.3-70-g09d2 From 0083fc2c50e6c5127c2802ad323adf8143ab7856 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 10:34:56 -0700 Subject: do_sigaltstack: avoid copying 'stack_t' as a structure to user space Ulrich Drepper correctly points out that there is generally padding in the structure on 64-bit hosts, and that copying the structure from kernel to user space can leak information from the kernel stack in those padding bytes. Avoid the whole issue by just copying the three members one by one instead, which also means that the function also can avoid the need for a stack frame. This also happens to match how we copy the new structure from user space, so it all even makes sense. [ The obvious solution of adding a memset() generates horrid code, gcc does really stupid things. 
] Reported-by: Ulrich Drepper Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ccf1ceedaeb..f268372c0cc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s stack_t oss; int error; - if (uoss) { - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); - } + oss.ss_sp = (void __user *) current->sas_ss_sp; + oss.ss_size = current->sas_ss_size; + oss.ss_flags = sas_ss_flags(sp); if (uss) { void __user *ss_sp; @@ -2501,13 +2499,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_size = ss_size; } + error = 0; if (uoss) { error = -EFAULT; - if (copy_to_user(uoss, &oss, sizeof(oss))) + if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) goto out; + error = __put_user(oss.ss_sp, &uoss->ss_sp) | + __put_user(oss.ss_size, &uoss->ss_size) | + __put_user(oss.ss_flags, &uoss->ss_flags); } - error = 0; out: return error; } -- cgit v1.2.3-70-g09d2 From 0dd8486b5cfe8048e0613334659d9252ecd1b08a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 11:18:56 -0700 Subject: do_sigaltstack: small cleanups The previous commit ("do_sigaltstack: avoid copying 'stack_t' as a structure to user space") fixed a real bug. This one just cleans up the copy from user space to that gcc can generate better code for it (and so that it looks the same as the later copy back to user space). Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f268372c0cc..64c5deeaca5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2464,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s int ss_flags; error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) - || __get_user(ss_sp, &uss->ss_sp) - || __get_user(ss_flags, &uss->ss_flags) - || __get_user(ss_size, &uss->ss_size)) + if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) + goto out; + error = __get_user(ss_sp, &uss->ss_sp) | + __get_user(ss_flags, &uss->ss_flags) | + __get_user(ss_size, &uss->ss_size); + if (error) goto out; error = -EPERM; -- cgit v1.2.3-70-g09d2 From c1dc0b9c0c8979ce4d411caadff5c0d79dee58bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Aug 2009 11:28:21 +0200 Subject: debug lockups: Improve lockup detection When debugging a recent lockup bug i found various deficiencies in how our current lockup detection helpers work: - SysRq-L is not very efficient as it uses a workqueue, hence it cannot punch through hard lockups and cannot see through most soft lockups either. - The SysRq-L code depends on the NMI watchdog - which is off by default. - We dont print backtraces from the RCU code's built-in 'RCU state machine is stuck' debug code. This debug code tends to be one of the first (and only) mechanisms that show that a lockup has occured. 
This patch changes the code so that we: - Trigger the NMI backtrace code from SysRq-L instead of using a workqueue (which cannot punch through hard lockups) - Trigger print-all-CPU-backtraces from the RCU lockup detection code Also decouple the backtrace printing code from the NMI watchdog: - Don't use variable size cpumasks (it might not be initialized and they are a bit more fragile anyway) - Trigger an NMI immediately via an IPI, instead of waiting for the NMI tick to occur. This is a lot faster and can produce more relevant backtraces. It will also work if the NMI watchdog is disabled. - Don't print the 'dazed and confused' message when we print a backtrace from the NMI - Do a show_regs() plus a dump_stack() to get maximum info out of the dump. Worst-case we get two stacktraces - which is not a big deal. Sometimes, if register content is corrupted, the precise stack walker in show_regs() won't give us a full backtrace - in this case dump_stack() will do it. Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 18 ++++++++++++------ drivers/char/sysrq.c | 8 ++------ kernel/rcutree.c | 7 ++++++- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..1bb1ac20e9e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check.
*/ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -556,10 +558,14 @@ void __trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5d7a02f63e1..165f307f30e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -222,12 +223,7 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - struct pt_regs *regs = get_irq_regs(); - if (regs) { - printk(KERN_INFO "CPU%d:\n", smp_processor_id()); - show_regs(regs); - } - schedule_work(&sysrq_showallcpus); + trigger_all_cpu_backtrace(); } static struct sysrq_key_op sysrq_showallcpus_op = { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95c202..9c5fa9fc57e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); + trigger_all_cpu_backtrace(); + force_quiescent_state(rsp, 0); /* Kick them all. */ } @@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp) printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", smp_processor_id(), jiffies - rsp->gp_start); - dump_stack(); + trigger_all_cpu_backtrace(); + spin_lock_irqsave(&rnp->lock, flags); if ((long)(jiffies - rsp->jiffies_stall) >= 0) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; spin_unlock_irqrestore(&rnp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ } -- cgit v1.2.3-70-g09d2 From e53c0994709166b111fbe9162d1a16ece7dfc45b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 14:42:10 +0200 Subject: perf_counter: Collapse inherit on read() Currently the counter value returned by read() is the value of the parent counter, to which child counters are only fed back on child exit. Thus read() can return rather erratic (and meaningless) numbers depending on the state of the child processes. Change this by always iterating the full child hierarchy on read() and sum all counters. 
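To illustrate the user-visible effect (this fragment is not from the commit; counter_fd is assumed to come from sys_perf_counter_open() with attr.inherit set and no read_format flags requested), a single read() on the parent counter now returns the sum over the parent and all of its inherited children:

/* Illustration only: read the collapsed parent + children count. */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void print_total_count(int counter_fd)
{
	uint64_t value;

	/* with no read_format flags set, the kernel returns a single u64 */
	if (read(counter_fd, &value, sizeof(value)) == sizeof(value))
		printf("total count: %llu\n", (unsigned long long)value);
}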
Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 95093104195..48471d75ae0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1688,6 +1688,18 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +static u64 perf_counter_read_tree(struct perf_counter *counter) +{ + struct perf_counter *child; + u64 total = 0; + + total += perf_counter_read(counter); + list_for_each_entry(child, &counter->child_list, child_list) + total += perf_counter_read(child); + + return total; +} + /* * Read the performance counter - simple non blocking version for now */ @@ -1707,7 +1719,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read(counter); + values[0] = perf_counter_read_tree(counter); n = 1; if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + -- cgit v1.2.3-70-g09d2 From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 14:46:33 +0200 Subject: perf_counter: Full task tracing In order to be able to distinguish between no samples due to inactivity and no samples due to task ended, Arjan asked for PERF_EVENT_EXIT events. This is useful to the boot delay instrumentation (bootchart) app. This patch changes the PERF_EVENT_FORK to be emitted on every clone, and adds PERF_EVENT_EXIT to be emitted on task exit, after the task's counters have been closed. This task tracing is controlled through: attr.comm || attr.mmap and through the new attr.task field. 
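As an illustration of how a tool would opt in (this snippet is not part of the commit; the software event type and config used here are assumptions made only for the example), the attribute setup passed to sys_perf_counter_open() would look roughly like this:

/* Sketch only: request the new task (fork/exit) events plus comm/mmap. */
#include <linux/perf_counter.h>		/* assumed location of struct perf_counter_attr */

struct perf_counter_attr attr = {
	.size	= sizeof(attr),
	.type	= PERF_TYPE_SOFTWARE,		/* assumption for the example */
	.config	= PERF_COUNT_SW_TASK_CLOCK,	/* assumption for the example */
	.task	= 1,	/* new flag: emit PERF_EVENT_FORK and PERF_EVENT_EXIT */
	.comm	= 1,
	.mmap	= 1,
};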
Suggested-by: Arjan van de Ven Cc: Paul Mackerras Cc: Anton Blanchard Signed-off-by: Peter Zijlstra [ cleaned up perf_counter.h a bit ] Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++- kernel/fork.c | 4 +- kernel/perf_counter.c | 87 +++++++++++++++++++++++++++++--------------- 3 files changed, 71 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index bd15d7a5f5c..e604e6ef72d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -181,8 +181,9 @@ struct perf_counter_attr { freq : 1, /* use freq, not period */ inherit_stat : 1, /* per task counts */ enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ - __reserved_1 : 51; + __reserved_1 : 50; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -308,6 +309,15 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * }; + */ + PERF_EVENT_EXIT = 4, + /* * struct { * struct perf_event_header header; @@ -323,6 +333,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, ppid; + * u32 tid, ptid; * }; */ PERF_EVENT_FORK = 7, diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f..466531eb92c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + perf_counter_fork(p); return p; bad_fork_free_pid: @@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if (!(clone_flags & CLONE_THREAD)) - perf_counter_fork(p); - audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48471d75ae0..199ed477131 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; /* * perf counter paranoia level: @@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_mmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); } if (counter->destroy) @@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter, } /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task */ -struct perf_fork_event { +struct perf_task_event { struct task_struct *task; struct { @@ -2842,37 +2847,42 @@ struct perf_fork_event { u32 pid; u32 ppid; + u32 tid; + u32 ptid; } event; }; -static void perf_counter_fork_output(struct perf_counter *counter, - struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = fork_event->event.header.size; - struct task_struct *task = fork_event->task; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; - fork_event->event.pid = perf_counter_pid(counter, task); - fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + 
task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, task->real_parent); - perf_output_put(&handle, fork_event->event); + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + + perf_output_put(&handle, task_event->event); perf_output_end(&handle); } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap) + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) return 1; return 0; } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, - struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) { struct perf_counter *counter; @@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_fork_match(counter)) - perf_counter_fork_output(counter, fork_event); + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); } rcu_read_unlock(); } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event) */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_counter_fork_ctx(ctx, fork_event); + perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, int new) { - struct perf_fork_event fork_event; + struct perf_task_event task_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters)) + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) return; - fork_event = (struct perf_fork_event){ + task_event = (struct perf_task_event){ .task = task, .event = { .header = { - .type = PERF_EVENT_FORK, + .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, - .size = sizeof(fork_event.event), + .size = sizeof(task_event.event), }, /* .pid */ /* .ppid */ + /* .tid */ + /* .ptid */ }, }; - perf_counter_fork_event(&fork_event); + perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ + perf_counter_task(task, 1); } /* @@ -3887,6 +3905,8 @@ done: atomic_inc(&nr_mmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); } return counter; @@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, 0); return; + } local_irq_save(flags); /* @@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. 
*/ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the counters from it. */ unclone_ctx(child_ctx); - spin_unlock(&child_ctx->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, 0); + + child->perf_counter_ctxp = NULL; /* * We can recurse on the same lock type through: -- cgit v1.2.3-70-g09d2 From e414314cce7539788dd5d2c35decad11782dd858 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 20:13:26 +0200 Subject: sched: Fix latencytop and sleep profiling vs group scheduling The latencytop and sleep accounting code assumes that any scheduler entity represents a task, this is not so. Cc: Arjan van de Ven Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9ffb2b2ceba..652e8bdef9a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -611,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -624,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - account_scheduler_latency(tsk, delta >> 10, 1); + if (tsk) + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -639,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; - /* - * Blocking time is in units of nanosecs, so shift by 20 to - * get a milliseconds-range estimation of the amount of - * time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - - profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), - delta >> 20); + if (tsk) { + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); } - account_scheduler_latency(tsk, delta >> 10, 0); } #endif } -- cgit v1.2.3-70-g09d2 From 07903af152b0597d94e9b0030746b63c4664e787 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:28 -0400 Subject: sched: Fix race in cpupri introduced by cpumask_var changes Background: Several race conditions in the scheduler have cropped up recently, which Steven and I have tracked down using ftrace. 
The most recent one turns out to be a race in how the scheduler determines a suitable migration target for RT tasks, introduced recently with commit: commit 68e74568fbe5854952355e942acca51f138096d9 Date: Tue Nov 25 02:35:13 2008 +1030 sched: convert struct cpupri_vec cpumask_var_t. The original design of cpupri allowed lockless readers to quickly determine a best-estimate target. Races between the pri_active bitmap and the vec->mask were handled in the original code because we would detect and return "0" when this occured. The design was predicated on the *effective* atomicity (*) of caching the result of cpus_and() between the cpus_allowed and the vec->mask. Commit 68e74568 changed the behavior such that vec->mask is accessed multiple times. This introduces a subtle race, the result of which means we can have a result that returns "1", but with an empty bitmap. *) yes, we know cpus_and() is not a locked operator across the entire composite array, but it is implicitly atomic on a per-word basis which is all the design required to work. Implementation: Rather than forgoing the lockless design, or reverting to a stack-based cpumask_t, we simply check for when the race has been encountered and continue processing in the event that the race is hit. This renders the removal race as if the priority bit had been atomically cleared as well, and allows the algorithm to execute correctly. Signed-off-by: Gregory Haskins CC: Rusty Russell CC: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145728.25226.92769.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6c251790dd..d014efbf947 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - if (lowest_mask) + if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_any(lowest_mask) >= nr_cpu_ids) + continue; + } + return 1; } -- cgit v1.2.3-70-g09d2 From a5004278f0525dcb9aa43703ef77bf371ea837cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jul 2009 14:04:49 +0200 Subject: sched: Fix cgroup smp fairness Commit ec4e0e2fe018992d980910db901637c814575914 ("fix inconsistency when redistribute per-cpu tg->cfs_rq shares") broke cgroup smp fairness. In order to avoid starvation of newly placed tasks, we never quite set the share of an empty cpu group-task to 0, but instead we set it as if there's a single NICE-0 task present. If however we actually set this in cfs_rq[cpu]->shares, that means the total shares for that group will be slightly inflated every time we balance, causing the observed unfairness. Fix this by setting cfs_rq[cpu]->shares to 0 but actually setting the effective weight of the related se to the inflated number. 
Signed-off-by: Peter Zijlstra LKML-Reference: <1248696557.6987.1615.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ce1056e9b02..26976cd8be0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1523,13 +1523,18 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - unsigned long shares; unsigned long rq_weight; + unsigned long shares; + int boost = 0; if (!tg->se[cpu]) return; rq_weight = tg->cfs_rq[cpu]->rq_weight; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* * \Sum shares * rq_weight @@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; + unsigned long weight, rq_weight = 0, eff_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; @@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data) * run here it will not get delayed by group starvation. */ weight = tg->cfs_rq[i]->load.weight; + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; + if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; + eff_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + for_each_cpu(i, sched_domain_span(sd)) { + unsigned long sd_rq_weight = rq_weight; + + if (!tg->cfs_rq[i]->rq_weight) + sd_rq_weight = eff_weight; + + update_group_shares_cpu(tg, i, shares, sd_rq_weight); + } return 0; } -- cgit v1.2.3-70-g09d2 From e709715915d69b6a929d77e7652c9c3fea61c317 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2009 15:41:20 +0200 Subject: sched: Optimize unused cgroup configuration When cgroup group scheduling is built in, skip some code paths if we don't have any (but the root) cgroups configured. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 26976cd8be0..ca1f76ba777 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1629,8 +1629,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1640,6 +1646,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1647,6 +1656,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.2.3-70-g09d2 From da19ab510343c6496fe8b8f890091296032025c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:22 -0400 Subject: sched: Check for pushing rt tasks after all scheduling The current method for pushing RT tasks after scheduling only happens after a context switch. But we found cases where a task is set up on a run queue to be pushed but the push never happens because schedule() chooses the same task. This bug was found with the help of Gregory Haskins and the use of ftrace (trace_printk). It took the two of us several days of analyzing the code and the trace output to find this. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090729042526.205923666@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ca1f76ba777..a030d4514cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2839,14 +2839,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.)
*/ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static int finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP int post_schedule = 0; +#ifdef CONFIG_SMP if (current->sched_class->needs_post_schedule) post_schedule = current->sched_class->needs_post_schedule(rq); #endif @@ -2868,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2884,6 +2880,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + return post_schedule; } /** @@ -2894,8 +2892,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); + int post_schedule; + + post_schedule = finish_task_switch(rq, prev); + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif - finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -2908,7 +2913,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +static inline int context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2955,7 +2960,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - finish_task_switch(this_rq(), prev); + return finish_task_switch(this_rq(), prev); } /* @@ -5366,6 +5371,7 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; + int post_schedule = 0; struct rq *rq; int cpu; @@ -5416,15 +5422,25 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { +#ifdef CONFIG_SMP + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif spin_unlock_irq(&rq->lock); + } + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -- cgit v1.2.3-70-g09d2 From c3a2ae3d93c0f10d29c071f599764d00b8de00cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:23 -0400 Subject: sched: Add new prio to cpupri before removing old prio We need to add the new prio to the cpupri accounting before removing the old prio. This is because removing the old prio first will open a race window where the cpu will be removed from pri_active. In this case the cpu will not be visible for RT push and pulls. This could cause a RT task to not migrate appropriately, and create a very large latency. This bug was found with the use of ftrace sched events and trace_printk. 
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090729042526.438281019@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efbf947..0f052fc674d 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /* * If the cpu was currently mapped to a different value, we - * first need to unmap the old value + * need to map it to the new value then remove the old value. + * Note, we must add the new value first, otherwise we risk the + * cpu being cleared from pri_active, and this cpu could be + * missed for a push or pull. */ - if (likely(oldpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - - spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); - cpumask_clear_cpu(cpu, vec->mask); - - spin_unlock_irqrestore(&vec->lock, flags); - } - if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; @@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_unlock_irqrestore(&vec->lock, flags); } + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpumask_clear_cpu(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } *currpri = newpri; } -- cgit v1.2.3-70-g09d2 From 3f029d3c6d62068d59301d90c18dbde8ee402107 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 29 Jul 2009 11:08:47 -0400 Subject: sched: Enhance the pre/post scheduling logic We currently have an explicit "needs_post" vtable method which returns a stack variable for whether we should later run post-schedule. This leads to an awkward exchange of the variable as it bubbles back up out of the context switch. Peter Zijlstra observed that this information could be stored in the run-queue itself instead of handled on the stack. Therefore, we revert to the method of having context_switch return void, and update an internal rq->post_schedule variable when we require further processing. In addition, we fix a race condition where we try to access current->sched_class without holding the rq->lock. This is technically racy, as the sched-class could change out from under us. Instead, we reference the per-rq post_schedule variable with the runqueue unlocked, but with preemption disabled to see if we need to reacquire the rq->lock. Finally, we clean the code up slightly by removing the #ifdef CONFIG_SMP conditionals from the schedule() call, and implement some inline helper functions instead. This patch passes checkpatch, and rt-migrate. 
Signed-off-by: Gregory Haskins Signed-off-by: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 82 +++++++++++++++++++++++++++++++-------------------- kernel/sched_rt.c | 31 +++++++------------ 3 files changed, 61 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c35bc29d2a..195d72d5c10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,7 +1047,6 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); - int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index a030d4514cd..613fee54fc8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -616,6 +616,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -2839,17 +2840,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static int finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; - int post_schedule = 0; - -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2880,10 +2875,44 @@ static int finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } +} + +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else - return post_schedule; +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ } +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2892,14 +2921,14 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); - int post_schedule; - post_schedule = finish_task_switch(rq, prev); + finish_task_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ @@ -2913,7 +2942,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. 
*/ -static inline int +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2960,7 +2989,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - return finish_task_switch(this_rq(), prev); + finish_task_switch(this_rq(), prev); } /* @@ -5371,7 +5400,6 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; - int post_schedule = 0; struct rq *rq; int cpu; @@ -5403,10 +5431,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -5422,25 +5447,17 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ + context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else { -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif + } else spin_unlock_irq(&rq->lock); - } -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + post_schedule(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -9403,6 +9420,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01994e..a8f89bc3e5e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1056,6 +1056,11 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); @@ -1064,6 +1069,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); + return p; } @@ -1262,11 +1273,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_pushable_task(struct rq *rq) { struct task_struct *p; @@ -1466,23 +1472,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } -/* - * assumes rq->lock is held - */ -static int needs_post_schedule_rt(struct rq *rq) -{ - return has_pushable_tasks(rq); -} - static void post_schedule_rt(struct rq *rq) { - /* - * This is only called if needs_post_schedule_rt() indicates that - * we need to push tasks away - */ - spin_lock_irq(&rq->lock); push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); } /* @@ -1758,7 +1750,6 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, - .needs_post_schedule = needs_post_schedule_rt, 
.post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, -- cgit v1.2.3-70-g09d2 From 00aec93d10a051ea64f83eff75d4065a19508ea6 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:23 -0400 Subject: sched: Fully integrate cpus_active_map and root-domain code Reflect "active" cpus in the rq->rd->online field, instead of the online_map. The motivation is that things that use the root-domain code (such as cpupri) only care about cpus classified as "active" anyway. By synchronizing the root-domain state with the active map, we allow several optimizations. For instance, we can remove an extra cpumask_and from the scheduler hotpath by utilizing rq->rd->online (since it is now a cached version of cpu_active_map & rq->rd->span). Signed-off-by: Gregory Haskins Acked-by: Peter Zijlstra Acked-by: Max Krasnyansky Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145723.25226.24493.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 10 +++++++--- kernel/sched_rt.c | 7 ------- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 613fee54fc8..475138c4254 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7927,7 +7927,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bdef9a..49347298487 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1046,17 +1046,21 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_mask) + * hence we need to mask them out (rq->rd->online) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) + +#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) + static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; unsigned int chosen_wakeup_cpu; int this_cpu; + struct rq *task_rq = task_rq(p); /* * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu @@ -1089,10 +1093,10 @@ static int wake_idle(int cpu, struct task_struct *p) for_each_domain(cpu, sd) { if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq(p)->clock, sd))) { + && !task_hot(p, task_rq->clock, sd))) { for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (cpu_active(i) && idle_cpu(i)) { + if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a8f89bc3e5e..13f728ef5b3 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1172,13 +1172,6 @@ static int find_lowest_rq(struct task_struct *task) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return -1; /* No targets found */ - /* - * Only consider CPUs that are usable for migration. - * I guess we might want to change cpupri_find() to ignore those - * in the first place. 
- */ - cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); - /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect -- cgit v1.2.3-70-g09d2 From 8f48894fcc89ddec62e1762f73a0825793e59e91 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 12:25:30 +0200 Subject: sched: Add debug check to task_of() A frequent mistake appears to be to call task_of() on a scheduler entity that is not actually a task, which can result in a wild pointer. Add a check to catch these mistakes. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 20 ++++++++++++++------ kernel/sched_rt.c | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 49347298487..342000b31ad 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 13f728ef5b3..f365e66b3d4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,15 +3,18 @@ * policies) */ +#ifdef CONFIG_RT_GROUP_SCHED + +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif return container_of(rt_se, struct task_struct, rt); } -#ifdef CONFIG_RT_GROUP_SCHED - -#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) - static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return rt_rq->rq; @@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) #define rt_entity_is_task(rt_se) (1) +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return container_of(rt_rq, struct rq, rt); -- cgit v1.2.3-70-g09d2 From 693525e3bea25cf2ee6cf2b862ba7c148e891df2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:56:38 +0200 Subject: sched: Ensure the migration task doesn't go away during use Like sched_migrate_task(), set_cpus_allowed_ptr() should hold onto the migration thread too. 
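The fix below amounts to pinning the migration thread with a reference across the window in which the runqueue lock is dropped. As a rough sketch of that pattern (same names as the hunk that follows; shown only for illustration, the hunk itself is authoritative):

	struct task_struct *mt = rq->migration_thread;

	get_task_struct(mt);		/* pin: the thread could exit once the rq lock is dropped */
	task_rq_unlock(rq, &flags);
	wake_up_process(rq->migration_thread);
	put_task_struct(mt);		/* drop the reference after the wakeup */
	wait_for_completion(&req.done);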
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 475138c4254..7f83be35d65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7078,8 +7078,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; -- cgit v1.2.3-70-g09d2 From bbfa26229a8143889e95e0df4a9d69067ee836cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Aug 2009 14:44:24 +0200 Subject: lockdep: Fix BFS build Fix: kernel/built-in.o: In function `lockdep_stats_show': lockdep_proc.c:(.text+0x48202): undefined reference to `max_bfs_queue_depth' As max_bfs_queue_depth is only available under CONFIG_PROVE_LOCKING=y. Cc: Ming Lei Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9a1bf34d2ff..fba81f16e34 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -411,8 +411,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) max_lockdep_depth); seq_printf(m, " max recursion depth: %11u\n", max_recursion_depth); +#ifdef CONFIG_PROVE_LOCKING seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); +#endif lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); -- cgit v1.2.3-70-g09d2 From bcf08df3b23b3d13bf8c4ad6bd744a6ad30015fb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 19 Apr 2008 12:11:10 +0200 Subject: sched: Fix cpupri build on !CONFIG_SMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This build bug: In file included from kernel/sched.c:1765: kernel/sched_rt.c: In function ‘has_pushable_tasks’: kernel/sched_rt.c:1069: error: ‘struct rt_rq’ has no member named ‘pushable_tasks’ kernel/sched_rt.c: In function ‘pick_next_task_rt’: kernel/sched_rt.c:1084: error: ‘struct rq’ has no member named ‘post_schedule’ Triggers because both pushable_tasks and post_schedule are SMP-only fields. Move pushable_tasks() to the SMP section and #ifdef the post_schedule use. 
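For context, a heavily abridged sketch of why the references fail to build (layout inferred from the errors above, not quoted from the source): both fields only exist when CONFIG_SMP is set, roughly:

	struct rt_rq {
		/* ... */
	#ifdef CONFIG_SMP
		struct plist_head pushable_tasks;
	#endif
	};

	struct rq {
		/* ... */
	#ifdef CONFIG_SMP
		int post_schedule;
	#endif
	};

so any code touching rq->rt.pushable_tasks or rq->post_schedule must itself be confined to CONFIG_SMP builds, which is what the hunks below do.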
Cc: Gregory Haskins Cc: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f365e66b3d4..3d4020a9ba1 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -136,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + #else static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -1064,11 +1069,6 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); @@ -1077,11 +1077,13 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); +#ifdef CONFIG_SMP /* * We detect this state here so that we can avoid taking the RQ * lock again later if there is no need to push */ rq->post_schedule = has_pushable_tasks(rq); +#endif return p; } -- cgit v1.2.3-70-g09d2 From 4f84f4330a11b9eb828bf5af557f4c79c64614a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 15:27:04 +0200 Subject: lockdep: Fix backtraces Truncate stupid -1 entries in backtraces. Signed-off-by: Peter Zijlstra LKML-Reference: <1248096665.15751.8816.camel@twins> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1cedb00e3e7..2f0970297e3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -367,11 +367,21 @@ static int save_trace(struct stack_trace *trace) save_stack_trace(trace); + /* + * Some daft arches put -1 at the end to indicate its a full trace. + * + * this is buggy anyway, since it takes a whole extra entry so a + * complete trace that maxes out the entries provided will be reported + * as incomplete, friggin useless + */ + if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + trace->nr_entries--; + trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; - if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { if (!debug_locks_off_graph_unlock()) return 0; -- cgit v1.2.3-70-g09d2 From 98c33eddaf41d225d99b40f9eedbd0fac4c08c05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:07 +0200 Subject: lockdep: Fix style nits fixes a few comments and whitespaces that annoyed me. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2f0970297e3..4b6cebe8ab3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -843,15 +843,18 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } -/*For good efficiency of modular, we use power of 2*/ +/* + * For good efficiency of modular, we use power of 2 + */ #define MAX_CIRCULAR_QUEUE_SIZE 4096UL #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) -/* The circular_queue and helpers is used to implement the +/* + * The circular_queue and helpers is used to implement the * breadth-first search(BFS)algorithem, by which we can build * the shortest path from the next lock to be acquired to the * previous held lock if there is a circular between them. - * */ + */ struct circular_queue { unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; @@ -907,6 +910,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, struct lock_list *parent) { unsigned long nr; + nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); lock->parent = parent; @@ -916,6 +920,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, static inline unsigned long lock_accessed(struct lock_list *lock) { unsigned long nr; + nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); return test_bit(nr, bfs_accessed); -- cgit v1.2.3-70-g09d2 From f607c6685774811b8112e124f10a053d77015485 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 19:16:29 +0200 Subject: lockdep: Introduce lockdep_assert_held() Add a lockdep helper to validate that we indeed are the owner of a lock. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 ++++++++ kernel/lockdep.c | 33 +++++++++++++++++++++++++++++++++ kernel/sched.c | 2 ++ 3 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 12aabfcb45f..a6d5e5e4d08 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -296,6 +296,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); +#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) + +extern int lock_is_held(struct lockdep_map *lock); + extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); @@ -314,6 +318,8 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) +#define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l)) + #else /* !LOCKDEP */ static inline void lockdep_off(void) @@ -358,6 +364,8 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) +#define lockdep_assert_held(l) do { } while (0) + #endif /* !LOCKDEP */ #ifdef CONFIG_LOCK_STAT diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4b6cebe8ab3..28914a50991 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3059,6 +3059,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) check_chain_key(curr); } +static int __lock_is_held(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + if (curr->held_locks[i].instance == lock) + return 1; + } + + return 0; +} + /* * Check whether we follow the irq-flags state precisely: */ @@ -3160,6 +3173,26 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); +int lock_is_held(struct lockdep_map *lock) +{ + unsigned long flags; + int ret = 0; + + if (unlikely(current->lockdep_recursion)) + return ret; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + ret = __lock_is_held(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(lock_is_held); + void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { current->lockdep_reclaim_gfp = gfp_mask; diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273..2c75f7daa43 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6609,6 +6609,8 @@ int cond_resched_lock(spinlock_t *lock) int resched = should_resched(); int ret = 0; + lockdep_assert_held(lock); + if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) -- cgit v1.2.3-70-g09d2 From bb97a91e2549a7f2df9c21d32542582f549ab3ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 19:15:35 +0200 Subject: lockdep: Deal with many similar locks spin_lock_nest_lock() allows to take many instances of the same class, this can easily lead to overflow of MAX_LOCK_DEPTH. To avoid this overflow, we'll stop accounting instances but start reference counting the class in the held_lock structure. [ We could maintain a list of instances, if we'd move the hlock stuff into __lock_acquired(), but that would require significant modifications to the current code. ] We restrict this mode to spin_lock_nest_lock() only, because it degrades the lockdep quality due to lost of instance. For lockstat this means we don't track lock statistics for any but the first lock in the series. Currently nesting is limited to 11 bits because that was the spare space available in held_lock. This yields a 2048 instances maximium. 
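A hedged example of the kind of caller this is aimed at (parent/child are made-up names; only the spin_lock_nest_lock() pattern is taken from the text above): every child lock is an instance of a single class, all acquired under the same outer lock, so without reference counting each instance would consume its own held_lock slot and quickly overflow MAX_LOCK_DEPTH:

	static void lock_all_children(struct parent *p)
	{
		struct child *c;

		spin_lock(&p->lock);
		list_for_each_entry(c, &p->children, node)
			spin_lock_nest_lock(&c->lock, &p->lock);
	}

With this patch, lockdep records the first child lock as a normal held_lock and simply bumps hlock->references for the following ones, up to the 2048 instances the 11-bit field allows.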
Signed-off-by: Peter Zijlstra Cc: Marcelo Tosatti Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++- kernel/lockdep.c | 89 ++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a6d5e5e4d08..47d42eff612 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -213,10 +213,12 @@ struct held_lock { * interrupt context: */ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ - unsigned int trylock:1; + unsigned int trylock:1; /* 16 bits */ + unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:2; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; + unsigned int references:11; /* 32 bits */ }; /* diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 28914a50991..0bb246e21cd 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2708,13 +2708,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, - struct lockdep_map *nest_lock, unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip, + int references) { struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; unsigned int depth, id; int chain_head = 0; + int class_idx; u64 chain_key; if (!prove_locking) @@ -2762,10 +2764,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; + class_idx = class - lock_classes + 1; + + if (depth) { + hlock = curr->held_locks + depth - 1; + if (hlock->class_idx == class_idx && nest_lock) { + if (hlock->references) + hlock->references++; + else + hlock->references = 2; + + return 1; + } + } + hlock = curr->held_locks + depth; if (DEBUG_LOCKS_WARN_ON(!class)) return 0; - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class_idx; hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; @@ -2773,6 +2789,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = !!hardirqs_off; + hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); @@ -2881,6 +2898,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 1; } +static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +{ + if (hlock->instance == lock) + return 1; + + if (hlock->references) { + struct lock_class *class = lock->class_cache; + + if (!class) + class = look_up_lock_class(lock, 0); + + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + + if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) + return 0; + + if (hlock->class_idx == class - lock_classes + 1) + return 1; + } + + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -2904,7 +2945,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -2923,7 +2964,8 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, 
hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) return 0; } @@ -2962,20 +3004,34 @@ lock_release_non_nested(struct task_struct *curr, */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } return print_unlock_inbalance_bug(curr, lock, ip); found_it: - lock_release_holdtime(hlock); + if (hlock->instance == lock) + lock_release_holdtime(hlock); + + if (hlock->references) { + hlock->references--; + if (hlock->references) { + /* + * We had, and after removing one, still have + * references, the current lock stack is still + * valid. We're done! + */ + return 1; + } + } /* * We have the right lock to unlock, 'hlock' points to it. * Now we remove it from the stack, and add back the other * entries (if any), recalculating the hash along the way: */ + curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -2984,7 +3040,8 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) return 0; } @@ -3014,7 +3071,7 @@ static int lock_release_nested(struct task_struct *curr, /* * Is the unlock non-nested: */ - if (hlock->instance != lock) + if (hlock->instance != lock || hlock->references) return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; @@ -3065,7 +3122,9 @@ static int __lock_is_held(struct lockdep_map *lock) int i; for (i = 0; i < curr->lockdep_depth; i++) { - if (curr->held_locks[i].instance == lock) + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) return 1; } @@ -3148,7 +3207,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip); + irqs_disabled_flags(flags), nest_lock, ip, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -3252,7 +3311,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -3260,6 +3319,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) return; found_it: + if (hlock->instance != lock) + return; + hlock->waittime_stamp = sched_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); @@ -3299,7 +3361,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -3307,6 +3369,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) return; found_it: + if (hlock->instance != lock) + return; + cpu = smp_processor_id(); if (hlock->waittime_stamp) { now = sched_clock(); -- cgit v1.2.3-70-g09d2 From e351b660fddd4df76cc4635f896d311ed0ff3752 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 22 Jul 2009 22:48:09 +0800 Subject: lockdep: Reintroduce generation count to make BFS faster We still can apply DaveM's generation count optimization to BFS, based on the following idea: - before doing each BFS, increase the global generation id by 1 - if one node in the graph has been visited, mark it as 
visited by storing the current global generation id into the node's dep_gen_id field - so we can decide if one node has been visited already, by comparing the node's dep_gen_id with the global generation id. By applying DaveM's generation count optimization to current implementation of BFS, we gain the following advantages: - we save MAX_LOCKDEP_ENTRIES/8 bytes memory; - we remove the bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); in each BFS, which is very time-consuming since MAX_LOCKDEP_ENTRIES may be very large.(16384UL) Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra Cc: "David S. Miller" LKML-Reference: <1248274089-6358-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 1 + kernel/lockdep.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 47d42eff612..9ccf0e286b2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -58,6 +58,7 @@ struct lock_class { struct lockdep_subclass_key *key; unsigned int subclass; + unsigned int dep_gen_id; /* * IRQ/softirq usage tracking bits: diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0bb246e21cd..2b443b5090a 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -861,14 +861,15 @@ struct circular_queue { }; static struct circular_queue lock_cq; -static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; unsigned int max_bfs_queue_depth; +static unsigned int lockdep_dependency_gen_id; + static inline void __cq_init(struct circular_queue *cq) { cq->front = cq->rear = 0; - bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); + lockdep_dependency_gen_id++; } static inline int __cq_empty(struct circular_queue *cq) @@ -914,7 +915,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); lock->parent = parent; - set_bit(nr, bfs_accessed); + lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) @@ -923,7 +924,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); - return test_bit(nr, bfs_accessed); + return lock->class->dep_gen_id == lockdep_dependency_gen_id; } static inline struct lock_list *get_lock_parent(struct lock_list *child) @@ -3604,7 +3605,7 @@ void __init lockdep_info(void) sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024 #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + sizeof(bfs_accessed) + + sizeof(struct circular_queue) #endif ); -- cgit v1.2.3-70-g09d2 From 90629209a020859b67423a6326f3765f220c7f5c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 2 Aug 2009 21:43:36 +0800 Subject: lockdep: Fix memory usage info of BFS The unit is KB, so sizeof(struct circular_queue) should be divided by 1024. 
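For scale (assuming a 64-bit build and the circular_queue layout introduced by the BFS patches above): the element array alone is 4096 * sizeof(unsigned long) = 32768 bytes, so printing sizeof(struct circular_queue) without the division inflates the reported KB figure by roughly 32768 instead of the intended ~32.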
Signed-off-by: Ming Lei Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: Ming Lei Cc: a.p.zijlstra@chello.nl LKML-Reference: <1249220616-7190-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2b443b5090a..0843584aed5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3603,10 +3603,11 @@ void __init lockdep_info(void) sizeof(struct list_head) * CLASSHASH_SIZE + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE) / 1024 + sizeof(struct list_head) * CHAINHASH_SIZE #ifdef CONFIG_PROVE_LOCKING + sizeof(struct circular_queue) #endif + ) / 1024 ); printk(" per task-struct memory footprint: %lu bytes\n", -- cgit v1.2.3-70-g09d2 From 70d715fd0597f18528f389b5ac59102263067744 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 3 Aug 2009 11:48:19 +0900 Subject: posix-timers: Fix oops in clock_nanosleep() with CLOCK_MONOTONIC_RAW Prevent calling do_nanosleep() with clockid CLOCK_MONOTONIC_RAW, it may cause oops, such as NULL pointer dereference. Signed-off-by: Hiroshi Shimamoto Cc: Andrew Morton Cc: Thomas Gleixner Cc: John Stultz Cc: LKML-Reference: <4A764FF3.50607@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/posix-timers.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 052ec4d195c..d089d052c4a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } +static int no_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return -EOPNOTSUPP; +} + /* * Return nonzero if we know a priori this clockid_t value is bogus. */ @@ -254,6 +260,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.3-70-g09d2 From 5b0f437df0a3e374d26ad533eb78fe64744f55a8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 30 Jul 2009 19:00:53 +0200 Subject: workqueues: Improve schedule_work() documentation Two important aspects of the schedule_work() function are not yet documented: - that it is allowed to pass a struct work_struct * to this function that is already on the kernel-global workqueue; - the meaning of its return value. The patch below documents both aspects. Signed-off-by: Bart Van Assche Cc: "Greg Kroah-Hartman" Cc: Andrew Morton LKML-Reference: <200907301900.54202.bart.vanassche@gmail.com> Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0668795d881..3c44b56b0da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -600,7 +600,12 @@ static struct workqueue_struct *keventd_wq __read_mostly; * schedule_work - put work task in global workqueue * @work: job to be done * - * This puts a job in the kernel-global workqueue. + * Returns zero if @work was already on the kernel-global workqueue and + * non-zero otherwise. 
+ * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. */ int schedule_work(struct work_struct *work) { -- cgit v1.2.3-70-g09d2 From cc6db4e60116c1f76577b6850a35ae7de69a95b6 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 31 Jul 2009 16:20:10 -0700 Subject: futex: Correct futex_wait_requeue_pi() commentary The state machine described in the comments wasn't updated with a follow-on fix. Address that and cleanup the corresponding commentary in the function. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner LKML-Reference: <4A737C2A.9090001@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f15..d077201b393 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2102,11 +2102,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue and subsequent unlock - * 3) signal (before or after requeue) - * 4) timeout (before or after requeue) + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout * - * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * If 3, cleanup and return -ERESTARTNOINTR. * * If 2, we may then block on trying to take the rt_mutex and return via: * 5) successful lock @@ -2114,7 +2114,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 7) timeout * 8) other lock acquisition failure * - * If 6, we setup a restart_block with futex_lock_pi() as the function. + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * @@ -2232,14 +2232,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* - * We've already been requeued, but we have no way to - * restart by calling futex_lock_pi() directly. We - * could restart the syscall, but that will look at - * the user space value and return right away. So we - * drop back with EWOULDBLOCK to tell user space that - * "val" has been changed. That's the same what the - * restart of the syscall would do in - * futex_wait_setup(). + * We've already been requeued, but cannot restart by calling + * futex_lock_pi() directly. We could restart this syscall, but + * it would detect that the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart and return + * -EWOULDBLOCK directly. */ ret = -EWOULDBLOCK; } -- cgit v1.2.3-70-g09d2 From 990a55eb25d9698d61352264cc43f3a9c04cce90 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 31 Jul 2009 00:02:06 +0200 Subject: irq: Clean up by removing irqfixup MODULE_PARM_DESC() It's wrong (the parm takes no arguments) and compile-time wiped away anyway (due to !MODULE). 
Signed-off-by: Jiri Slaby LKML-Reference: <1248991326-26267-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- kernel/irq/spurious.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 4d568294de3..114e704760f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str) __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { -- cgit v1.2.3-70-g09d2 From 97fd9ed48ce2b807edc363bef3e817aeeb5cd5e6 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 21 Jul 2009 20:25:05 +0200 Subject: timers: Cache __next_timer_interrupt result Each time a cpu goes to sleep on a NOHZ=y system the timer wheel is searched for the next timer interrupt. It can take quite a few cycles to find the next pending timer. This patch adds a field to tvec_base that caches the result of __next_timer_interrupt. The hit ratio is around 80% on my thinkpad under normal use, on a server I've seen hit ratios from 5% to 95% dependent on the workload. -v2: jiffies wrap fixes Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Cc: john stultz Cc: Venki Pallipadi LKML-Reference: <20090721202505.7d56a079@skybase> Signed-off-by: Ingo Molnar --- kernel/timer.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8..5c1e49ec2f1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -72,6 +72,7 @@ struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; + unsigned long next_timer; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, if (timer_pending(timer)) { detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } else { if (pending_only) @@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_timer_activate(timer); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer) base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer) ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } out: @@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now) unsigned long expires; 
spin_lock(&base->lock); - expires = __next_timer_interrupt(base); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1523,6 +1541,7 @@ static int __cpuinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; return 0; } @@ -1535,6 +1554,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } -- cgit v1.2.3-70-g09d2 From a2551df7ec568d87793d2eea4ca744e86318f205 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: seperate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a seperate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/mm.h | 15 --------------- include/linux/security.h | 17 +++++++++++++++++ kernel/sysctl.c | 7 ++++--- mm/Kconfig | 6 +++--- mm/mmap.c | 3 --- mm/nommu.c | 3 --- security/Kconfig | 16 ++++++++++++++++ security/Makefile | 2 +- security/commoncap.c | 2 +- security/min_addr.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ security/selinux/hooks.c | 2 +- 11 files changed, 92 insertions(+), 30 deletions(-) create mode 100644 security/min_addr.c (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index ba3a7cb1eaa..9a72cc78e6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,8 +34,6 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif -extern unsigned long mmap_min_addr; - #include #include #include @@ -574,19 +572,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, set_page_section(page, pfn_to_section_nr(pfn)); } -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/security.h b/include/linux/security.h index 963a48fc300..7b431155e39 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -28,6 +28,7 @@ #include #include #include +#include /* PAGE_ALIGN */ #include #include #include @@ -95,6 +96,7 @@ extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); 
extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; /* * Values used in the task_security_ops calls */ @@ -147,6 +149,21 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + +extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); /** * struct security_operations - main security structure * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67..58be76017fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bd..fe5f674d7a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd..8101de490c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece..28754c40be9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); diff --git a/security/Kconfig b/security/Kconfig index d23c839038f..9c60c346a91 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,22 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. 
+config LSM_MMAP_MIN_ADDR + int "Low address space for LSM to from user allocation" + depends on SECURITY && SECURITY_SELINUX + default 65535 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need the permission specific to the + systems running LSM. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig diff --git a/security/Makefile b/security/Makefile index c67557cdaa8..b56e7f9ecbc 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,7 +8,7 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo # always enable default capabilities -obj-y += commoncap.o +obj-y += commoncap.o min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o diff --git a/security/commoncap.c b/security/commoncap.c index 3852e943280..fe30751a6cd 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1005,7 +1005,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, { int ret = 0; - if (addr < mmap_min_addr) { + if (addr < dac_mmap_min_addr) { ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO, SECURITY_CAP_AUDIT); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ diff --git a/security/min_addr.c b/security/min_addr.c new file mode 100644 index 00000000000..14cc7b3b8d0 --- /dev/null +++ b/security/min_addr.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +/* amount of vm to protect from userspace access by both DAC and the LSM*/ +unsigned long mmap_min_addr; +/* amount of vm to protect from userspace using CAP_SYS_RAWIO (DAC) */ +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +/* amount of vm to protect from userspace using the LSM = CONFIG_LSM_MMAP_MIN_ADDR */ + +/* + * Update mmap_min_addr = max(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR) + */ +static void update_mmap_min_addr(void) +{ +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + mmap_min_addr = dac_mmap_min_addr; + else + mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + mmap_min_addr = dac_mmap_min_addr; +#endif +} + +/* + * sysctl handler which just sets dac_mmap_min_addr = the new value and then + * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly + */ +int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + update_mmap_min_addr(); + + return ret; +} + +int __init init_mmap_min_addr(void) +{ + update_mmap_min_addr(); + + return 0; +} +pure_initcall(init_mmap_min_addr); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 8a78f584f46..5dee88362e7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3040,7 +3040,7 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, * at bad behaviour/exploit that we always want to get the AVC, even * if DAC would have also denied the operation. 
*/ - if (addr < mmap_min_addr) { + if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); if (rc) -- cgit v1.2.3-70-g09d2 From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 12:02:48 -0400 Subject: ring-buffer: fix check of try_to_discard result The function ring_buffer_discard_commit inversed the code path of the result of try_to_discard. It should skip incrementing the entry counter if try_to_discard succeeded. But instead, it increments the entry conder if it succeeded to discard, and does not increment it if it fails. The result of this bug is that filtering will make the stat counters incorrect. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e..2fd1752f0c8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* -- cgit v1.2.3-70-g09d2 From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 15:26:37 -0400 Subject: ring-buffer: do not disable ring buffer on oops_in_progress The commit: commit e0fdace10e75dac67d906213b780ff1b1a4cc360 Author: David Miller Date: Fri Aug 1 01:11:22 2008 -0700 debug_locks: set oops_in_progress if we will log messages. Otherwise lock debugging messages on runqueue locks can deadlock the system due to the wakeups performed by printk(). Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Will permanently set oops_in_progress on any lockdep failure. When this triggers it will cause any read from the ring buffer to permanently disable the ring buffer (not to mention no locking of printk). This patch removes the check. It keeps the print in NMI which makes sense. This is probably OK, since the ring buffer should not cause something to set oops_in_progress anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2fd1752f0c8..2606cee433d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI. */ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); -- cgit v1.2.3-70-g09d2 From 1bbf20835c4e088667a090ce6523a0f70b62dc76 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 5 Aug 2009 12:05:21 -0700 Subject: rtmutex: Avoid deadlock in rt_mutex_start_proxy_lock() In the event of a lock steal or owner died, rt_mutex_start_proxy_lock() will give the rt_mutex to the waiting task, but it fails to release the wait_lock. This leads to subsequent deadlocks when other tasks try to acquire the rt_mutex. I also removed a few extra blank lines that really spaced this routine out. I must have been high on the \n when I wrote this originally... 
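Sketched out (simplified; the hunk below is authoritative), the corrected lock-steal path becomes roughly:

	spin_lock(&lock->wait_lock);

	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
		/* We got the lock for task. */
		debug_rt_mutex_lock(lock);
		rt_mutex_set_owner(lock, task, 0);
		spin_unlock(&lock->wait_lock);	/* previously missing: the next acquirer of wait_lock spun forever */
		rt_mutex_deadlock_account_lock(lock, task);
		return 1;
	}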
Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A79D7F1.4000405@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a78c5..29bd4baf9e7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { /* We got the lock for task. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); - + spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { /* * Reset the return value. We might have -- cgit v1.2.3-70-g09d2 From af6af30c0fcd77e621638e53ef8b176bca8bd3b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Aug 2009 20:41:04 +0200 Subject: ftrace: Fix perf-tracepoint OOPS Not all tracepoints are created equal, in specific the ftrace tracepoints are created with TRACE_EVENT_FORMAT() which does not generate the needed bits to tie them into perf counters. For those events, don't create the 'id' file and fail ->profile_enable when their ID is specified through other means. Reported-by: Chris Mason Signed-off-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1249497664.5890.4.camel@laptop> [ v2: fix build error in the !CONFIG_EVENT_PROFILE case ] Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 8 +++----- kernel/trace/trace_event_profile.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5c093ffc655..d7cd193c227 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -119,11 +119,9 @@ struct ftrace_event_call { void *filter; void *mod; -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); }; #define MAX_FILTER_PRED 32 diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecf..11ba5bb4ed0 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->id == event_id && event->profile_enable) { ret = event->profile_enable(event); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 23d2972b22d..e75276a49cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, entry = trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id) + if (call->id && call->profile_enable) entry = trace_create_file("id", 0444, call->dir, call, id); -- cgit v1.2.3-70-g09d2 From 0c9e6f639aed490202bbc79214f4495cf4bfde58 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:26:06 +0800 Subject: tracing: Simplify print_graph_cpu() print_graph_cpu() is little over-designed. 
And "log10_all" may be wrong when there are holes in cpu_online_mask: the max online cpu id > cpumask_weight(cpu_online_mask) So change it by using a static column length for the cpu matching nr_cpu_ids number of decimal characters. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <4A6EEE5E.2000001@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index abf7c4ae2c8..e30472da15d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -183,43 +183,19 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } -static inline int log10_cpu(int nb) -{ - if (nb / 100) - return 3; - if (nb / 10) - return 2; - return 1; -} +static int max_bytes_for_cpu; static enum print_line_t print_graph_cpu(struct trace_seq *s, int cpu) { - int i; int ret; - int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " "); - - /* - * Tricky - we space the CPU field according to the max - * number of online CPUs. On a 2-cpu system it would take - * a maximum of 1 digit - on a 128 cpu system it would - * take up to 3 digits: - */ - for (i = 0; i < log10_all - log10_this; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "%d) ", cpu); + ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -919,6 +895,8 @@ static struct tracer graph_trace __read_mostly = { static __init int init_graph_trace(void) { + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + return register_tracer(&graph_trace); } -- cgit v1.2.3-70-g09d2 From 5e5bf483986ad86ad25f25eec5299c86eb2d1c57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 17:11:12 +0200 Subject: tracing/core: Turn ftrace_cpu_disabled into a global var In order to prepare the moving of the function graph tracer insertion helpers from trace.c to trace_functions_graph.c, we need to export the ftrace_cpu_disabled variable. 
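The mechanics are the usual pattern for sharing a per-CPU variable between translation units; a sketch mirroring the hunks below (the last fragment is a hypothetical example of a future caller, not part of this patch):

	/* kernel/trace/trace.c: definition, no longer static */
	DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);

	/* kernel/trace/trace.h: declaration visible to the other trace files */
	DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);

	/* e.g. inside a helper moved to kernel/trace/trace_functions_graph.c */
	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
		return;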
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 38a4a3ee749..b6211d60413 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,7 +89,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 758b0dbed55..c7e92732982 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -519,6 +519,7 @@ extern int DYN_FTRACE_TEST_NAME(void); extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, -- cgit v1.2.3-70-g09d2 From c0a0d0d3f65284c71115a9bb1ed801ee33eeb552 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 17:51:13 +0200 Subject: tracing/core: Make the stack entry helpers global Make the stacktrace event insertion helpers globals. This has two effects: - Prepare for moving the sched events insertion helpers to the sched switch tracer file. - Move some ifdef outside function definitions Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++---------------- kernel/trace/trace.h | 28 +++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6211d60413..d6059a493e7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -866,10 +866,6 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, return event; } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, int skip, int pc); -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc); static inline void __trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, @@ -1003,11 +999,11 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, ip, parent_ip, flags, pc); } +#ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; @@ -1028,12 +1024,10 @@ static void __ftrace_trace_stack(struct trace_array *tr, save_stack_trace(&trace); if (!filter_check_discard(call, entry, tr->buffer, event)) ring_buffer_unlock_commit(tr->buffer, event); -#endif } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; @@ -1041,17 +1035,14 @@ static void ftrace_trace_stack(struct trace_array *tr, __ftrace_trace_stack(tr, flags, skip, pc); } -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { __ftrace_trace_stack(tr, flags, skip, pc); } -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc) +void 
ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -1076,7 +1067,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, tr->buffer, event)) ring_buffer_unlock_commit(tr->buffer, event); -#endif } #ifdef UNUSED @@ -1086,6 +1076,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) } #endif /* UNUSED */ +#endif /* CONFIG_STACKTRACE */ + static void ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c7e92732982..116524d6236 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -489,9 +489,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc); +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc); + +void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, + int pc); + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); +#else +static inline void ftrace_trace_stack(struct trace_array *tr, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_userstack(struct trace_array *tr, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ extern cycle_t ftrace_now(int cpu); -- cgit v1.2.3-70-g09d2 From 82e04af498a85ba425efe77580b7ba08234411df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 18:00:29 +0200 Subject: tracing: Move sched event insertion helpers in the sched switch tracer file The sched events helpers which insert the sched switch and wakeup events into the ring buffer currently reside in trace.c But this file is quite overloaded and the right place for these helpers is in the sched switch tracer file. 
Then move them to trace_functions.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 56 -------------------------------------- kernel/trace/trace_sched_switch.c | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d6059a493e7..1b73acb40e5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1105,62 +1105,6 @@ __trace_special(void *__tr, void *__data, ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); } -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, flags, pc); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); -} - void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a98106dd979..e1285d7b548 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -20,6 +20,34 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static int sched_stopped; + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(tr, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); +} + static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) @@ -49,6 +77,35 @@ probe_sched_switch(struct rq *__rq, struct 
task_struct *prev, local_irq_restore(flags); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(tr, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); +} + static void probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { -- cgit v1.2.3-70-g09d2 From 1a0799a8fef5acc6503f9c5e79b2cd003317826c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 18:59:58 +0200 Subject: tracing/function-graph-tracer: Move graph event insertion helpers in the graph tracer file The function graph events helpers which insert the function entry and return events into the ring buffer currently reside in trace.c But this file is quite overloaded and the right place for these helpers is in the function graph tracer file. Then move them to trace_functions_graph.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 110 ------------------------------- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 122 ++++++++++++++++++++++++++++++++++- kernel/trace/trace_selftest.c | 1 + 4 files changed, 121 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b73acb40e5..0cfd1a62def 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -942,54 +942,6 @@ trace_function(struct trace_array *tr, ring_buffer_unlock_commit(tr->buffer, event); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_entry; - struct ring_buffer_event *event; - struct ftrace_graph_ent_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return 0; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); - if (!event) - return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); - - return 1; -} - -static void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_exit; - struct ring_buffer_event *event; - struct ftrace_graph_ret_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ret = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); -} -#endif - void ftrace(struct trace_array *tr, struct trace_array_cpu *data, 
unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -1129,68 +1081,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int trace_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int ret; - int cpu; - int pc; - - if (!ftrace_trace_task(current)) - return 0; - - if (!ftrace_graph_addr(trace->func)) - return 0; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else { - ret = 0; - } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); - - atomic_dec(&data->disabled); - local_irq_restore(flags); - - return ret; -} - -void trace_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - if (!trace->depth) - clear_tsk_trace_graph(current); - atomic_dec(&data->disabled); - local_irq_restore(flags); -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - /** * trace_vbprintk - write binary msg to tracing buffer * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 116524d6236..9301f1263c5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -471,6 +471,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e30472da15d..f97244a41a4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -/* pid on the last trace processed */ +static struct trace_array *graph_array; /* Add a function return address to the trace stack on thread info.*/ @@ -166,10 +166,121 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +static int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return 0; + + event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(tr->buffer, event); + + return 1; +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (unlikely(!tr)) + return 0; + + if 
(!ftrace_trace_task(current)) + return 0; + + if (!ftrace_graph_addr(trace->func)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +static void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(tr->buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static int graph_trace_init(struct trace_array *tr) { - int ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + int ret; + + graph_array = tr; + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -177,6 +288,11 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 00dd6485bdd..d2cdbabb4ea 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * to detect and recover from possible hangs */ tracing_reset_online_cpus(tr); + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); if (ret) { -- cgit v1.2.3-70-g09d2 From a2ca5e03b6a5a1d401062f0a7f78888cf9e5e3b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 Aug 2009 07:32:21 +0200 Subject: tracing/events: Only define remove_subsystem_dir() if CONFIG_MODULES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we disable modules, we get the following warning in ftrace events file: kernel/trace/trace_events.c:912: attention : ‘remove_subsystem_dir’ defined but not used remove_subystem_dir() is useless if !CONFIG_MODULES, then move it to the appropriate #ifdef section of trace_events.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace_events.c | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff 
--git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 90cf9360e14..70ecb7653b4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -908,32 +908,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return system->entry; } -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; - - if (strcmp(name, TRACE_SYSTEM) == 0) - return; - - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - struct event_filter *filter = system->filter; - - debugfs_remove_recursive(system->entry); - list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); - } - break; - } - } -} - static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *id, @@ -1018,6 +992,32 @@ struct ftrace_module_file_ops { struct file_operations filter; }; +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { -- cgit v1.2.3-70-g09d2 From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Jul 2009 19:19:18 +0200 Subject: ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [] ? rb_advance_reader+0x2e/0xc5 [] warn_slowpath_common+0x77/0x8f [] warn_slowpath_null+0xf/0x11 [] rb_advance_reader+0x2e/0xc5 [] ring_buffer_consume+0xa0/0xd2 [] op_cpu_buffer_read_entry+0x21/0x9e [] ? __find_get_block+0x4b/0x165 [] sync_buffer+0xa5/0x401 [] ? __find_get_block+0x4b/0x165 [] ? wq_sync_buffer+0x0/0x78 [] wq_sync_buffer+0x5b/0x78 [] worker_thread+0x113/0x1ac [] ? autoremove_wake_function+0x0/0x38 [] ? worker_thread+0x0/0x1ac [] kthread+0x88/0x92 [] child_rip+0xa/0x20 [] ? kthread+0x0/0x92 [] ? child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt Cc: Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2606cee433d..d4d3580a894 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. 
Return the padding, and we will release * the current locks, and try again. */ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + if (event) + rb_advance_reader(cpu_buffer); - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); -- cgit v1.2.3-70-g09d2 From 1054598cab8674438675085fae459e960eb10799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 18:06:26 +0200 Subject: perf_counter: Fix double list iteration in per task precise stats Brice Goglin reported this crash with per task precise stats: > I finally managed to test the threaded perfcounter statistics (thanks a > lot for implementing it). I am running 2.6.31-rc5 (with the AMD > magny-cours patches but I don't think they matter here). I am trying to > measure local/remote memory accesses per thread during the well-known > stream benchmark. It's compiled with OpenMP using 16 threads on a > quad-socket quad-core barcelona machine. > > Command line is: > /mnt/scratch/bgoglin/cpunode/linux-2.6.31/tools/perf/perf record -f -s > -e r1000001e0 -e r1000002e0 -e r1000004e0 -e r1000008e0 ./stream > > It seems to work fine with a single -e on the command line > while it crashes when there are at least 2 of them. > It seems to work fine without -s as well. A silly copy-paste resulted in a messed up iteration which would cause the OOPS. Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin LKML-Reference: <1249574786.32113.550.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 199ed477131..673c1aaf733 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1104,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx, __perf_counter_sync_stat(counter, next_counter); counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(next_counter, event_entry); } } -- cgit v1.2.3-70-g09d2 From 9795447f71324d8f14c19ed68b43c883135c3f59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 16:37:10 +0800 Subject: lockdep: Fix file mode of lock_stat /proc/lock_stat is writable. 
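Writing to the file is what clears the recorded statistics, so the proc entry
needs the owner write bit in addition to the read bit; S_IRUSR | S_IWUSR is
mode 0600, as in the hunk below:

	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
		    &proc_lock_stat_operations);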
Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A7BE7B6.10904@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa2d2c..e94caa666db 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void) &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); #endif return 0; -- cgit v1.2.3-70-g09d2 From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Aug 2009 12:49:29 +0200 Subject: ring-buffer: Fix memleak in ring_buffer_free() I noticed oprofile memleaked in linux-2.6 current tree, and tracked this ring-buffer leak. Signed-off-by: Eric Dumazet LKML-Reference: <4A7C06B9.2090302@gmail.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4d3580a894..a330513d96c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); -- cgit v1.2.3-70-g09d2 From 69dd647f969c28d18de77e2153f30d05a1874571 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 6 Aug 2009 15:07:29 -0700 Subject: generic-ipi: fix hotplug_cfd() Use CONFIG_HOTPLUG_CPU, not CONFIG_CPU_HOTPLUG When hot-unpluging a cpu, it will leak memory allocated at cpu hotplug, but only if CPUMASK_OFFSTACK=y, which is default to n. The bug was introduced by 8969a5ede0f9e17da4b943712429aef2c9bcd82b ("generic-ipi: remove kmalloc()"). Signed-off-by: Xiao Guangrong Cc: Ingo Molnar Cc: Jens Axboe Cc: Nick Piggin Cc: Peter Zijlstra Cc: Rusty Russell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d850120..94188b8ecc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_BAD; break; -#ifdef CONFIG_CPU_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: -- cgit v1.2.3-70-g09d2 From 9c8a8228d0827e0d91d28527209988f672f97d28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Aug 2009 15:09:28 -0700 Subject: execve: must clear current->clear_child_tid While looking at Jens Rosenboom bug report (http://lkml.org/lkml/2009/7/27/35) about strange sys_futex call done from a dying "ps" program, we found following problem. clone() syscall has special support for TID of created threads. This support includes two features. One (CLONE_CHILD_SETTID) is to set an integer into user memory with the TID value. One (CLONE_CHILD_CLEARTID) is to clear this same integer once the created thread dies. The integer location is a user provided pointer, provided at clone() time. kernel keeps this pointer value into current->clear_child_tid. At execve() time, we should make sure kernel doesnt keep this user provided pointer, as full user memory is replaced by a new one. 
As glibc fork() actually uses clone() syscall with CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID set, chances are high that we might corrupt user memory in forked processes. Following sequence could happen: 1) bash (or any program) starts a new process, by a fork() call that glibc maps to a clone( ... CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID ...) syscall 2) When new process starts, its current->clear_child_tid is set to a location that has a meaning only in bash (or initial program) context (&THREAD_SELF->tid) 3) This new process does the execve() syscall to start a new program. current->clear_child_tid is left unchanged (a non NULL value) 4) If this new program creates some threads, and initial thread exits, kernel will attempt to clear the integer pointed by current->clear_child_tid from mm_release() : if (tsk->clear_child_tid && !(tsk->flags & PF_SIGNALED) && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ << here >> put_user(0, tidptr); sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } 5) OR : if new program is not multi-threaded, but spied by /proc/pid users (ps command for example), mm_users > 1, and the exiting program could corrupt 4 bytes in a persistent memory area (shm or memory mapped file) If current->clear_child_tid points to a writeable portion of memory of the new program, kernel happily and silently corrupts 4 bytes of memory, with unexpected effects. Fix is straightforward and should not break any sane program. Reported-by: Jens Rosenboom Acked-by: Linus Torvalds Signed-off-by: Eric Dumazet Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Sonny Rao Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ulrich Drepper Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 466531eb92c..021e1138556 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * the value intact in a core dump, and to save the unnecessary * trouble otherwise. Userland only wants this done for a sys_exit. */ - if (tsk->clear_child_tid - && !(tsk->flags & PF_SIGNALED) - && atomic_read(&mm->mm_users) > 1) { - u32 __user * tidptr = tsk->clear_child_tid; + if (tsk->clear_child_tid) { + if (!(tsk->flags & PF_SIGNALED) && + atomic_read(&mm->mm_users) > 1) { + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tsk->clear_child_tid); + sys_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0); + } tsk->clear_child_tid = NULL; - - /* - * We don't check the error code - if userspace has - * not set up a proper pointer then tough luck. - */ - put_user(0, tidptr); - sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } } -- cgit v1.2.3-70-g09d2 From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 09:01:33 -0700 Subject: x86/irq: Fix move_irq_desc() for nodes without ram Don't move it if target node is -1. 
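A target node of -1 means "no node", which is what a node without ram ends up
passing down here, so there is nowhere sensible to migrate the descriptor to;
the guard simply keeps the existing one (this mirrors the hunk below):

	/* those static or target node is -1, do not move them */
	if (desc->irq < NR_IRQS_LEGACY || node == -1)
		return desc;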
Signed-off-by: Yinghai Lu LKML-Reference: <4A785B5D.4070702@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee57bf..3fd30197da2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock: struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - /* those all static, do move them */ - if (desc->irq < NR_IRQS_LEGACY) + /* those static or target node is -1, do not move them */ + if (desc->irq < NR_IRQS_LEGACY || node == -1) return desc; if (desc->node != node) -- cgit v1.2.3-70-g09d2 From 96b2de313b1e0e02aea80ee47df6a2b5cbdf8e13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:09 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. I noticed the same problem also existed for the create_pred() case and added a fix for that. Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746549.6453.29.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf4..1557148be34 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1029,6 +1029,8 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); @@ -1048,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); -- cgit v1.2.3-70-g09d2 From 26528e773ecc74fb1b61b7275f86f761cbb340ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:53 -0500 Subject: tracing/filters: Always free pred on filter_add_subsystem_pred() failure If filter_add_subsystem_pred() fails due to ENOSPC or ENOMEM, the pred doesn't get freed, while as a side effect it does for other errors. Make it so the caller always frees the pred for any error. 
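The resulting convention in replace_preds() is: allocate the pred, try to add
it, and free it on every path that did not take ownership of it. Roughly
(mirroring the hunks below):

	pred = create_pred(elt->op, operand1, operand2);
	if (!pred)
		return -ENOMEM;
	if (call) {
		/* the temporary pred is always freed after a per-event add */
		err = filter_add_pred(ps, call, pred);
		filter_free_pred(pred);
	} else {
		err = filter_add_subsystem_pred(ps, system, pred,
						filter_string);
		/* on success the subsystem filter stores the pred itself */
		if (err)
			filter_free_pred(pred);
	}
	if (err)
		return err;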
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746593.6453.32.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1557148be34..f32dc9d1ea7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, return -ENOSPC; } - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, } replace_filter_string(call->filter, filter_string); } + + filter->preds[filter->n_preds] = pred; + filter->n_preds++; out: return err; } @@ -1034,9 +1034,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; @@ -1055,9 +1058,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; -- cgit v1.2.3-70-g09d2 From fb82ad719831db58e9baa4c67015aae3fe27e7e3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:36 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. This updates his patch for the same problem in the tracing tree (which has changed this code quite substantially). Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746576.6453.30.camel@tropicana> Signed-off-by: Ingo Molnar The original report: create_logical_pred() could sometimes return NULL. It's a static checker complaining rather than problems at runtime... --- kernel/trace/trace_events_filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27c2dbea371..490337abed7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1050,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, pred = create_pred(elt->op, operand1, operand2); add_pred: + if (!pred) + return -ENOMEM; if (call) err = filter_add_pred(ps, call, pred, false); else -- cgit v1.2.3-70-g09d2 From 42c2c8c854a716b05882a122632ddcd6dbe108f1 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 6 Aug 2009 15:58:11 -0700 Subject: printk: Fix "printk: Enable the use of more than one CON_BOOT (early console)" Don't return when we find the first bootconsole - it can leave other bootconsoles still installed, and they can be used and cause problems later (if they are in the init section, and eventually released), and cause problems. Make sure we remove all of them. 
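Concretely, in disable_boot_consoles() the early

	return unregister_console(con);

becomes a plain

	unregister_console(con);

so the loop keeps walking the console list and unregisters every console that
still has CON_BOOT set, instead of stopping at the first one it finds.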
Signed-off-by: Sonic Zhang Signed-off-by: Robin Getz Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e0daaf57985..e10d193a833 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1352,7 +1352,7 @@ static int __init disable_boot_consoles(void) if (con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", con->name, con->index); - return unregister_console(con); + unregister_console(con); } } return 0; -- cgit v1.2.3-70-g09d2 From 17d42c1c497aa54952b9e58c1502a46f0df40315 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 6 Aug 2009 16:03:30 -0700 Subject: posix_cpu_timers_exit_group(): Do not use thread_group_cputimer() When the process exits we don't have to run new cputimer nor use running one (as it not accounts when tsk->exit_state != 0) to get process CPU times. As there is only one thread we can just use CPU times fields from task and signal structs. Signed-off-by: Stanislaw Gruszka Cc: Peter Zijlstra Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b..e33a21cb940 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct task_cputime cputime; + struct signal_struct *const sig = tsk->signal; - thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, - cputime.utime, cputime.stime, cputime.sum_exec_runtime); + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), + tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -- cgit v1.2.3-70-g09d2 From c36ba80ea01d0aecb652c26799a912e760ce8981 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 6 Aug 2009 21:46:03 +0200 Subject: irq: Remove superfluous NULL pointer check in check_irq_resend() This takes care of the following entry from Dan's list: kernel/irq/resend.c +73 check_irq_resend(17) warning: variable derefenced before check 'desc->chip' Reported-by: Dan Carpenter Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Jonathan Corbet Cc: Eugene Teo Cc: Julia Lawall LKML-Reference: <200908062146.03638.bzolnier@gmail.com> Signed-off-by: Ingo Molnar --- kernel/irq/resend.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 89c7117acf2..090c3763f3a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip || !desc->chip->retrigger || - !desc->chip->retrigger(irq)) { + if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); -- cgit v1.2.3-70-g09d2 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter 
integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 110 ++++++++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 6 +-- 2 files changed, 88 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553c61e..fec71f8dbc4 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -144,6 +144,9 @@ #undef TP_fast_assign #define TP_fast_assign(args...) args +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ @@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_PROFILE + +/* + * Generate the functions needed for tracepoint perf_counter support. + * + * static void ftrace_profile_(proto) + * { + * extern void perf_tpcounter_event(int, u64, u64); + * u64 __addr = 0, __count = 1; + * + * <-- here we expand the TP_perf_assign() macro + * + * perf_tpcounter_event(event_.id, __addr, __count); + * } + * + * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) + * { + * int ret = 0; + * + * if (!atomic_inc_return(&event_call->profile_count)) + * ret = register_trace_(ftrace_profile_); + * + * return ret; + * } + * + * static void ftrace_profile_disable_(struct ftrace_event_call *event_call) + * { + * if (atomic_add_negative(-1, &event->call->profile_count)) + * unregister_trace_(ftrace_profile_); + * } + * + */ + +#undef TP_fast_assign +#define TP_fast_assign(args...) + +#undef TP_perf_assign +#define TP_perf_assign(args...) args + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int, u64, u64); \ + u64 __addr = 0, __count = 1; \ + { assign; } \ + perf_tpcounter_event(event_##call.id, __addr, __count); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&event_call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + +#endif + /* * Stage 4 of the trace events. * @@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call( \ #define TP_FMT(fmt, args...) 
fmt "\n", ##args #ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&event_call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ -{ \ - if (atomic_add_negative(-1, &event_call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} #define _TRACE_PROFILE_INIT(call) \ .profile_count = ATOMIC_INIT(-1), \ @@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ .profile_disable = ftrace_profile_disable_##call, #else -#define _TRACE_PROFILE(call, proto, args) #define _TRACE_PROFILE_INIT(call) #endif @@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aaf733..52eb4b68d34 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3-70-g09d2 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. 
Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! :-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- include/linux/perf_counter.h | 9 ++- include/trace/ftrace.h | 130 ++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 18 +++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 -- tools/perf/builtin-record.c | 1 + 7 files changed, 126 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193c227..a81170de7f6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6ef72d..a67dd5c5b6d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8dbc4..7fb16d90e7b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. 
* - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = 
ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b68d34..868102172aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39b9d8..c22b40f8f57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc78..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6da09928130..90c98082af1 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + attr->mmap = track; attr->comm = track; attr->inherit = (cpu < 0) && inherit; -- cgit v1.2.3-70-g09d2 From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 09:29:32 +0200 Subject: perf_counter: Fix software counters for fast moving event sources Reimplement the software counters to deal with fast moving event sources (such as tracepoints). This means being able to generate multiple overflows from a single 'event' as well as support throttling. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 164 +++++++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 868102172aa..615440ab929 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. 
+ */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) - goto again; + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; - delta = now - prev; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); + return nr; } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; + u64 overflow; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - } + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - hwc->last_period = period; - } + if (hwc->interrupts == MAX_INTERRUPTS) + return; - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + } } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. + * Nothing to do, we already reset hwc->interrupts. 
*/ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); +} - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + atomic64_add(nr, &counter->count); - return ret; -} + if (!hwc->sample_period) + return; -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - data->period = counter->hw.last_period; + if (!data->regs) + return; - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, data)) - /* soft-disable the counter */ - ; + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - - if (counter->hw.sample_period && !neg && data->regs) - perf_swcounter_overflow(counter, nmi, data); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, @@ -3575,26 +3560,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; +/* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + /* * Software counter: cpu wall time clock */ -- cgit v1.2.3-70-g09d2 From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite that the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function Then, initialize it to NULL and always check if it's not NULL before dereference it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440ab929..117622cb73a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); -- cgit v1.2.3-70-g09d2 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. 
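For example, after the rename an event source attaches its payload to a sample the same way the tracepoint path does; the sketch below only shows the renamed type and the field wiring (the variable names and surrounding context are illustrative, not part of the patch):

  /* Illustrative fragment: wrap an opaque record in the renamed
   * perf_raw_record and hang it off perf_sample_data::raw, as
   * perf_tpcounter_event() does after this change. */
  struct perf_raw_record raw = {
          .size = entry_size,     /* number of bytes at ->data */
          .data = record,         /* the raw sample payload */
  };

  struct perf_sample_data data = {
          .regs = get_irq_regs(),
          .addr = addr,
          .raw  = &raw,           /* this field was ->private before */
  };
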
Reported-by; Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c5b6d..2aabe43c1d0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622cb73a..00231054041 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v1.2.3-70-g09d2 From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2009 19:49:01 +0200 Subject: perf_counter: Fix a race on perf_counter_ctx While extending perfcounters with BTS hw-tracing, Markus Metzger managed to trigger this warning: [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b() triggers because commit 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full task tracing) removed clearing of tsk->perf_counter_ctxp out from under ctx->lock which introduced a race (against perf_lock_task_context). 
Move it back and deal with the exit notification by explicitly passing along the former task context. Reported-by: Markus T Metzger Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1249667341.17467.5.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 00231054041..546e62d6294 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter, */ struct perf_task_event { - struct task_struct *task; + struct task_struct *task; + struct perf_counter_context *task_ctx; struct { struct perf_event_header header; @@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx, static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_counter_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); if (ctx) perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, int new) +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) { struct perf_task_event task_event; @@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new) return; task_event = (struct perf_task_event){ - .task = task, - .event = { + .task = task, + .task_ctx = task_ctx, + .event = { .header = { .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, @@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new) void perf_counter_fork(struct task_struct *task) { - perf_counter_task(task, 1); + perf_counter_task(task, NULL, 1); } /* @@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child) unsigned long flags; if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, 0); + perf_counter_task(child, NULL, 0); return; } @@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child) * won't get any samples after PERF_EVENT_EXIT. We can however still * get a few PERF_EVENT_READ events. */ - perf_counter_task(child, 0); - - child->perf_counter_ctxp = NULL; + perf_counter_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: -- cgit v1.2.3-70-g09d2 From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 9 Aug 2009 15:34:39 -0700 Subject: futex: Update futex_q lock_ptr on requeue proxy lock futex_requeue() can acquire the lock on behalf of a waiter early on or during the requeue loop if it is uncontended or in the event of a lock steal or owner died. 
On wakeup, the waiter (in futex_wait_requeue_pi()) cleans up the pi_state owner using the lock_ptr to protect against concurrent access to the pi_state. The pi_state is hung off futex_q's on the requeue target futex hash bucket so the lock_ptr needs to be updated accordingly. The problem manifested by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner->pid. With this patch, the pi_state is properly guarded against concurrent access via the requeue target hb lock. The astute reviewer may notice that there is a window of time between when futex_requeue() unlocks the hb locks and when futex_wait_requeue_pi() will acquire hb2->lock. During this time the pi_state and uval are not in sync with the underlying rtmutex owner (but the uval does indicate there are waiters, so no atomic changes will occur in userspace). However, this is not a problem. Should a contending thread enter lookup_pi_state() and acquire hb2->lock before the ownership is fixed up, it will find the pi_state hung off a waiter's (possibly the pending owner's) futex_q and block on the rtmutex. Once futex_wait_requeue_pi() fixes up the owner, it will also move the pi_state from the old owner's task->pi_state_list to its own. v3: Fix plist lock name for application to mainline (rather than -rt) Compile tested against tip/v2.6.31-rc5. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7F4EFF.6090903@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f15..8cc3ee1363a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; + q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + wake_up_state(q->task, TASK_NORMAL); } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1282,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. 
*/ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.2.3-70-g09d2 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ include/trace/ftrace.h | 3 ++- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d0..a9d823a93fe 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d90e7b..7167b9b97da 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d6294..5229d1666fa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v1.2.3-70-g09d2 From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:20:12 +0200 Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data Raw tracepoint data contains various kernel internals and data from other users, so restrict this to CAP_SYS_ADMIN. 
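As an illustration (not part of the patch), an unprivileged counter configured like the hypothetical attribute below is now refused with -EPERM by the check added here, because it asks for raw tracepoint data:

  struct perf_counter_attr attr = {
          .type        = PERF_TYPE_TRACEPOINT,
          .config      = event_id,        /* hypothetical tracepoint id */
          .sample_type = PERF_SAMPLE_RAW, /* trips the new capability check */
  };
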
Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896452.17467.75.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5229d1666fa..b0b20a07f39 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (ftrace_profile_enable(counter->attr.config)) return NULL; -- cgit v1.2.3-70-g09d2 From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- kernel/wait.c | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4c..cf3c2f5dba5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275c..c4bd3d825f3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.2.3-70-g09d2 From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Mon, 10 Aug 2009 18:31:42 +0530 Subject: futex: Fix compat_futex to be same as futex for REQUEUE_PI Need to add the REQUEUE_PI checks to the compat_sys_futex API as well to ensure 32 bit requeue's work fine on a 64 bit system. 
Patch is against latest tip Signed-off-by: Dinakar Guniguntala Cc: Darren Hart LKML-Reference: <20090810130142.GA23619@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b9ee2..235716556bf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add_safe(ktime_get(), t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3-70-g09d2 From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 7 Aug 2009 15:20:48 -0700 Subject: futex: Fix handling of bad requeue syscall pairing If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call other the futex_wait_requeue_pi(), the q.rt_waiter may be null. If so, this will result in an oops from the following call graph: futex_requeue() rt_mutex_start_proxy_lock() task_blocks_on_rt_mutex() waiter->task dereference OOPS We currently WARN_ON() if this is detected, clearly this is inadequate. If we detect a mispairing in futex_requeue(), bail out, seding -EINVAL to user-space. V2: Fix parenthesis warnings. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: John Kacur Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7CA8C0.7010809@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8cc3ee1363a..e18cfbdc719 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1256,8 +1256,15 @@ retry_private: if (!match_futex(&this->key, &key1)) continue; - WARN_ON(!requeue_pi && this->rt_waiter); - WARN_ON(requeue_pi && !this->rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter)) { + ret = -EINVAL; + break; + } /* * Wake nr_wake waiters. For requeue_pi, if we acquired the -- cgit v1.2.3-70-g09d2 From 7770841e63730d62928b0879498064e9614b2ce0 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 7 Aug 2009 18:53:21 +0800 Subject: tracing: Rename set_tracer_flags()'s local variable trace_flags set_tracer_flags() have a local variable named trace_flags which has the same name than a global one in the same scope. This leads to confusion, using tracer_flags should be better by its meaning. Changelog: v1->v2: Simplified another patch in this patchset, no change in this patch. 
Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e793cda91dd..8ac204360a3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2118,23 +2118,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* Try to assign a tracer specific option */ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { - struct tracer_flags *trace_flags = trace->flags; + struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int ret = 0, i = 0; int len; - for (i = 0; trace_flags->opts[i].name; i++) { - opts = &trace_flags->opts[i]; + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; len = strlen(opts->name); if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(trace_flags->val, + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); break; } } /* Not found */ - if (!trace_flags->opts[i].name) + if (!tracer_flags->opts[i].name) return -EINVAL; /* Refused to handle */ @@ -2142,9 +2142,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return ret; if (neg) - trace_flags->val &= ~opts->bit; + tracer_flags->val &= ~opts->bit; else - trace_flags->val |= opts->bit; + tracer_flags->val |= opts->bit; return 0; } -- cgit v1.2.3-70-g09d2 From 066e0378c23f0a3db730893f6a041e4a3922a385 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:23 -0400 Subject: tracing: Call arch_init_ftrace_syscalls at boot Call arch_init_ftrace_syscalls at boot, so we can determine early the set of syscalls for the syscall trace events. 
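With the metadata table filled in from an initcall, code that runs later can resolve syscall names right away; a minimal sketch of such a lookup (the concrete syscall name is only an example):

  static int example_resolve(void)
  {
          struct syscall_metadata *meta;
          int num;

          num = syscall_name_to_nr((char *)"sys_open");   /* example name */
          if (num < 0)
                  return -ENOSYS;         /* not in this arch's table */

          meta = syscall_nr_to_meta(num);
          return meta ? 0 : -ENOSYS;
  }
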
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 15 ++++----------- include/trace/syscall.h | 1 - kernel/trace/trace_syscalls.c | 1 - 3 files changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index afb31d72618..0d93d409b8d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -516,31 +516,24 @@ int syscall_name_to_nr(char *name) return -1; } -void arch_init_ftrace_syscalls(void) +static int __init arch_init_ftrace_syscalls(void) { int i; struct syscall_metadata *meta; unsigned long **psys_syscall_table = &sys_call_table; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto end; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * FTRACE_SYSCALL_MAX, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return; + return -ENOMEM; } for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } - return; - - /* Paranoid: avoid overflow */ -end: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 8cfe515cbc4..c55fcce4fbb 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -19,7 +19,6 @@ struct syscall_metadata { }; #ifdef CONFIG_FTRACE_SYSCALLS -extern void arch_init_ftrace_syscalls(void); extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern void start_ftrace_syscalls(void); extern void stop_ftrace_syscalls(void); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e579645ac8..08aed439fea 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,7 +106,6 @@ void start_ftrace_syscalls(void) if (++refcount != 1) goto unlock; - arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { -- cgit v1.2.3-70-g09d2 From a871bd33a6c0bc86fb47cd02ea2650dd43d3d95f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:31 -0400 Subject: tracing: Add syscall tracepoints add two tracepoints in syscall exit and entry path, conditioned on TIF_SYSCALL_FTRACE. Supports the syscall trace event code. 
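The register/unregister pair generated by the DECLARE_TRACE_WITH_CALLBACK() additions in this patch can be used like any other tracepoint; a hedged sketch of a probe (probe name and body are assumptions):

  /* Hypothetical probe; the prototype matches
   * TP_PROTO(struct pt_regs *regs, long id). */
  static void my_syscall_enter_probe(struct pt_regs *regs, long id)
  {
          /* runs on entry of every syscall made by TIF_SYSCALL_FTRACE tasks */
  }

  /* The first registration also runs syscall_regfunc(), which sets
   * TIF_SYSCALL_FTRACE on all tasks; the last unregistration clears it. */
  register_trace_syscall_enter(my_syscall_enter_probe);
  unregister_trace_syscall_enter(my_syscall_enter_probe);
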
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 7 +++++-- include/trace/syscall.h | 20 ++++++++++++++++++++ kernel/tracepoint.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c1..34dd6f15185 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -37,6 +37,9 @@ #include +DEFINE_TRACE(syscall_enter); +DEFINE_TRACE(syscall_exit); + #include "tls.h" enum x86_regset { @@ -1498,7 +1501,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) ret = -1L; if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + trace_syscall_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1524,7 +1527,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + trace_syscall_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/include/trace/syscall.h b/include/trace/syscall.h index c55fcce4fbb..3951d774de1 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -1,8 +1,28 @@ #ifndef _TRACE_SYSCALL_H #define _TRACE_SYSCALL_H +#include + #include + +extern void syscall_regfunc(void); +extern void syscall_unregfunc(void); + +DECLARE_TRACE_WITH_CALLBACK(syscall_enter, + TP_PROTO(struct pt_regs *regs, long id), + TP_ARGS(regs, id), + syscall_regfunc, + syscall_unregfunc +); + +DECLARE_TRACE_WITH_CALLBACK(syscall_exit, + TP_PROTO(struct pt_regs *regs, long ret), + TP_ARGS(regs, ret), + syscall_regfunc, + syscall_unregfunc +); + /* * A syscall entry in the ftrace syscalls array. 
* diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1ef5d3a601c..070a42bb892 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -24,6 +24,7 @@ #include #include #include +#include extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -577,3 +578,40 @@ static int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ + +static DEFINE_MUTEX(regfunc_mutex); +static int sys_tracepoint_refcount; + +void syscall_regfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + mutex_lock(®func_mutex); + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + sys_tracepoint_refcount++; + mutex_unlock(®func_mutex); +} + +void syscall_unregfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + mutex_lock(®func_mutex); + sys_tracepoint_refcount--; + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + mutex_unlock(®func_mutex); +} -- cgit v1.2.3-70-g09d2 From f744bd576a827c5b02e756b81fc2578edf8179b8 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:39 -0400 Subject: tracing: Raw_init() bailout in trace event register fail case Allow the return value of raw_init() trace event callback to bail us out of creating a trace event file, in case we fail to register our event. Also, we plan to return -ENOSYS for syscall events that don't match any syscalls listed in our arch tracing syscall table, we don't want to warn in that case, we just want this event to be invisible in debugfs and ignored. 
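A hedged sketch of what such a raw_init() callback looks like for a syscall event (helper and event names are placeholders; the real generating macros appear later in this series):

  static int init_enter_example(void)
  {
          int num = syscall_name_to_nr((char *)"sys_example");

          if (num < 0)
                  return -ENOSYS; /* unknown on this arch: skipped quietly,
                                   * no warning and no debugfs entry */

          /* normal event registration would continue here */
          return 0;
  }
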
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e0cbede9678..f95f8470dd3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -925,15 +925,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); - if (call->raw_init) { - ret = call->raw_init(); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } - } - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -1058,6 +1049,7 @@ static void trace_module_add_events(struct module *mod) struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; + int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1073,7 +1065,15 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; - + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "point events/%s\n", call->name); + continue; + } + } /* * This module has events, create file ops for this module * if not already done. @@ -1225,6 +1225,15 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "point events/%s\n", call->name); + continue; + } + } list_add(&call->list, &ftrace_events); event_create_dir(call, d_events, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, -- cgit v1.2.3-70-g09d2 From 69fd4f0eb2ececbf8ade55e31a933e174965745e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:44 -0400 Subject: tracing: Add ftrace_event_call void * 'data' field add an optional void * pointer to 'ftrace_event_call' that is passed in for regfunc and unregfunc. This prepares for syscall tracepoints creation by passing the name of the syscall we want to trace and then retrieve its number through our arch syscall table. 
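A minimal sketch of how the new field is meant to be used (event and callback names are illustrative; the syscall events added later in this series follow this pattern):

  static int reg_my_event(void *ptr)
  {
          char *name = ptr;       /* handed in from call->data */

          /* e.g. resolve "sys_open" to a syscall number here */
          return 0;
  }

  static void unreg_my_event(void *ptr)
  {
  }

  static struct ftrace_event_call my_event_call = {
          .name      = "my_event",
          .regfunc   = reg_my_event,      /* now takes void * */
          .unregfunc = unreg_my_event,
          .data      = "sys_open",        /* passed to both callbacks */
  };
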
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 5 +++-- include/trace/ftrace.h | 4 ++-- kernel/trace/trace_events.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index ac8c6f8cf24..8544f121d9f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -112,8 +112,8 @@ struct ftrace_event_call { struct dentry *dir; struct trace_event *event; int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); + int (*regfunc)(void *); + void (*unregfunc)(void *); int id; int (*raw_init)(void); int (*show_format)(struct trace_seq *s); @@ -122,6 +122,7 @@ struct ftrace_event_call { int filter_active; struct event_filter *filter; void *mod; + void *data; atomic_t profile_count; int (*profile_enable)(struct ftrace_event_call *); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 25d3b02a06f..46d81b5e861 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -568,7 +568,7 @@ static void ftrace_raw_event_##call(proto) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ } \ \ -static int ftrace_raw_reg_event_##call(void) \ +static int ftrace_raw_reg_event_##call(void *ptr) \ { \ int ret; \ \ @@ -579,7 +579,7 @@ static int ftrace_raw_reg_event_##call(void) \ return ret; \ } \ \ -static void ftrace_raw_unreg_event_##call(void) \ +static void ftrace_raw_unreg_event_##call(void *ptr) \ { \ unregister_trace_##call(ftrace_raw_event_##call); \ } \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f95f8470dd3..1d289e2d669 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -86,14 +86,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(); + call->unregfunc(call->data); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(); + call->regfunc(call->data); } break; } -- cgit v1.2.3-70-g09d2 From fb34a08c3469b2be9eae626ccb96476b4687b810 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:47 -0400 Subject: tracing: Add trace events for each syscall entry/exit Layer Frederic's syscall tracer on tracepoints. We create trace events via hooking into the SYSCALL_DEFINE macros. This allows us to individually toggle syscall entry and exit points on/off. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- include/linux/syscalls.h | 61 +++++++++++++- include/trace/syscall.h | 18 ++--- kernel/trace/trace_syscalls.c | 183 +++++++++++++++++++++--------------------- 3 files changed, 159 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 80de7003d8c..5e5b4d33a31 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -64,6 +64,7 @@ struct perf_counter_attr; #include #include #include +#include #include #include #include @@ -112,6 +113,59 @@ struct perf_counter_attr; #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) #define __SC_STR_TDECL6(t, a, ...) 
#t, __SC_STR_TDECL5(__VA_ARGS__) + +#define SYSCALL_TRACE_ENTER_EVENT(sname) \ + static struct ftrace_event_call event_enter_##sname; \ + static int init_enter_##sname(void) \ + { \ + int num; \ + num = syscall_name_to_nr("sys"#sname); \ + if (num < 0) \ + return -ENOSYS; \ + register_ftrace_event(&event_syscall_enter); \ + INIT_LIST_HEAD(&event_enter_##sname.fields); \ + init_preds(&event_enter_##sname); \ + return 0; \ + } \ + static struct ftrace_event_call __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_events"))) \ + event_enter_##sname = { \ + .name = "sys_enter"#sname, \ + .system = "syscalls", \ + .event = &event_syscall_enter, \ + .raw_init = init_enter_##sname, \ + .regfunc = reg_event_syscall_enter, \ + .unregfunc = unreg_event_syscall_enter, \ + .data = "sys"#sname, \ + } + +#define SYSCALL_TRACE_EXIT_EVENT(sname) \ + static struct ftrace_event_call event_exit_##sname; \ + static int init_exit_##sname(void) \ + { \ + int num; \ + num = syscall_name_to_nr("sys"#sname); \ + if (num < 0) \ + return -ENOSYS; \ + register_ftrace_event(&event_syscall_exit); \ + INIT_LIST_HEAD(&event_exit_##sname.fields); \ + init_preds(&event_exit_##sname); \ + return 0; \ + } \ + static struct ftrace_event_call __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_events"))) \ + event_exit_##sname = { \ + .name = "sys_exit"#sname, \ + .system = "syscalls", \ + .event = &event_syscall_exit, \ + .raw_init = init_exit_##sname, \ + .regfunc = reg_event_syscall_exit, \ + .unregfunc = unreg_event_syscall_exit, \ + .data = "sys"#sname, \ + } + #define SYSCALL_METADATA(sname, nb) \ static const struct syscall_metadata __used \ __attribute__((__aligned__(4))) \ @@ -121,7 +175,9 @@ struct perf_counter_attr; .nb_args = nb, \ .types = types_##sname, \ .args = args_##sname, \ - } + }; \ + SYSCALL_TRACE_ENTER_EVENT(sname); \ + SYSCALL_TRACE_EXIT_EVENT(sname); #define SYSCALL_DEFINE0(sname) \ static const struct syscall_metadata __used \ @@ -131,8 +187,9 @@ struct perf_counter_attr; .name = "sys_"#sname, \ .nb_args = 0, \ }; \ + SYSCALL_TRACE_ENTER_EVENT(_##sname); \ + SYSCALL_TRACE_EXIT_EVENT(_##sname); \ asmlinkage long sys_##sname(void) - #else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) #endif diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 3951d774de1..73fb8b4a995 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -2,6 +2,8 @@ #define _TRACE_SYSCALL_H #include +#include +#include #include @@ -40,15 +42,13 @@ struct syscall_metadata { #ifdef CONFIG_FTRACE_SYSCALLS extern struct syscall_metadata *syscall_nr_to_meta(int nr); -extern void start_ftrace_syscalls(void); -extern void stop_ftrace_syscalls(void); -extern void ftrace_syscall_enter(struct pt_regs *regs); -extern void ftrace_syscall_exit(struct pt_regs *regs); -#else -static inline void start_ftrace_syscalls(void) { } -static inline void stop_ftrace_syscalls(void) { } -static inline void ftrace_syscall_enter(struct pt_regs *regs) { } -static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +extern int syscall_name_to_nr(char *name); +extern struct trace_event event_syscall_enter; +extern struct trace_event event_syscall_exit; +extern int reg_event_syscall_enter(void *ptr); +extern void unreg_event_syscall_enter(void *ptr); +extern int reg_event_syscall_exit(void *ptr); +extern void unreg_event_syscall_exit(void *ptr); #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 
08aed439fea..c7ae25ee95d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,15 +1,16 @@ #include #include +#include #include #include "trace_output.h" #include "trace.h" -/* Keep a counter of the syscall tracing users */ -static int refcount; - -/* Prevent from races on thread flags toggling */ static DEFINE_MUTEX(syscall_trace_lock); +static int sys_refcount_enter; +static int sys_refcount_exit; +static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); /* Option to display the parameters types */ enum { @@ -95,53 +96,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -void start_ftrace_syscalls(void) -{ - unsigned long flags; - struct task_struct *g, *t; - - mutex_lock(&syscall_trace_lock); - - /* Don't enable the flag on the tasks twice */ - if (++refcount != 1) - goto unlock; - - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); - - read_unlock_irqrestore(&tasklist_lock, flags); - -unlock: - mutex_unlock(&syscall_trace_lock); -} - -void stop_ftrace_syscalls(void) -{ - unsigned long flags; - struct task_struct *g, *t; - - mutex_lock(&syscall_trace_lock); - - /* There are perhaps still some users */ - if (--refcount) - goto unlock; - - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); - - read_unlock_irqrestore(&tasklist_lock, flags); - -unlock: - mutex_unlock(&syscall_trace_lock); -} - -void ftrace_syscall_enter(struct pt_regs *regs) +void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -150,6 +105,8 @@ void ftrace_syscall_enter(struct pt_regs *regs) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_enter_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) @@ -170,7 +127,7 @@ void ftrace_syscall_enter(struct pt_regs *regs) trace_wake_up(); } -void ftrace_syscall_exit(struct pt_regs *regs) +void ftrace_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -178,6 +135,8 @@ void ftrace_syscall_exit(struct pt_regs *regs) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_exit_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) @@ -196,54 +155,94 @@ void ftrace_syscall_exit(struct pt_regs *regs) trace_wake_up(); } -static int init_syscall_tracer(struct trace_array *tr) +int reg_event_syscall_enter(void *ptr) { - start_ftrace_syscalls(); - - return 0; + int ret = 0; + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_enter) + ret = register_trace_syscall_enter(ftrace_syscall_enter); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_enter_syscalls); + sys_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; } -static void reset_syscall_tracer(struct trace_array *tr) +void unreg_event_syscall_enter(void *ptr) { - stop_ftrace_syscalls(); - tracing_reset_online_cpus(tr); -} - -static struct trace_event syscall_enter_event = { - .type = 
TRACE_SYSCALL_ENTER, - .trace = print_syscall_enter, -}; - -static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, - .trace = print_syscall_exit, -}; + int num; + char *name; -static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", - .init = init_syscall_tracer, - .reset = reset_syscall_tracer, - .flags = &syscalls_flags, -}; + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_enter--; + clear_bit(num, enabled_enter_syscalls); + if (!sys_refcount_enter) + unregister_trace_syscall_enter(ftrace_syscall_enter); + mutex_unlock(&syscall_trace_lock); +} -__init int register_ftrace_syscalls(void) +int reg_event_syscall_exit(void *ptr) { - int ret; - - ret = register_ftrace_event(&syscall_enter_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_enter_event.type); - WARN_ON_ONCE(1); + int ret = 0; + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_exit) + ret = register_trace_syscall_exit(ftrace_syscall_exit); + if (ret) { + pr_info("event trace: Could not activate" + "syscall exit trace point"); + } else { + set_bit(num, enabled_exit_syscalls); + sys_refcount_exit++; } + mutex_unlock(&syscall_trace_lock); + return ret; +} - ret = register_ftrace_event(&syscall_exit_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_exit_event.type); - WARN_ON_ONCE(1); - } +void unreg_event_syscall_exit(void *ptr) +{ + int num; + char *name; - return register_tracer(&syscall_tracer); + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_exit--; + clear_bit(num, enabled_exit_syscalls); + if (!sys_refcount_exit) + unregister_trace_syscall_exit(ftrace_syscall_exit); + mutex_unlock(&syscall_trace_lock); } -device_initcall(register_ftrace_syscalls); + +struct trace_event event_syscall_enter = { + .trace = print_syscall_enter, + .type = TRACE_SYSCALL_ENTER +}; + +struct trace_event event_syscall_exit = { + .trace = print_syscall_exit, + .type = TRACE_SYSCALL_EXIT +}; -- cgit v1.2.3-70-g09d2 From 64c12e0444fcc6b75eb49144ba46d43dbdc6bc8f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:53 -0400 Subject: tracing: Add individual syscalls tracepoint id support The current state of syscalls tracepoints generates only one event id for every syscall events. This patch associates an id with each syscall trace event, so that we can identify each syscall trace event using the 'perf' tool. 
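In effect, each generated syscall event now registers its own print handler and remembers the returned id; roughly, for a syscall called example (placeholder names, mirroring the macro changes in this patch):

  id = register_ftrace_event(&enter_syscall_print_example);
  if (!id)
          return -ENODEV;
  event_enter_example.id = id;            /* per-event ftrace id */
  set_syscall_enter_id(num, id);          /* stored in syscall_metadata */
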
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 10 ++++++++++ include/linux/syscalls.h | 22 ++++++++++++++++++---- include/trace/syscall.h | 8 ++++++++ kernel/trace/trace.h | 6 ------ kernel/trace/trace_syscalls.c | 26 ++++++++++++++++---------- 5 files changed, 52 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0d93d409b8d..3cff1214e17 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -516,6 +516,16 @@ int syscall_name_to_nr(char *name) return -1; } +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + static int __init arch_init_ftrace_syscalls(void) { int i; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5e5b4d33a31..ce4b01c658e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -116,13 +116,20 @@ struct perf_counter_attr; #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static struct ftrace_event_call event_enter_##sname; \ + struct trace_event enter_syscall_print_##sname = { \ + .trace = print_syscall_enter, \ + }; \ static int init_enter_##sname(void) \ { \ - int num; \ + int num, id; \ num = syscall_name_to_nr("sys"#sname); \ if (num < 0) \ return -ENOSYS; \ - register_ftrace_event(&event_syscall_enter); \ + id = register_ftrace_event(&enter_syscall_print_##sname);\ + if (!id) \ + return -ENODEV; \ + event_enter_##sname.id = id; \ + set_syscall_enter_id(num, id); \ INIT_LIST_HEAD(&event_enter_##sname.fields); \ init_preds(&event_enter_##sname); \ return 0; \ @@ -142,13 +149,20 @@ struct perf_counter_attr; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct ftrace_event_call event_exit_##sname; \ + struct trace_event exit_syscall_print_##sname = { \ + .trace = print_syscall_exit, \ + }; \ static int init_exit_##sname(void) \ { \ - int num; \ + int num, id; \ num = syscall_name_to_nr("sys"#sname); \ if (num < 0) \ return -ENOSYS; \ - register_ftrace_event(&event_syscall_exit); \ + id = register_ftrace_event(&exit_syscall_print_##sname);\ + if (!id) \ + return -ENODEV; \ + event_exit_##sname.id = id; \ + set_syscall_exit_id(num, id); \ INIT_LIST_HEAD(&event_exit_##sname.fields); \ init_preds(&event_exit_##sname); \ return 0; \ diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 73fb8b4a995..df628404241 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -32,23 +32,31 @@ DECLARE_TRACE_WITH_CALLBACK(syscall_exit, * @nb_args: number of parameters it takes * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) + * @enter_id: associated ftrace enter event id + * @exit_id: associated ftrace exit event id */ struct syscall_metadata { const char *name; int nb_args; const char **types; const char **args; + int enter_id; + int exit_id; }; #ifdef CONFIG_FTRACE_SYSCALLS extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern int syscall_name_to_nr(char *name); +void set_syscall_enter_id(int num, int id); +void set_syscall_exit_id(int num, int id); extern struct trace_event event_syscall_enter; extern struct trace_event event_syscall_exit; extern int reg_event_syscall_enter(void *ptr); extern void unreg_event_syscall_enter(void *ptr); extern int 
reg_event_syscall_exit(void *ptr); extern void unreg_event_syscall_exit(void *ptr); +enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); +enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d682357e4b1..300ef788c97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,8 +34,6 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, - TRACE_SYSCALL_ENTER, - TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, @@ -319,10 +317,6 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct syscall_trace_enter, \ - TRACE_SYSCALL_ENTER); \ - IF_ASSIGN(var, ent, struct syscall_trace_exit, \ - TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c7ae25ee95d..e58a9c11ba8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -36,14 +36,18 @@ print_syscall_enter(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int i, ret, syscall; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) goto end; + if (entry->enter_id != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + ret = trace_seq_printf(s, "%s(", entry->name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -78,16 +82,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int ret; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) { trace_seq_printf(s, "\n"); return TRACE_TYPE_HANDLED; } + if (entry->exit_id != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); if (!ret) @@ -114,7 +122,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + event = trace_current_buffer_lock_reserve(sys_data->enter_id, size, 0, 0); if (!event) return; @@ -142,7 +150,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + event = trace_current_buffer_lock_reserve(sys_data->exit_id, sizeof(*entry), 0, 0); if (!event) return; @@ -239,10 +247,8 @@ void unreg_event_syscall_exit(void *ptr) struct trace_event event_syscall_enter = { .trace = print_syscall_enter, - .type = TRACE_SYSCALL_ENTER }; struct trace_event event_syscall_exit = { .trace = print_syscall_exit, - .type = TRACE_SYSCALL_EXIT }; -- cgit v1.2.3-70-g09d2 From f4b5ffccc83c82947f5d9f15d6f1b6edb1b71cd7 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:53:02 -0400 Subject: tracing: Add perf counter support for syscalls tracing The perf counter support is automated for usual trace events. But we have to define specific callbacks for this to handle syscalls trace events Make 'perf stat -e syscalls:sys_enter_blah' work with syscall style tracepoints. 
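As a sketch (not part of the diff below), the profile enable/disable pair introduced here roughly expands as follows for a syscall whose sname is _getpid. Because profile_count starts at ATOMIC_INIT(-1), the first atomic_inc_return() yields zero and registers the shared tracepoint probe; atomic_add_negative() detects the last profiling user going away and unregisters it again:

static int prof_sysenter_enable__getpid(struct ftrace_event_call *event_call)
{
        int ret = 0;

        /* first profiling user: hook the syscall-enter tracepoint */
        if (!atomic_inc_return(&event_enter__getpid.profile_count))
                ret = reg_prof_syscall_enter("sys_getpid");
        return ret;
}

static void prof_sysenter_disable__getpid(struct ftrace_event_call *event_call)
{
        /* last profiling user gone: drop the tracepoint probe again */
        if (atomic_add_negative(-1, &event_enter__getpid.profile_count))
                unreg_prof_syscall_enter("sys_getpid");
}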
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- include/linux/perf_counter.h | 2 + include/linux/syscalls.h | 52 +++++++++++++++++- include/trace/syscall.h | 7 +++ kernel/trace/trace_syscalls.c | 121 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe..8e6460fb4c0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -734,6 +734,8 @@ extern int sysctl_perf_counter_mlock; extern int sysctl_perf_counter_sample_rate; extern void perf_counter_init(void); +extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, + void *record, int entry_size); #ifndef perf_misc_flags #define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ce4b01c658e..5541e75e140 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -98,6 +98,53 @@ struct perf_counter_attr; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +#ifdef CONFIG_EVENT_PROFILE +#define TRACE_SYS_ENTER_PROFILE(sname) \ +static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + if (!atomic_inc_return(&event_enter_##sname.profile_count)) \ + ret = reg_prof_syscall_enter("sys"#sname); \ + return ret; \ +} \ + \ +static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_enter_##sname.profile_count)) \ + unreg_prof_syscall_enter("sys"#sname); \ +} + +#define TRACE_SYS_EXIT_PROFILE(sname) \ +static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + if (!atomic_inc_return(&event_exit_##sname.profile_count)) \ + ret = reg_prof_syscall_exit("sys"#sname); \ + return ret; \ +} \ + \ +static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ +{ \ + if (atomic_add_negative(-1, &event_exit_##sname.profile_count)) \ + unreg_prof_syscall_exit("sys"#sname); \ +} + +#define TRACE_SYS_ENTER_PROFILE_INIT(sname) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = prof_sysenter_enable_##sname, \ + .profile_disable = prof_sysenter_disable_##sname, + +#define TRACE_SYS_EXIT_PROFILE_INIT(sname) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = prof_sysexit_enable_##sname, \ + .profile_disable = prof_sysexit_disable_##sname, +#else +#define TRACE_SYS_ENTER_PROFILE(sname) +#define TRACE_SYS_ENTER_PROFILE_INIT(sname) +#define TRACE_SYS_EXIT_PROFILE(sname) +#define TRACE_SYS_EXIT_PROFILE_INIT(sname) +#endif + #ifdef CONFIG_FTRACE_SYSCALLS #define __SC_STR_ADECL1(t, a) #a #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) @@ -113,7 +160,6 @@ struct perf_counter_attr; #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) #define __SC_STR_TDECL6(t, a, ...) 
#t, __SC_STR_TDECL5(__VA_ARGS__) - #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static struct ftrace_event_call event_enter_##sname; \ struct trace_event enter_syscall_print_##sname = { \ @@ -134,6 +180,7 @@ struct perf_counter_attr; init_preds(&event_enter_##sname); \ return 0; \ } \ + TRACE_SYS_ENTER_PROFILE(sname); \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) \ @@ -145,6 +192,7 @@ struct perf_counter_attr; .regfunc = reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ .data = "sys"#sname, \ + TRACE_SYS_ENTER_PROFILE_INIT(sname) \ } #define SYSCALL_TRACE_EXIT_EVENT(sname) \ @@ -167,6 +215,7 @@ struct perf_counter_attr; init_preds(&event_exit_##sname); \ return 0; \ } \ + TRACE_SYS_EXIT_PROFILE(sname); \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) \ @@ -178,6 +227,7 @@ struct perf_counter_attr; .regfunc = reg_event_syscall_exit, \ .unregfunc = unreg_event_syscall_exit, \ .data = "sys"#sname, \ + TRACE_SYS_EXIT_PROFILE_INIT(sname) \ } #define SYSCALL_METADATA(sname, nb) \ diff --git a/include/trace/syscall.h b/include/trace/syscall.h index df628404241..3ab6dd18fa3 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -57,6 +57,13 @@ extern int reg_event_syscall_exit(void *ptr); extern void unreg_event_syscall_exit(void *ptr); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); +#endif +#ifdef CONFIG_EVENT_PROFILE +int reg_prof_syscall_enter(char *name); +void unreg_prof_syscall_enter(char *name); +int reg_prof_syscall_exit(char *name); +void unreg_prof_syscall_exit(char *name); + #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e58a9c11ba8..f4eaec3d559 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "trace_output.h" @@ -252,3 +253,123 @@ struct trace_event event_syscall_enter = { struct trace_event event_syscall_exit = { .trace = print_syscall_exit, }; + +#ifdef CONFIG_EVENT_PROFILE +static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); +static int sys_prof_refcount_enter; +static int sys_prof_refcount_exit; + +static void prof_syscall_enter(struct pt_regs *regs, long id) +{ + struct syscall_metadata *sys_data; + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + perf_tpcounter_event(sys_data->enter_id, 0, 1, NULL, 0); +} + +int reg_prof_syscall_enter(char *name) +{ + int ret = 0; + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + + mutex_lock(&syscall_trace_lock); + if (!sys_prof_refcount_enter) + ret = register_trace_syscall_enter(prof_syscall_enter); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_prof_enter_syscalls); + sys_prof_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_prof_syscall_enter(char *name) +{ + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + + 
mutex_lock(&syscall_trace_lock); + sys_prof_refcount_enter--; + clear_bit(num, enabled_prof_enter_syscalls); + if (!sys_prof_refcount_enter) + unregister_trace_syscall_enter(prof_syscall_enter); + mutex_unlock(&syscall_trace_lock); +} + +static void prof_syscall_exit(struct pt_regs *regs, long ret) +{ + struct syscall_metadata *sys_data; + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + perf_tpcounter_event(sys_data->exit_id, 0, 1, NULL, 0); +} + +int reg_prof_syscall_exit(char *name) +{ + int ret = 0; + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + + mutex_lock(&syscall_trace_lock); + if (!sys_prof_refcount_exit) + ret = register_trace_syscall_exit(prof_syscall_exit); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_prof_exit_syscalls); + sys_prof_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_prof_syscall_exit(char *name) +{ + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + + mutex_lock(&syscall_trace_lock); + sys_prof_refcount_exit--; + clear_bit(num, enabled_prof_exit_syscalls); + if (!sys_prof_refcount_exit) + unregister_trace_syscall_exit(prof_syscall_exit); + mutex_unlock(&syscall_trace_lock); +} + +#endif + + -- cgit v1.2.3-70-g09d2 From e8f9f4d79a677f55c8ec3acbe87b33a87e2df0de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2009 17:42:52 +0200 Subject: tracing: Add ftrace event call parameter to its field descriptor handler Add the struct ftrace_event_call as a parameter of its show_format() callback. This way we can use it from the syscall trace events to retrieve the syscall name from the ftrace event call parameter and describe its fields using the syscalls metadata. 
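For example (a hypothetical callback, not taken from this patch), a syscall format handler can now recover the syscall metadata from the event call it is handed, because the event's ->data field carries the "sys_..." name:

static int example_syscall_show_format(struct ftrace_event_call *call,
                                       struct trace_seq *s)
{
        int nr = syscall_name_to_nr((char *)call->data);
        struct syscall_metadata *meta;

        if (nr < 0)
                return 0;

        meta = syscall_nr_to_meta(nr);
        if (!meta)
                return 0;

        /* describe the event fields using the syscall metadata */
        return trace_seq_printf(s, "\tsyscall: %s, nb_args: %d\n",
                                meta->name, meta->nb_args);
}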
Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron --- include/linux/ftrace_event.h | 3 ++- include/trace/ftrace.h | 3 ++- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_export.c | 6 ++++-- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 8544f121d9f..189806b6e69 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -116,7 +116,8 @@ struct ftrace_event_call { void (*unregfunc)(void *); int id; int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); + int (*show_format)(struct ftrace_event_call *call, + struct trace_seq *s); int (*define_fields)(void); struct list_head fields; int filter_active; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 46d81b5e861..b250b061657 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -151,7 +151,8 @@ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ struct ftrace_raw_##call field __attribute__((unused)); \ int ret = 0; \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d289e2d669..b568ade8f45 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -576,7 +576,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_printf(s, "format:\n"); trace_write_header(s); - r = call->show_format(s); + r = call->show_format(call, s); if (!r) { /* * ug! The format output is bigger than a PAGE!! diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d06cf898dc8..956d4bc675e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -60,7 +60,8 @@ extern void __bad_type_size(void); #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ struct args field; \ int ret; \ @@ -76,7 +77,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ tpfmt) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ struct args field; \ int ret; \ -- cgit v1.2.3-70-g09d2 From dc4ddb4c0b7348f1c9759ae8a9e7d734dc1cda82 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2009 19:03:54 +0200 Subject: tracing: Add fields format definition for syscall events Define the format of the syscall trace fields to parse the binary values from a raw trace using the syscall events "format" file. This is defined dynamically using the syscalls metadata. It prepares the export of syscall event raw records to perf counters. 
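A userspace consumer can then parse these dynamically generated field descriptions; a rough sketch (assuming debugfs is mounted at /debug, as in the example below) that simply prints the field lines:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *path =
                "/debug/tracing/events/syscalls/sys_enter_sched_getparam/format";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* keep only the "field:" descriptions from the format file */
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "field:"))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}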
Example: $ cat /debug/tracing/events/syscalls/sys_enter_sched_getparam/format name: sys_enter_sched_getparam ID: 39 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field:pid_t pid; offset:12; size:8; field:struct sched_param * param; offset:20; size:8; print fmt: "pid: 0x%08lx, param: 0x%08lx", ((unsigned long)(REC->pid)), ((unsigned long)(REC->param)) Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron --- include/linux/syscalls.h | 1 + include/trace/syscall.h | 2 ++ kernel/trace/trace_syscalls.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5541e75e140..87d06c173dd 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -189,6 +189,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ .system = "syscalls", \ .event = &event_syscall_enter, \ .raw_init = init_enter_##sname, \ + .show_format = ftrace_format_syscall, \ .regfunc = reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ .data = "sys"#sname, \ diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 3ab6dd18fa3..0cb03625edd 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -55,6 +55,8 @@ extern int reg_event_syscall_enter(void *ptr); extern void unreg_event_syscall_enter(void *ptr); extern int reg_event_syscall_exit(void *ptr); extern void unreg_event_syscall_exit(void *ptr); +extern int +ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f4eaec3d559..9ee6386cf84 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -105,6 +105,52 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } +int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) +{ + int i; + int nr; + int ret = 0; + struct syscall_metadata *entry; + int offset = sizeof(struct trace_entry); + + nr = syscall_name_to_nr((char *)call->data); + entry = syscall_nr_to_meta(nr); + + if (!entry) + return ret; + + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], + entry->args[i]); + if (!ret) + return 0; + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%lu;\n", offset, + sizeof(unsigned long)); + if (!ret) + return 0; + offset += sizeof(unsigned long); + } + + trace_seq_printf(s, "\nprint fmt: \""); + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "%s: 0x%%0%lulx%s", entry->args[i], + sizeof(unsigned long), + i == entry->nb_args - 1 ? "\", " : ", "); + if (!ret) + return 0; + } + + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "((unsigned long)(REC->%s))%s", + entry->args[i], + i == entry->nb_args - 1 ? 
"\n" : ", "); + if (!ret) + return 0; + } + + return ret; +} + void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; -- cgit v1.2.3-70-g09d2 From 19007a67a64f9b3cbbd7024f972654ebf14daade Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2009 20:22:53 +0200 Subject: tracing: Support for syscall events raw records in perfcounters This bring the support for raw syscall events in perfcounters. The arguments or exit value are saved as a raw sample using the PERF_SAMPLE_RAW attribute in a perf counter. Example (for now you must explicitly set the PERF_SAMPLE_RAW flag in perf record): perf record -e syscalls:sys_enter_open -f -F 1 -a perf report -D 0x2cbb8 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 02 00 50 00 20 e9 39 ab 0a 7f 00 00 ......P. .9.... . 0010: bc 14 00 00 bc 14 00 00 01 00 00 00 00 00 00 00 ............... . 0020: 2c 00 00 00 15 01 01 00 bc 14 00 00 bc 14 00 00 ,.............. ^ ^ ^ ^ ^ ^ ^ .......................... Event Size struct trace_entry . 0030: 00 00 00 00 46 98 43 02 00 00 00 00 80 08 00 00 ....F.C........ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ptr to file name open flags . 0040: 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ............... ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ . open mode padding 0x2cbb8 [0x50]: PERF_EVENT_SAMPLE (IP, 2): 5308: 0x7f0aab39e920 period: 1 Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Jason Baron Cc: Masami Hiramatsu --- kernel/trace/trace_syscalls.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9ee6386cf84..f837cccabcf 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -301,6 +301,17 @@ struct trace_event event_syscall_exit = { }; #ifdef CONFIG_EVENT_PROFILE + +struct syscall_enter_record { + struct trace_entry entry; + unsigned long args[0]; +}; + +struct syscall_exit_record { + struct trace_entry entry; + unsigned long ret; +}; + static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); static int sys_prof_refcount_enter; @@ -308,8 +319,10 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { + struct syscall_enter_record *rec; struct syscall_metadata *sys_data; int syscall_nr; + int size; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -319,7 +332,24 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) if (!sys_data) return; - perf_tpcounter_event(sys_data->enter_id, 0, 1, NULL, 0); + /* get the size after alignment with the u32 buffer size field */ + size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size = ALIGN(size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + do { + char raw_data[size]; + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_enter_record *) raw_data; + tracing_generic_entry_update(&rec->entry, 0, 0); + rec->entry.type = sys_data->enter_id; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + } while(0); } int reg_prof_syscall_enter(char 
*name) @@ -364,6 +394,7 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; + struct syscall_exit_record rec; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); @@ -374,7 +405,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - perf_tpcounter_event(sys_data->exit_id, 0, 1, NULL, 0); + tracing_generic_entry_update(&rec.entry, 0, 0); + rec.entry.type = sys_data->exit_id; + rec.ret = syscall_get_return_value(current, regs); + + perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); } int reg_prof_syscall_exit(char *name) -- cgit v1.2.3-70-g09d2 From 39cbb602b543e477df71dca84b5b2e36f8bd29fc Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 7 Aug 2009 12:01:08 -0400 Subject: Remove double removal of blktrace directory commit fd51d251e4cdb21f68e9dbc4336514d64a105a79 Author: Stefan Raspl Date: Tue May 19 09:59:08 2009 +0200 blktrace: remove debugfs entries on bad path added in an explicit invocation of debugfs_remove for bt->dir, in blk_remove_buf_file_callback we are also getting the directory removed. On occasion I am seeing memory corruption that I have bisected down to this commit. [The testing involves a (long) series of I/O benchmarks with blktrace invoked around the actual runs.] I believe that this committed patch is correct, but the problem actually lies in the code in blk_remove_buf_file_callback. With this patch I am able to consistently get complete runs whereas previously I could not get a single run to complete. The first part of the patch simply moves the debugfs_remove below the relay_close: the relay_close call will remove files under bt->dir, and so we should not remove the directory until all the files we created have been removed. (Note: This is not sufficient to fix the problem - the file system code has ref counts on the directoy, so our invocation does not cause the directory to actually be removed. Nonetheless, we should not rely upon that feature.) Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1090b0aed9b..7a34cb563fe 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); - debugfs_remove(bt->dir); relay_close(bt->rchan); + debugfs_remove(bt->dir); free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, static int blk_remove_buf_file_callback(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; debugfs_remove(dentry); - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); return 0; } -- cgit v1.2.3-70-g09d2 From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). 
This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe..2b36afe6ae0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a07f39..e26d2fcfa32 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.2.3-70-g09d2 From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 09:51:55 +0200 Subject: perf_counter: Fix swcounter context invariance perf_swcounter_is_counting() uses a lock, which means we cannot use swcounters from NMI or when holding that particular lock, this is unintended. The below removes the lock, this opens up race window, but not worse than the swcounters already experience due to RCU traversal of the context in perf_swcounter_ctx_event(). This also fixes the hard lockups while opening a lockdep tracepoint counter. 
Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Paul Mackerras Cc: stephane eranian Cc: Corey J Ashford LKML-Reference: <1250149915.10001.66.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e26d2fcfa32..3dd4339589a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, static int perf_swcounter_is_counting(struct perf_counter *counter) { - struct perf_counter_context *ctx; - unsigned long flags; - int count; - + /* + * The counter is active, we're good! + */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) return 1; + /* + * The counter is off/error, not counting. + */ if (counter->state != PERF_COUNTER_STATE_INACTIVE) return 0; /* - * If the counter is inactive, it could be just because - * its task is scheduled out, or because it's in a group - * which could not go on the PMU. We want to count in - * the first case but not the second. If the context is - * currently active then an inactive software counter must - * be the second case. If it's not currently active then - * we need to know whether the counter was active when the - * context was last active, which we can determine by - * comparing counter->tstamp_stopped with ctx->time. - * - * We are within an RCU read-side critical section, - * which protects the existence of *ctx. + * The counter is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. */ - ctx = counter->ctx; - spin_lock_irqsave(&ctx->lock, flags); - count = 1; - /* Re-check state now we have the lock */ - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->ctx->is_active || - counter->tstamp_stopped < ctx->time) - count = 0; - spin_unlock_irqrestore(&ctx->lock, flags); - return count; + if (counter->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3-70-g09d2 From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. 
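For instance, a minimal userspace sketch of consuming such a group read (assuming fd refers to a group-leader counter opened elsewhere with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID and without the time_enabled/time_running fields):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int read_group(int fd)
{
        uint64_t buf[1 + 2 * 32];       /* room for up to 32 counters */
        uint64_t *p = buf, nr, i;
        ssize_t n = read(fd, buf, sizeof(buf));

        if (n <= 0)
                return -1;

        nr = *p++;                      /* number of counters in the group */
        for (i = 0; i < nr; i++) {
                uint64_t value = *p++;
                uint64_t id = *p++;     /* present because PERF_FORMAT_ID is set */

                printf("counter id %llu: value %llu\n",
                       (unsigned long long)id,
                       (unsigned long long)value);
        }
        return 0;
}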
Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 47 ++++++-- kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++------------ 2 files changed, 238 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe6ae0..b53f7006cc4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
+ * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339589a..b8c6b97a20a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 
+1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. 
+ */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
- */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { -- cgit v1.2.3-70-g09d2 From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:54 +0200 Subject: perf_counter: Fix an ipi-deadlock perf_pending_counter() is called from IRQ context and will call perf_counter_disable(), however perf_counter_disable() uses smp_call_function_single() which doesn't fancy being used with IRQs disabled due to IPI deadlocks. Fix this by making it use the local __perf_counter_disable() call and teaching the counter_sched_out() code about pending disables as well. This should cover the case where a counter migrates before the pending queue gets processed. 
Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.244097721@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8c6b97a20a..3f841beefc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + if (counter->pending_disable) { + counter->pending_disable = 0; + counter->state = PERF_COUNTER_STATE_OFF; + } counter->tstamp_stopped = ctx->time; counter->pmu->disable(counter); counter->oncpu = -1; @@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry) if (counter->pending_disable) { counter->pending_disable = 0; - perf_counter_disable(counter); + __perf_counter_disable(counter); } if (counter->pending_wakeup) { -- cgit v1.2.3-70-g09d2 From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 16:14:42 +0200 Subject: perf_counter: Report the cloning task as parent on perf_counter_fork() A bug in (9f498cc: perf_counter: Full task tracing) makes profiling multi-threaded apps it go belly up. [ output as: (PID:TID):(PPID:PTID) ] # ./perf report -D | grep FORK 0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236) 0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236) 0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236) 0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236) 0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236) Shows us that the test (27d028d perf report: Update for the new FORK/EXIT events) in builtin-report.c: /* * A thread clone will have the same PID for both * parent and child. */ if (thread == parent) return 0; Will clearly fail. The problem is that perf_counter_fork() reports the actual parent, instead of the cloning thread. Fixing that (with the below patch), yields: # ./perf report -D | grep FORK 0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589) 0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590) 0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590) 0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590) 0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590) Which both makes more sense and doesn't confuse perf report anymore. 
Reported-by: Pekka Enberg Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: Anton Blanchard Cc: Arjan van de Ven LKML-Reference: <1250172882.5241.62.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f841beefc7..534e20d14d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter, return; task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.ppid = perf_counter_pid(counter, current); task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + task_event->event.ptid = perf_counter_tid(counter, current); perf_output_put(&handle, task_event->event); perf_output_end(&handle); -- cgit v1.2.3-70-g09d2 From 2d860ad76f4ee4d2eba0fe3797c8d7cdce432cc0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 13:05:10 -0700 Subject: genirq: prevent wakeup of freed irq thread free_irq() can remove an irqaction while the corresponding interrupt is in progress, but free_irq() sets action->thread to NULL unconditionally, which might lead to a NULL pointer dereference in handle_IRQ_event() when the hard interrupt context tries to wake up the handler thread. Prevent this by moving the thread stop after synchronize_irq(). No need to set action->thread to NULL either as action is going to be freed anyway. This fixes a boot crash reported against preempt-rt which uses the mainline irq threads code to implement full irq threading. 
[ tglx: removed local irqthread variable ] Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c679db468..d222515a5a0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -761,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; - struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -809,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - irqthread = action->thread; - action->thread = NULL; - spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -819,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); - if (irqthread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(irqthread); - put_task_struct(irqthread); - } - #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -840,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif + + if (action->thread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(action->thread); + put_task_struct(action->thread); + } + return action; } -- cgit v1.2.3-70-g09d2 From 60d970c254b95ec7a0fc4c590b510253987b64a0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 23:37:26 +0200 Subject: tracing: Fix syscall tracing on !HAVE_FTRACE_SYSCALLS architectures The new syscall_regfunc()/unregfunc() functions rely on the existence of TIF_SYSCALL_FTRACE - but that TIF flag is only offered by HAVE_FTRACE_SYSCALLS. Cc: Frederic Weisbecker Cc: Jason Baron Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 070a42bb892..35dd27adb82 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -579,6 +579,8 @@ __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ +#ifdef CONFIG_FTRACE_SYSCALLS + static DEFINE_MUTEX(regfunc_mutex); static int sys_tracepoint_refcount; @@ -615,3 +617,4 @@ void syscall_unregfunc(void) } mutex_unlock(®func_mutex); } +#endif -- cgit v1.2.3-70-g09d2 From 5a165657bef7c47e5ff4cd138f7758ef6278e87b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 13 Aug 2009 05:20:45 +0000 Subject: net: skb ftracer - Add config option to enable new ftracer (v3) skb allocation / consumption corelator - Add config option This patch adds a Kconfig option to enable the addtition of the skb source tracer. Signed-off-by: Neil Horman Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380fd76..f09d7635c0e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -234,6 +234,16 @@ config BOOT_TRACER You must pass in initcall_debug and ftrace=initcall to the kernel command line to enable this on bootup. 
+config SKB_SOURCES_TRACER + bool "Trace skb source information" + select GENERIC_TRACER + help + This tracer helps developers/sysadmins correlate skb allocation and + consumption. The idea being that some processes will primarily consume data + that was allocated on certain numa nodes. By being able to visualize which + nodes the data was allocated on, a sysadmin or developer can optimize the + scheduling of those processes to cut back on cross node chatter. + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER -- cgit v1.2.3-70-g09d2 From 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 13 Aug 2009 05:23:56 +0000 Subject: net: skb ftracer - Add actual ftrace code to kernel (v3) skb allocation / consumption correlator Add ftracer module to kernel to print out a list that correlates a process id, an skb it read, and the numa nodes on wich the process was running when it was read along with the numa node the skbuff was allocated on. Signed-off-by: Neil Horman Makefile | 1 trace.h | 19 ++++++ trace_skb_sources.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+) Signed-off-by: David S. Miller --- kernel/trace/Makefile | 1 + kernel/trace/trace.h | 19 +++++ kernel/trace/trace_skb_sources.c | 154 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+) create mode 100644 kernel/trace/trace_skb_sources.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90..ee5e5b1b928 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif +obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b9f4f6e955..8a6281b2330 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, + TRACE_SKB_SOURCE, __TRACE_LAST_TYPE, }; @@ -171,6 +173,21 @@ struct trace_power { struct power_trace state_data; }; +struct skb_record { + pid_t pid; /* pid of the copying process */ + int anid; /* node where skb was allocated */ + int cnid; /* node to which skb was copied in userspace */ + char ifname[IFNAMSIZ]; /* Name of the receiving interface */ + int rx_queue; /* The rx queue the skb was received on */ + int ccpu; /* Cpu the application got this frame from */ + int len; /* length of the data copied */ +}; + +struct trace_skb_event { + struct trace_entry ent; + struct skb_record event_data; +}; + enum kmemtrace_type_id { KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). 
*/ @@ -323,6 +340,8 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ + IF_ASSIGN(var, ent, struct trace_skb_event, \ + TRACE_SKB_SOURCE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c new file mode 100644 index 00000000000..40eb071afdb --- /dev/null +++ b/kernel/trace/trace_skb_sources.c @@ -0,0 +1,154 @@ +/* + * ring buffer based tracer for analyzing per-socket skb sources + * + * Neil Horman + * Copyright (C) 2009 + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec); + +static struct trace_array *skb_trace; +static int __read_mostly trace_skb_source_enabled; + +static void probe_skb_dequeue(const struct sk_buff *skb, int len) +{ + struct ring_buffer_event *event; + struct trace_skb_event *entry; + struct trace_array *tr = skb_trace; + struct net_device *dev; + + if (!trace_skb_source_enabled) + return; + + if (in_interrupt()) + return; + + event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE, + sizeof(*entry), 0, 0); + if (!event) + return; + entry = ring_buffer_event_data(event); + + entry->event_data.pid = current->pid; + entry->event_data.anid = page_to_nid(virt_to_page(skb->data)); + entry->event_data.cnid = cpu_to_node(smp_processor_id()); + entry->event_data.len = len; + entry->event_data.rx_queue = skb->queue_mapping; + entry->event_data.ccpu = smp_processor_id(); + + dev = dev_get_by_index(sock_net(skb->sk), skb->iif); + if (dev) { + memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ); + dev_put(dev); + } else { + strcpy(entry->event_data.ifname, "Unknown"); + } + + trace_buffer_unlock_commit(tr, event, 0, 0); +} + +static int tracing_skb_source_register(void) +{ + int ret; + + ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue); + if (ret) + pr_info("skb source trace: Couldn't activate dequeue tracepoint"); + + return ret; +} + +static void start_skb_source_trace(struct trace_array *tr) +{ + trace_skb_source_enabled = 1; +} + +static void stop_skb_source_trace(struct trace_array *tr) +{ + trace_skb_source_enabled = 0; +} + +static void skb_source_trace_reset(struct trace_array *tr) +{ + trace_skb_source_enabled = 0; + unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue); +} + + +static int skb_source_trace_init(struct trace_array *tr) +{ + int cpu; + skb_trace = tr; + + trace_skb_source_enabled = 1; + tracing_skb_source_register(); + + for_each_cpu(cpu, cpu_possible_mask) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t skb_source_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_skb_event *event; + struct skb_record *record; + struct trace_seq *s = &iter->seq; + + trace_assign_type(event, entry); + record = &event->event_data; + if (entry->type != TRACE_SKB_SOURCE) + return TRACE_TYPE_UNHANDLED; + + ret = trace_seq_printf(s, " %d %d %d %s %d %d %d\n", + record->pid, + record->anid, + record->cnid, + record->ifname, + record->rx_queue, + record->ccpu, + record->len); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static void skb_source_print_header(struct seq_file *s) +{ + seq_puts(s, "# PID ANID CNID IFC RXQ CCPU LEN\n"); + seq_puts(s, "# | | | | | | |\n"); +} + +static struct tracer skb_source_tracer __read_mostly = +{ + .name = 
"skb_sources", + .init = skb_source_trace_init, + .start = start_skb_source_trace, + .stop = stop_skb_source_trace, + .reset = skb_source_trace_reset, + .print_line = skb_source_print_line, + .print_header = skb_source_print_header, +}; + +static int init_skb_source_trace(void) +{ + return register_tracer(&skb_source_tracer); +} +device_initcall(init_skb_source_trace); -- cgit v1.2.3-70-g09d2 From 9188499cdb117d86a1ea6b04374095b098d56936 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 13 Aug 2009 09:44:57 -0400 Subject: security: introducing security_request_module Calling request_module() will trigger a userspace upcall which will load a new module into the kernel. This can be a dangerous event if the process able to trigger request_module() is able to control either the modprobe binary or the module binary. This patch adds a new security hook to request_module() which can be used by an LSM to control a processes ability to call request_module(). Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/security.h | 10 ++++++++++ kernel/kmod.c | 4 ++++ security/capability.c | 6 ++++++ security/security.c | 5 +++++ 4 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 57ead99d259..1e3dd86eea9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -678,6 +678,9 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inode points to the inode to use as a reference. * The current task must be the one that nominated @inode. * Return 0 if successful. + * @kernel_module_request: + * Ability to trigger the kernel to automatically upcall to userspace for + * userspace to load a kernel module with the given name. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -1489,6 +1492,7 @@ struct security_operations { void (*cred_commit)(struct cred *new, const struct cred *old); int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); + int (*kernel_module_request)(void); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); @@ -1741,6 +1745,7 @@ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_commit_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); +int security_kernel_module_request(void); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); @@ -2292,6 +2297,11 @@ static inline int security_kernel_create_files_as(struct cred *cred, return 0; } +static inline int security_kernel_module_request(void) +{ + return 0; +} + static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { diff --git a/kernel/kmod.c b/kernel/kmod.c index 385c31a1bdb..5a7ae57f983 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -78,6 +78,10 @@ int __request_module(bool wait, const char *fmt, ...) 
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + ret = security_kernel_module_request(); + if (ret) + return ret; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); diff --git a/security/capability.c b/security/capability.c index ec057305402..1b943f54b2e 100644 --- a/security/capability.c +++ b/security/capability.c @@ -396,6 +396,11 @@ static int cap_kernel_create_files_as(struct cred *new, struct inode *inode) return 0; } +static int cap_kernel_module_request(void) +{ + return 0; +} + static int cap_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return 0; @@ -945,6 +950,7 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, cred_commit); set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); + set_to_cap_if_null(ops, kernel_module_request); set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); diff --git a/security/security.c b/security/security.c index 4501c5e1f98..0e993f42ce3 100644 --- a/security/security.c +++ b/security/security.c @@ -709,6 +709,11 @@ int security_kernel_create_files_as(struct cred *new, struct inode *inode) return security_ops->kernel_create_files_as(new, inode); } +int security_kernel_module_request(void) +{ + return security_ops->kernel_module_request(); +} + int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return security_ops->task_setuid(id0, id1, id2, flags); -- cgit v1.2.3-70-g09d2 From 31089c13bcb18d2cd2a3ddfbe3a28666346f237e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 14 Aug 2009 15:47:18 +0200 Subject: timekeeping: Introduce timekeeping_leap_insert Move the adjustment of xtime, wall_to_monotonic and the update of the vsyscall variables to the timekeeping code. 
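For illustration, the consolidated helper added below is just the three operations that used to be open-coded in the NTP leap second handler, kept together so the vsyscall data is always refreshed right after xtime changes (xtime_lock must be held):

/* must hold xtime_lock */
void timekeeping_leap_insert(int leapsecond)
{
	xtime.tv_sec += leapsecond;
	wall_to_monotonic.tv_sec -= leapsecond;
	update_vsyscall(&xtime, clock);
}

The TIME_INS case in ntp_leap_second() then calls timekeeping_leap_insert(-1) and the TIME_DEL case calls timekeeping_leap_insert(1), instead of adjusting xtime and wall_to_monotonic directly.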
Signed-off-by: John Stultz Signed-off-by: Martin Schwidefsky LKML-Reference: <20090814134807.609730216@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 + kernel/time/ntp.c | 7 ++----- kernel/time/timekeeping.c | 7 +++++++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index ea16c1a01d5..e7c84455888 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -147,6 +147,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); +extern void timekeeping_leap_insert(int leapsecond); struct tms; extern void do_sys_times(struct tms *); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc64375ff4..4800f933910 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) case TIME_OK: break; case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; + timekeeping_leap_insert(-1); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - xtime.tv_sec++; + timekeeping_leap_insert(1); time_tai--; - wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } - update_vsyscall(&xtime, clock); write_sequnlock(&xtime_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c0b2c9c67..b8b70fb545f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,6 +58,13 @@ void update_xtime_cache(u64 nsec) struct clocksource *clock; +/* must hold xtime_lock */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, clock); +} #ifdef CONFIG_GENERIC_TIME /** -- cgit v1.2.3-70-g09d2 From a0f7d48bfb95a4c5172a2756dbc4b82afc8e9ae4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:19 +0200 Subject: timekeeping: Remove clocksource inline functions The three inline functions clocksource_read, clocksource_enable and clocksource_disable are simple wrappers of an indirect call plus the copy from and to the mult_orig value. The functions are exclusively used by the timekeeping code which has intimate knowledge of the clocksource anyway. Therefore remove the inline functions. No functional change. 
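For illustration, the open-coded equivalents are short; this is a sketch of what change_clocksource() and timekeeping_init() do after this patch (the function name here is made up, the real code is in the hunks below):

static void example_switch_clocksource(struct clocksource *old,
				       struct clocksource *new)
{
	/* was clocksource_enable(new) */
	if (new->enable)
		new->enable(new);
	/* ->enable() may have changed ->mult; keep mult_orig in sync */
	new->mult_orig = new->mult;

	/* was clocksource_read(new) */
	new->cycle_last = new->read(new);

	/* was clocksource_disable(old) */
	old->mult = old->mult_orig;
	if (old->disable)
		old->disable(old);
}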
Signed-off-by: Martin Schwidefsky Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134807.903108946@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 58 --------------------------------------------- kernel/time/timekeeping.c | 41 ++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 1219be4fb42..a1ef46f61c8 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -267,64 +267,6 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) return (u32)tmp; } -/** - * clocksource_read: - Access the clocksource's current cycle value - * @cs: pointer to clocksource being read - * - * Uses the clocksource to return the current cycle_t value - */ -static inline cycle_t clocksource_read(struct clocksource *cs) -{ - return cs->read(cs); -} - -/** - * clocksource_enable: - enable clocksource - * @cs: pointer to clocksource - * - * Enables the specified clocksource. The clocksource callback - * function should start up the hardware and setup mult and field - * members of struct clocksource to reflect hardware capabilities. - */ -static inline int clocksource_enable(struct clocksource *cs) -{ - int ret = 0; - - if (cs->enable) - ret = cs->enable(cs); - - /* - * The frequency may have changed while the clocksource - * was disabled. If so the code in ->enable() must update - * the mult value to reflect the new frequency. Make sure - * mult_orig follows this change. - */ - cs->mult_orig = cs->mult; - - return ret; -} - -/** - * clocksource_disable: - disable clocksource - * @cs: pointer to clocksource - * - * Disables the specified clocksource. The clocksource callback - * function should power down the now unused hardware block to - * save power. - */ -static inline void clocksource_disable(struct clocksource *cs) -{ - /* - * Save mult_orig in mult so clocksource_enable() can - * restore the value regardless if ->enable() updates - * the value of mult or not. 
- */ - cs->mult = cs->mult_orig; - - if (cs->disable) - cs->disable(cs); -} - /** * cyc2ns - converts clocksource cycles to nanoseconds * @cs: Pointer to clocksource diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b8b70fb545f..016a2591d71 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -79,7 +79,7 @@ static void clocksource_forward_now(void) cycle_t cycle_now, cycle_delta; s64 nsec; - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; @@ -114,7 +114,7 @@ void getnstimeofday(struct timespec *ts) *ts = xtime; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -146,7 +146,7 @@ ktime_t ktime_get(void) nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -186,7 +186,7 @@ void ktime_get_ts(struct timespec *ts) tomono = wall_to_monotonic; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -274,16 +274,29 @@ static void change_clocksource(void) clocksource_forward_now(); - if (clocksource_enable(new)) + if (new->enable && !new->enable(new)) return; + /* + * The frequency may have changed while the clocksource + * was disabled. If so the code in ->enable() must update + * the mult value to reflect the new frequency. Make sure + * mult_orig follows this change. + */ + new->mult_orig = new->mult; new->raw_time = clock->raw_time; old = clock; clock = new; - clocksource_disable(old); + /* + * Save mult_orig in mult so that the value can be restored + * regardless if ->enable() updates the value of mult or not. 
+ */ + old->mult = old->mult_orig; + if (old->disable) + old->disable(old); clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -373,7 +386,7 @@ void getrawmonotonic(struct timespec *ts) seq = read_seqbegin(&xtime_lock); /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -435,9 +448,12 @@ void __init timekeeping_init(void) ntp_init(); clock = clocksource_get_next(); - clocksource_enable(clock); + if (clock->enable) + clock->enable(clock); + /* set mult_orig on enable */ + clock->mult_orig = clock->mult; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); xtime.tv_sec = sec; xtime.tv_nsec = 0; @@ -477,8 +493,7 @@ static int timekeeping_resume(struct sys_device *dev) } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -630,7 +645,7 @@ void update_wall_time(void) return; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif -- cgit v1.2.3-70-g09d2 From 1be396794897f80bfc8774719ba60309a9e3d374 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:20 +0200 Subject: timekeeping: Move reset of cycle_last for tsc clocksource to tsc change_clocksource resets the cycle_last value to zero then sets it to a value read from the clocksource. The reset to zero is required only for the TSC clocksource to make the read_tsc function work after a resume. The reason is that the TSC read function uses cycle_last to detect backwards going TSCs. In the resume case cycle_last contains the TSC value from the last update before the suspend. On resume the TSC starts counting from 0 again and would trip over the cycle_last comparison. This is subtle and surprising. Move the reset to a resume function in the tsc code. 
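For reference, the backwards-TSC check in question looks roughly like this (simplified sketch of read_tsc(), not part of this patch):

static cycle_t read_tsc(struct clocksource *cs)
{
	cycle_t ret = (cycle_t)get_cycles();

	/*
	 * Clamp to cycle_last so a TSC value that appears to go
	 * backwards is ignored. With a stale pre-suspend cycle_last
	 * this would pin the returned value after resume, hence the
	 * reset below.
	 */
	return ret >= clocksource_tsc.cycle_last ?
		ret : clocksource_tsc.cycle_last;
}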
Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.142191175@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 6 ++++++ kernel/time/timekeeping.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..968425422c4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -744,10 +744,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 016a2591d71..b5673016089 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -295,7 +295,6 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = 0; clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; -- cgit v1.2.3-70-g09d2 From f1b82746c1e93daf24e1ab9bfbd39bcdb2e7018b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:21 +0200 Subject: clocksource: Cleanup clocksource selection If a non high-resolution clocksource is first set as override clock and then registered it becomes active even if the system is in one-shot mode. Move the override check from sysfs_override_clocksource to the clocksource selection. That fixes the bug and simplifies the code. The check in clocksource_register for double registration of the same clocksource is removed without replacement. To find the initial clocksource a new weak function in jiffies.c is defined that returns the jiffies clocksource. The architecture code can then override the weak function with a more suitable clocksource, e.g. the TOD clock on s390. 
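For illustration, the weak default plus architecture override pattern looks like this (both pieces are in the diff below; any other architecture would provide its own override):

/* generic fallback in kernel/time/jiffies.c */
struct clocksource * __init __weak clocksource_default_clock(void)
{
	return &clocksource_jiffies;
}

/* s390 override in arch/s390/kernel/time.c */
struct clocksource * __init clocksource_default_clock(void)
{
	return &clocksource_tod;
}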
[ tglx: Folded in a fix from John Stultz ] Signed-off-by: Martin Schwidefsky Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.388024160@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 4 ++ include/linux/clocksource.h | 2 + kernel/time/clocksource.c | 134 +++++++++++++++++--------------------------- kernel/time/jiffies.c | 6 +- kernel/time/timekeeping.c | 4 +- 5 files changed, 64 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index d4c8e9c47c8..afefe514df0 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -205,6 +205,10 @@ static struct clocksource clocksource_tod = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +struct clocksource * __init clocksource_default_clock(void) +{ + return &clocksource_tod; +} void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index a1ef46f61c8..f263b3abf46 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -322,6 +323,7 @@ extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); +extern struct clocksource * __init __weak clocksource_default_clock(void); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb81125..e91662e87cd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -21,7 +21,6 @@ * * TODO WishList: * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern */ #include @@ -107,12 +106,9 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - /*[Clocksource internal variables]--------- * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. + * currently selected clocksource. * next_clocksource: * pending next selected clocksource. * clocksource_list: @@ -123,9 +119,8 @@ extern struct clocksource clocksource_jiffies; * override_name: * Name of the user-specified clocksource. */ -static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *curr_clocksource; static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -320,6 +315,7 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +#ifdef CONFIG_GENERIC_TIME /** * clocksource_get_next - Returns the selected clocksource * @@ -339,56 +335,65 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Selects the best registered clocksource. + * clocksource_select - Select the best clocksource available * * Private function. Must hold clocksource_lock when called. * * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. 
*/ -static struct clocksource *select_clocksource(void) +static void clocksource_select(void) { - struct clocksource *next; + struct clocksource *best, *cs; if (list_empty(&clocksource_list)) - return NULL; + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + if (curr_clocksource != best) + next_clocksource = best; +} - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); +#else /* CONFIG_GENERIC_TIME */ - if (next == curr_clocksource) - return NULL; +static void clocksource_select(void) { } - return next; -} +#endif /* * Enqueue the clocksource sorted by rating */ -static int clocksource_enqueue(struct clocksource *c) +static void clocksource_enqueue(struct clocksource *cs) { - struct list_head *tmp, *entry = &clocksource_list; - - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; + list_for_each_entry(tmp, &clocksource_list, list) /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); } /** @@ -397,19 +402,16 @@ static int clocksource_enqueue(struct clocksource *c) * * Returns -EBUSY if registration fails, zero otherwise. 
*/ -int clocksource_register(struct clocksource *c) +int clocksource_register(struct clocksource *cs) { unsigned long flags; - int ret; spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); + clocksource_enqueue(cs); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; + clocksource_check_watchdog(cs); + return 0; } EXPORT_SYMBOL(clocksource_register); @@ -425,7 +427,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - next_clocksource = select_clocksource(); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } @@ -438,9 +440,7 @@ void clocksource_unregister(struct clocksource *cs) spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } @@ -478,9 +478,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct clocksource *ovr = NULL; size_t ret = count; - int len; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) @@ -495,37 +493,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; - - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* - * Check to make sure we don't switch to a non-highres capable - * clocksource if the tick code is in oneshot mode (highres or nohz) - */ - if (tick_oneshot_mode_active() && ovr && - !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { - printk(KERN_WARNING "%s clocksource is not HRT compatible. 
" - "Cannot switch while in HRT/NOHZ mode\n", ovr->name); - ovr = NULL; - override_name[0] = 0; - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } + clocksource_select(); spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c3f6c30816e..5404a845690 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; @@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void) } core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b5673016089..325a9b63265 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -269,7 +269,7 @@ static void change_clocksource(void) new = clocksource_get_next(); - if (clock == new) + if (!new || clock == new) return; clocksource_forward_now(); @@ -446,7 +446,7 @@ void __init timekeeping_init(void) ntp_init(); - clock = clocksource_get_next(); + clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); /* set mult_orig on enable */ -- cgit v1.2.3-70-g09d2 From 8cf4e750f8459d51c2e8a035a201da4bf7aa996a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:22 +0200 Subject: clocksource: Delay clocksource watchdog highres enablement The clocksource watchdog marks a clock as highres capable before it checked the deviation from the watchdog clocksource even for a single time. Make sure that the deviation is at least checked once before doing the switch to highres mode. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.627795883@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e91662e87cd..76256c5aecb 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -153,11 +153,8 @@ static unsigned long watchdog_resumed; #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +static void clocksource_unstable(struct clocksource *cs, int64_t delta) { - if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) - return; - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); @@ -183,31 +180,31 @@ static void clocksource_watchdog(unsigned long data) list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { csnow = cs->read(cs); - if (unlikely(resumed)) { + /* Clocksource initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = csnow; continue; } - /* Initialized ? 
*/ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { - if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as - * highres-capable, notify the rest of the - * system as well so that we transition - * into high-res mode: - */ - tick_clock_notify(); - } - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = csnow; - } else { - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); - cs->wd_last = csnow; - /* Check the delta. Might remove from the list ! */ - clocksource_ratewd(cs, cs_nsec - wd_nsec); + /* Check the deviation from the watchdog clocksource. */ + cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs->wd_last = csnow; + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + clocksource_unstable(cs, cs_nsec - wd_nsec); + continue; + } + + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as highres-capable, + * notify the rest of the system as well so that we + * transition into high-res mode: + */ + tick_clock_notify(); } } -- cgit v1.2.3-70-g09d2 From 0f8e8ef7c204988246da5a42d576b7fa5277a8e4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:23 +0200 Subject: clocksource: Simplify clocksource watchdog resume logic To resume the clocksource watchdog just remove the CLOCK_SOURCE_WATCHDOG bit from the watched clocksource. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.880925790@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 76256c5aecb..89a7b91bfbd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,7 +145,6 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static unsigned long watchdog_resumed; /* * Interval: 0.5sec Threshold: 0.0625s @@ -167,12 +166,9 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs, *tmp; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int resumed; spin_lock(&watchdog_lock); - resumed = test_and_clear_bit(0, &watchdog_resumed); - wdnow = watchdog->read(watchdog); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; @@ -223,14 +219,26 @@ static void clocksource_watchdog(unsigned long data) } spin_unlock(&watchdog_lock); } + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + static void clocksource_resume_watchdog(void) { - set_bit(0, &watchdog_resumed); + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_reset_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_check_watchdog(struct clocksource *cs) { - struct clocksource *cse; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); @@ -256,8 +264,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ - list_for_each_entry(cse, 
&watchdog_list, wd_list) - cse->flags &= ~CLOCK_SOURCE_WATCHDOG; + clocksource_reset_watchdog(); /* Start if list is not empty */ if (!list_empty(&watchdog_list)) { watchdog_last = watchdog->read(watchdog); -- cgit v1.2.3-70-g09d2 From fb63a0ebe615fba9de8c75ea44ded999d1e24c65 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:24 +0200 Subject: clocksource: Refactor clocksource watchdog Refactor clocksource watchdog code to make it more readable. Add clocksource_dequeue_watchdog to remove a clocksource from the watchdog list when it is unregistered. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.110881699@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 97 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 89a7b91bfbd..56aaa749645 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,6 +145,7 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; +static int watchdog_running; /* * Interval: 0.5sec Threshold: 0.0625s @@ -168,6 +169,8 @@ static void clocksource_watchdog(unsigned long data) int64_t wd_nsec, cs_nsec; spin_lock(&watchdog_lock); + if (!watchdog_running) + goto out; wdnow = watchdog->read(watchdog); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); @@ -217,9 +220,30 @@ static void clocksource_watchdog(unsigned long data) watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); } +out: spin_unlock(&watchdog_lock); } +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + watchdog_last = watchdog->read(watchdog); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + static inline void clocksource_reset_watchdog(void) { struct clocksource *cs; @@ -237,55 +261,70 @@ static void clocksource_resume_watchdog(void) spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_check_watchdog(struct clocksource *cs) +static void clocksource_enqueue_watchdog(struct clocksource *cs) { unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - int started = !list_empty(&watchdog_list); - + /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); - if (!started && watchdog) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; } else { + /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - + /* Pick the best watchdog. 
*/ if (!watchdog || cs->rating > watchdog->rating) { - if (watchdog) - del_timer(&watchdog_timer); watchdog = cs; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - /* Reset watchdog cycles */ clocksource_reset_watchdog(); - /* Start if list is not empty */ - if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = - jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } } } + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -#else -static void clocksource_check_watchdog(struct clocksource *cs) + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + struct clocksource *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + } else if (cs == watchdog) { + /* Reset watchdog cycles */ + clocksource_reset_watchdog(); + /* Current watchdog is removed. Find an alternative. */ + watchdog = NULL; + list_for_each_entry(tmp, &clocksource_list, list) { + if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + if (!watchdog || tmp->rating > watchdog->rating) + watchdog = tmp; + } + } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -#endif + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** * clocksource_resume - resume the clocksource(s) @@ -414,14 +453,13 @@ int clocksource_register(struct clocksource *cs) clocksource_enqueue(cs); clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); - clocksource_check_watchdog(cs); + clocksource_enqueue_watchdog(cs); return 0; } EXPORT_SYMBOL(clocksource_register); /** * clocksource_change_rating - Change the rating of a registered clocksource - * */ void clocksource_change_rating(struct clocksource *cs, int rating) { @@ -434,6 +472,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } +EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource @@ -442,11 +481,13 @@ void clocksource_unregister(struct clocksource *cs) { unsigned long flags; + clocksource_dequeue_watchdog(cs); spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } +EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** -- cgit v1.2.3-70-g09d2 From c55c87c892c1875deace0c8fc28787335277fdf2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:25 +0200 Subject: clocksource: Move watchdog downgrade to a work queue thread Move the downgrade of an unstable clocksource from the timer interrupt context into the process context of a work queue thread. This is needed to be able to do the clocksource switch with stop_machine. 
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.354926067@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 56 +++++++++++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f263b3abf46..19ad43af62d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -213,6 +213,7 @@ extern struct clocksource *clock; /* current clocksource */ #define CLOCK_SOURCE_WATCHDOG 0x10 #define CLOCK_SOURCE_VALID_FOR_HRES 0x20 +#define CLOCK_SOURCE_UNSTABLE 0x40 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56aaa749645..f1508019bfb 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -143,10 +143,13 @@ fs_initcall(clocksource_done_booting); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; +static struct work_struct watchdog_work; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; static int watchdog_running; +static void clocksource_watchdog_work(struct work_struct *work); + /* * Interval: 0.5sec Threshold: 0.0625s */ @@ -158,15 +161,16 @@ static void clocksource_unstable(struct clocksource *cs, int64_t delta) printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + schedule_work(&watchdog_work); } static void clocksource_watchdog(unsigned long data) { - struct clocksource *cs, *tmp; + struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; + int next_cpu; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -176,7 +180,12 @@ static void clocksource_watchdog(unsigned long data) wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + list_for_each_entry(cs, &watchdog_list, wd_list) { + + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) + continue; + csnow = cs->read(cs); /* Clocksource initialized ? */ @@ -207,19 +216,15 @@ static void clocksource_watchdog(unsigned long data) } } - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = cpumask_next(raw_smp_processor_id(), - cpu_online_mask); - - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); - } + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. 
+ */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); out: spin_unlock(&watchdog_lock); } @@ -228,6 +233,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; + INIT_WORK(&watchdog_work, clocksource_watchdog_work); init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; watchdog_last = watchdog->read(watchdog); @@ -313,6 +319,22 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } +static void clocksource_watchdog_work(struct work_struct *work) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + clocksource_change_rating(cs, 0); + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock(&watchdog_lock); +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) -- cgit v1.2.3-70-g09d2 From 155ec60226ae0ae2aadaa57c951a58a359331030 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:26 +0200 Subject: timekeeping: Introduce struct timekeeper Add struct timekeeper to keep the internal values timekeeping.c needs in regard to the currently selected clock source. This moves the timekeeping intervals, xtime_nsec and the ntp error value from struct clocksource to struct timekeeper. The raw_time is removed from the clocksource as well. It gets treated like xtime as a global variable. Eventually xtime raw_time should be moved to struct timekeeper. [ tglx: minor cleanup ] Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.613209842@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 1 - include/linux/clocksource.h | 54 +--------- kernel/time/clocksource.c | 6 +- kernel/time/timekeeping.c | 235 +++++++++++++++++++++++++++++--------------- 4 files changed, 164 insertions(+), 132 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index afefe514df0..e76c2e7a8b9 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -280,7 +280,6 @@ void __init time_init(void) now = get_clock(); tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); clocksource_tod.cycle_last = now; - clocksource_tod.raw_time = xtime; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 19ad43af62d..e12e3095e2f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -155,8 +155,6 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary - * @cycle_interval: Used internally by timekeeping core, please ignore. - * @xtime_interval: Used internally by timekeeping core, please ignore. 
*/ struct clocksource { /* @@ -182,19 +180,12 @@ struct clocksource { #define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0) #endif - /* timekeeping specific data, ignore */ - cycle_t cycle_interval; - u64 xtime_interval; - u32 raw_interval; /* * Second part is written at each timer interrupt * Keep it in a different cache line to dirty no * more than one cache line. */ cycle_t cycle_last ____cacheline_aligned_in_smp; - u64 xtime_nsec; - s64 error; - struct timespec raw_time; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ @@ -203,8 +194,6 @@ struct clocksource { #endif }; -extern struct clocksource *clock; /* current clocksource */ - /* * Clock source flags bits:: */ @@ -270,50 +259,15 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) } /** - * cyc2ns - converts clocksource cycles to nanoseconds - * @cs: Pointer to clocksource - * @cycles: Cycles + * clocksource_cyc2ns - converts clocksource cycles to nanoseconds * - * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. + * Converts cycles to nanoseconds, using the given mult and shift. * * XXX - This could use some mult_lxl_ll() asm optimization */ -static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) +static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) { - u64 ret = (u64)cycles; - ret = (ret * cs->mult) >> cs->shift; - return ret; -} - -/** - * clocksource_calculate_interval - Calculates a clocksource interval struct - * - * @c: Pointer to clocksource. - * @length_nsec: Desired interval length in nanoseconds. - * - * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment - * pair and interval request. - * - * Unless you're the timekeeping code, you should not be using this! - */ -static inline void clocksource_calculate_interval(struct clocksource *c, - unsigned long length_nsec) -{ - u64 tmp; - - /* Do the ns -> cycle conversion first, using original mult */ - tmp = length_nsec; - tmp <<= c->shift; - tmp += c->mult_orig/2; - do_div(tmp, c->mult_orig); - - c->cycle_interval = (cycle_t)tmp; - if (c->cycle_interval == 0) - c->cycle_interval = 1; - - /* Go back from cycles -> shifted ns, this time use ntp adjused mult */ - c->xtime_interval = (u64)c->cycle_interval * c->mult; - c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift; + return ((u64) cycles * mult) >> shift; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f1508019bfb..f18c9a6bdcf 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -177,7 +177,8 @@ static void clocksource_watchdog(unsigned long data) goto out; wdnow = watchdog->read(watchdog); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, + watchdog->mult, watchdog->shift); watchdog_last = wdnow; list_for_each_entry(cs, &watchdog_list, wd_list) { @@ -196,7 +197,8 @@ static void clocksource_watchdog(unsigned long data) } /* Check the deviation from the watchdog clocksource. 
*/ - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + cs->mask, cs->mult, cs->shift); cs->wd_last = csnow; if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { clocksource_unstable(cs, cs_nsec - wd_nsec); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 325a9b63265..7af45cbf6b1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -19,6 +19,65 @@ #include #include +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nano seconds in one NTP interval. */ + u64 xtime_interval; + /* Raw nano seconds accumulated per NTP interval. */ + u32 raw_interval; + + /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nano seconds. */ + s64 ntp_error; +}; + +struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. + * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + tmp += clock->mult_orig/2; + do_div(tmp, clock->mult_orig); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.raw_interval = + ((u64) interval * clock->mult_orig) >> clock->shift; + + timekeeper.xtime_nsec = 0; + + timekeeper.ntp_error = 0; +} /* * This read-write spinlock protects us from races in SMP while @@ -46,6 +105,11 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. + */ +struct timespec raw_time; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -56,42 +120,42 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -struct clocksource *clock; - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } #ifdef CONFIG_GENERIC_TIME /** - * clocksource_forward_now - update clock to the current time + * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. 
*/ -static void clocksource_forward_now(void) +static void timekeeping_forward_now(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; s64 nsec; + clock = timekeeper.clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = cyc2ns(clock, cycle_delta); + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - clock->raw_time.tv_nsec += nsec; + nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift); + timespec_add_ns(&raw_time, nsec); } /** @@ -103,6 +167,7 @@ static void clocksource_forward_now(void) void getnstimeofday(struct timespec *ts) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; unsigned long seq; s64 nsecs; @@ -114,13 +179,15 @@ void getnstimeofday(struct timespec *ts) *ts = xtime; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -135,6 +202,7 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; unsigned int seq; s64 secs, nsecs; @@ -146,13 +214,15 @@ ktime_t ktime_get(void) nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs += cyc2ns(clock, cycle_delta); + nsecs += clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); } while (read_seqretry(&xtime_lock, seq)); /* @@ -174,6 +244,7 @@ EXPORT_SYMBOL_GPL(ktime_get); void ktime_get_ts(struct timespec *ts) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; struct timespec tomono; unsigned int seq; s64 nsecs; @@ -186,13 +257,15 @@ void ktime_get_ts(struct timespec *ts) tomono = wall_to_monotonic; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); } while (read_seqretry(&xtime_lock, seq)); @@ -233,7 +306,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; @@ -243,10 +316,10 @@ int do_settimeofday(struct timespec *tv) update_xtime_cache(0); - clock->error = 0; + timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -269,10 +342,10 @@ static void change_clocksource(void) new = clocksource_get_next(); - if (!new || clock == new) + if (!new || timekeeper.clock == new) return; - clocksource_forward_now(); + timekeeping_forward_now(); if (new->enable && 
!new->enable(new)) return; @@ -284,9 +357,9 @@ static void change_clocksource(void) */ new->mult_orig = new->mult; - new->raw_time = clock->raw_time; - old = clock; - clock = new; + old = timekeeper.clock; + timekeeper_setup_internals(new); + /* * Save mult_orig in mult so that the value can be restored * regardless if ->enable() updates the value of mult or not. @@ -295,22 +368,10 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = clock->read(clock); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - tick_clock_notify(); - - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ } #else /* GENERIC_TIME */ -static inline void clocksource_forward_now(void) { } +static inline void timekeeping_forward_now(void) { } static inline void change_clocksource(void) { } /** @@ -380,20 +441,23 @@ void getrawmonotonic(struct timespec *ts) unsigned long seq; s64 nsecs; cycle_t cycle_now, cycle_delta; + struct clocksource *clock; do { seq = read_seqbegin(&xtime_lock); /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig, + clock->shift); - *ts = clock->raw_time; + *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -413,7 +477,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); @@ -439,6 +503,7 @@ unsigned long __attribute__((weak)) read_persistent_clock(void) */ void __init timekeeping_init(void) { + struct clocksource *clock; unsigned long flags; unsigned long sec = read_persistent_clock(); @@ -451,11 +516,13 @@ void __init timekeeping_init(void) clock->enable(clock); /* set mult_orig on enable */ clock->mult_orig = clock->mult; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clock->read(clock); + + timekeeper_setup_internals(clock); xtime.tv_sec = sec; xtime.tv_nsec = 0; + raw_time.tv_sec = 0; + raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); @@ -492,8 +559,8 @@ static int timekeeping_resume(struct sys_device *dev) } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = clock->read(clock); - clock->error = 0; + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -514,7 +581,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -549,7 +616,7 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. 
*/ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -565,7 +632,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -574,8 +641,9 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; + tick_error = tick_length >> + (NTP_SCALE_SHIFT - timekeeper.clock->shift + 1); + tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -600,18 +668,19 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(s64 offset) +static void timekeeping_adjust(s64 offset) { - s64 error, interval = clock->cycle_interval; + s64 error, interval = timekeeper.cycle_interval; int adj; - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); + error = timekeeper.ntp_error >> + (NTP_SCALE_SHIFT - timekeeper.clock->shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { error >>= 2; if (likely(error >= -interval)) { @@ -619,15 +688,15 @@ static void clocksource_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else return; - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.clock->mult += adj; + timekeeper.xtime_interval += interval; + timekeeper.xtime_nsec -= offset; + timekeeper.ntp_error -= (interval - offset) << + (NTP_SCALE_SHIFT - timekeeper.clock->shift); } /** @@ -637,53 +706,59 @@ static void clocksource_adjust(s64 offset) */ void update_wall_time(void) { + struct clocksource *clock; cycle_t offset; + s64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; + clock = timekeeper.clock; #ifdef CONFIG_GENERIC_TIME offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + offset = timekeeper.cycle_interval; #endif - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. 
*/ - while (offset >= clock->cycle_interval) { + while (offset >= timekeeper.cycle_interval) { + u64 nsecps = (u64)NSEC_PER_SEC << clock->shift; + /* accumulate one interval */ - offset -= clock->cycle_interval; - clock->cycle_last += clock->cycle_interval; + offset -= timekeeper.cycle_interval; + clock->cycle_last += timekeeper.cycle_interval; - clock->xtime_nsec += clock->xtime_interval; - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + timekeeper.xtime_nsec += timekeeper.xtime_interval; + if (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; xtime.tv_sec++; second_overflow(); } - clock->raw_time.tv_nsec += clock->raw_interval; - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { - clock->raw_time.tv_nsec -= NSEC_PER_SEC; - clock->raw_time.tv_sec++; + raw_time.tv_nsec += timekeeper.raw_interval; + if (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; } /* accumulate error between NTP and clock interval */ - clock->error += tick_length; - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error += tick_length; + timekeeper.ntp_error -= timekeeper.xtime_interval << + (NTP_SCALE_SHIFT - clock->shift); } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + timekeeping_adjust(offset); /* * Since in the loop above, we accumulate any amount of time * in xtime_nsec over a second into xtime.tv_sec, its possible for * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in clocksource_adjust(), + * slightly speeding the clocksource up in timekeeping_adjust(), * its possible the required corrective factor to xtime_nsec could * cause it to underflow. * @@ -695,24 +770,26 @@ void update_wall_time(void) * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)clock->xtime_nsec < 0)) { - s64 neg = -(s64)clock->xtime_nsec; - clock->xtime_nsec = 0; - clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << (NTP_SCALE_SHIFT - clock->shift); } /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); + xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> clock->shift) + 1; + timekeeper.xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + (NTP_SCALE_SHIFT - clock->shift); - update_xtime_cache(cyc2ns(clock, offset)); + nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift); + update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ change_clocksource(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } /** -- cgit v1.2.3-70-g09d2 From 23ce72117c714baab794e66c8daf343bf6a912bf Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:27 +0200 Subject: timekeeping: Add xtime_shift and ntp_error_shift to struct timekeeper The xtime_nsec value in the timekeeper structure is shifted by a few bits to improve precision. This happens to be the same value as the clock->shift. 
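As a rough illustration of the shifted-nanosecond bookkeeping just described (a standalone sketch with invented mult/shift values, not code from any of these patches):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t mult  = 4194304;   /* invented cycle->ns multiplier (2^22, so 1 cycle == 1 ns) */
            uint32_t shift = 22;        /* invented clocksource shift */
            uint64_t cycles = 1000000;  /* cycles accumulated over one interval */

            /* Like xtime_nsec/xtime_interval: keep nanoseconds left-shifted by
             * 'shift' so sub-nanosecond remainders survive from tick to tick. */
            uint64_t shifted_ns = cycles * mult;

            /* Like clocksource_cyc2ns(): drop the shift to get plain nanoseconds. */
            uint64_t ns = shifted_ns >> shift;

            printf("shifted: %llu  plain ns: %llu\n",
                   (unsigned long long)shifted_ns, (unsigned long long)ns);
            return 0;
    }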
To improve readability add xtime_shift to the timekeeper and use it instead of the clock->shift. Likewise add ntp_error_shift and replace all (NTP_SCALE_SHIFT - clock->shift) expressions. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.871899606@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7af45cbf6b1..dfdab1cefe1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,8 @@ struct timekeeper { /* Current clocksource used for timekeeping. */ struct clocksource *clock; + /* The shift value of the current clocksource. */ + int shift; /* Number of clock cycles in one NTP interval. */ cycle_t cycle_interval; @@ -36,6 +38,9 @@ struct timekeeper { /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ s64 ntp_error; + /* Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. */ + int ntp_error_shift; }; struct timekeeper timekeeper; @@ -75,8 +80,10 @@ static void timekeeper_setup_internals(struct clocksource *clock) ((u64) interval * clock->mult_orig) >> clock->shift; timekeeper.xtime_nsec = 0; + timekeeper.shift = clock->shift; timekeeper.ntp_error = 0; + timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; } /* @@ -641,8 +648,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> - (NTP_SCALE_SHIFT - timekeeper.clock->shift + 1); + tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -673,8 +679,7 @@ static void timekeeping_adjust(s64 offset) s64 error, interval = timekeeper.cycle_interval; int adj; - error = timekeeper.ntp_error >> - (NTP_SCALE_SHIFT - timekeeper.clock->shift - 1); + error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) @@ -696,7 +701,7 @@ static void timekeeping_adjust(s64 offset) timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; timekeeper.ntp_error -= (interval - offset) << - (NTP_SCALE_SHIFT - timekeeper.clock->shift); + timekeeper.ntp_error_shift; } /** @@ -708,7 +713,7 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - s64 nsecs; + u64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -720,13 +725,13 @@ void update_wall_time(void) #else offset = timekeeper.cycle_interval; #endif - timekeeper.xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. 
*/ while (offset >= timekeeper.cycle_interval) { - u64 nsecps = (u64)NSEC_PER_SEC << clock->shift; + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; /* accumulate one interval */ offset -= timekeeper.cycle_interval; @@ -748,7 +753,7 @@ void update_wall_time(void) /* accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length; timekeeper.ntp_error -= timekeeper.xtime_interval << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error_shift; } /* correct the clock when NTP error is too big */ @@ -773,16 +778,16 @@ void update_wall_time(void) if (unlikely((s64)timekeeper.xtime_nsec < 0)) { s64 neg = -(s64)timekeeper.xtime_nsec; timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> clock->shift) + 1; - timekeeper.xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - (NTP_SCALE_SHIFT - clock->shift); + xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + timekeeper.ntp_error_shift; nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift); update_xtime_cache(nsecs); -- cgit v1.2.3-70-g09d2 From 0a54419836254a27baecd9037103171bcbabaf67 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:28 +0200 Subject: timekeeping: Move NTP adjusted clock multiplier to struct timekeeper The clocksource structure has two multipliers, the unmodified multiplier clock->mult_orig and the NTP corrected multiplier clock->mult. The NTP multiplier is misplaced in the struct clocksource, this is private information of the timekeeping code. Add the mult field to the struct timekeeper to contain the NTP corrected value, keep the unmodifed multiplier in clock->mult and remove clock->mult_orig. 
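To make the split concrete, here is a compacted sketch of how the two multipliers are meant to be used; the toy_* names are illustrative stand-ins, only the division of labour mirrors the patch:

    #include <stdint.h>

    struct toy_clocksource {
            uint32_t mult;      /* calibrated cycle->ns multiplier, never touched by NTP */
            uint32_t shift;
    };

    struct toy_timekeeper {
            struct toy_clocksource *clock;
            uint32_t mult;      /* private NTP-adjusted copy used for xtime/monotonic */
    };

    static void toy_setup(struct toy_timekeeper *tk, struct toy_clocksource *cs)
    {
            tk->clock = cs;
            tk->mult  = cs->mult;       /* start from the calibrated frequency */
    }

    static void toy_ntp_adjust(struct toy_timekeeper *tk, int adj)
    {
            tk->mult += adj;            /* steer wall/monotonic time slightly... */
            /* ...while cs->mult stays pristine, so the raw clock and any later
             * clocksource re-selection still see the unmodified frequency. */
    }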
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134810.149047645@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/arm/plat-omap/common.c | 7 ++---- include/linux/clocksource.h | 4 +--- kernel/time/timekeeping.c | 53 ++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c index ebcf006406f..95587b6c025 100644 --- a/arch/arm/plat-omap/common.c +++ b/arch/arm/plat-omap/common.c @@ -253,11 +253,8 @@ static struct clocksource clocksource_32k = { */ unsigned long long sched_clock(void) { - unsigned long long ret; - - ret = (unsigned long long)clocksource_32k.read(&clocksource_32k); - ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift; - return ret; + return clocksource_cyc2ns(clocksource_32k.read(&clocksource_32k), + clocksource_32k.mult, clocksource_32k.shift); } static int __init omap_init_clocksource_32k(void) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e12e3095e2f..e34015effeb 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -149,8 +149,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @disable: optional function to disable the clocksource * @mask: bitmask for two's complement * subtraction of non 64 bit counters - * @mult: cycle to nanosecond multiplier (adjusted by NTP) - * @mult_orig: cycle to nanosecond multiplier (unadjusted by NTP) + * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) * @flags: flags describing special properties * @vread: vsyscall based read @@ -168,7 +167,6 @@ struct clocksource { void (*disable)(struct clocksource *cs); cycle_t mask; u32 mult; - u32 mult_orig; u32 shift; unsigned long flags; cycle_t (*vread)(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dfdab1cefe1..f4056f6c263 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,6 +41,8 @@ struct timekeeper { /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ int ntp_error_shift; + /* NTP adjusted clock multiplier */ + u32 mult; }; struct timekeeper timekeeper; @@ -66,8 +68,8 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; - tmp += clock->mult_orig/2; - do_div(tmp, clock->mult_orig); + tmp += clock->mult/2; + do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; @@ -77,13 +79,20 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Go back from cycles -> shifted ns */ timekeeper.xtime_interval = (u64) interval * clock->mult; timekeeper.raw_interval = - ((u64) interval * clock->mult_orig) >> clock->shift; + ((u64) interval * clock->mult) >> clock->shift; timekeeper.xtime_nsec = 0; timekeeper.shift = clock->shift; timekeeper.ntp_error = 0; timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. 
+ */ + timekeeper.mult = clock->mult; } /* @@ -154,14 +163,15 @@ static void timekeeping_forward_now(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift); + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); timespec_add_ns(&raw_time, nsec); } @@ -193,8 +203,8 @@ void getnstimeofday(struct timespec *ts) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); + nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -228,8 +238,8 @@ ktime_t ktime_get(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs += clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); + nsecs += clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); } while (read_seqretry(&xtime_lock, seq)); /* @@ -271,8 +281,8 @@ void ktime_get_ts(struct timespec *ts) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); + nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); } while (read_seqretry(&xtime_lock, seq)); @@ -356,22 +366,10 @@ static void change_clocksource(void) if (new->enable && !new->enable(new)) return; - /* - * The frequency may have changed while the clocksource - * was disabled. If so the code in ->enable() must update - * the mult value to reflect the new frequency. Make sure - * mult_orig follows this change. - */ - new->mult_orig = new->mult; old = timekeeper.clock; timekeeper_setup_internals(new); - /* - * Save mult_orig in mult so that the value can be restored - * regardless if ->enable() updates the value of mult or not. 
- */ - old->mult = old->mult_orig; if (old->disable) old->disable(old); @@ -461,7 +459,7 @@ void getrawmonotonic(struct timespec *ts) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig, + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); *ts = raw_time; @@ -521,9 +519,6 @@ void __init timekeeping_init(void) clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - /* set mult_orig on enable */ - clock->mult_orig = clock->mult; - timekeeper_setup_internals(clock); xtime.tv_sec = sec; @@ -697,7 +692,7 @@ static void timekeeping_adjust(s64 offset) } else return; - timekeeper.clock->mult += adj; + timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; timekeeper.ntp_error -= (interval - offset) << @@ -789,7 +784,7 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift); + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ -- cgit v1.2.3-70-g09d2 From 2ba2a3054fdffc8e6452f4ee120760322a6fbd43 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:29 +0200 Subject: timekeeping: Add timekeeper read_clock helper functions Add timekeeper_read_clock_ntp and timekeeper_read_clock_raw and use them for getnstimeofday, ktime_get, ktime_get_ts and getrawmonotonic. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134810.435105711@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 91 ++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f4056f6c263..27ae01b596b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -95,6 +95,40 @@ static void timekeeper_setup_internals(struct clocksource *clock) timekeeper.mult = clock->mult; } +/* Timekeeper helper functions. */ +static inline s64 timekeeping_get_ns(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); +} + +static inline s64 timekeeping_get_ns_raw(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +} + /* * This read-write spinlock protects us from races in SMP while * playing with xtime. 
@@ -183,8 +217,6 @@ static void timekeeping_forward_now(void) */ void getnstimeofday(struct timespec *ts) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; unsigned long seq; s64 nsecs; @@ -194,17 +226,7 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - - /* read clocksource: */ - clock = timekeeper.clock; - cycle_now = clock->read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -218,8 +240,6 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; unsigned int seq; s64 secs, nsecs; @@ -229,17 +249,7 @@ ktime_t ktime_get(void) seq = read_seqbegin(&xtime_lock); secs = xtime.tv_sec + wall_to_monotonic.tv_sec; nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; - - /* read clocksource: */ - clock = timekeeper.clock; - cycle_now = clock->read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs += clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsecs += timekeeping_get_ns(); } while (read_seqretry(&xtime_lock, seq)); /* @@ -260,8 +270,6 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts(struct timespec *ts) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; struct timespec tomono; unsigned int seq; s64 nsecs; @@ -272,17 +280,7 @@ void ktime_get_ts(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; tomono = wall_to_monotonic; - - /* read clocksource: */ - clock = timekeeper.clock; - cycle_now = clock->read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsecs = timekeeping_get_ns(); } while (read_seqretry(&xtime_lock, seq)); @@ -445,23 +443,10 @@ void getrawmonotonic(struct timespec *ts) { unsigned long seq; s64 nsecs; - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; do { seq = read_seqbegin(&xtime_lock); - - /* read clocksource: */ - clock = timekeeper.clock; - cycle_now = clock->read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); - + nsecs = timekeeping_get_ns_raw(); *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); -- cgit v1.2.3-70-g09d2 From 75c5158f70c065b9704b924503d96e8297838f79 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:30 +0200 Subject: timekeeping: Update clocksource with stop_machine update_wall_time calls change_clocksource HZ times per second to check if a new clock source is available. In close to 100% of all calls there is no new clock. Replace the tick based check by an update done with stop_machine. 
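Condensing the control flow this patch establishes (a summary of the diff that follows, not new code):

    /*
     * clocksource_register() / clocksource_change_rating() / sysfs override
     *     -> clocksource_select()               (under clocksource_mutex)
     *         -> timekeeping_notify(best)       (only if the selection changed)
     *             -> stop_machine(change_clocksource, best, NULL)
     *                    change_clocksource():
     *                        timekeeping_forward_now();
     *                        new->enable(); timekeeper_setup_internals(new);
     *                        old->disable();
     *             -> tick_clock_notify()
     *
     * update_wall_time() no longer polls for a new clocksource every tick.
     */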
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134810.711836357@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 + kernel/time/clocksource.c | 112 +++++++++++++++++--------------------------- kernel/time/timekeeping.c | 41 ++++++++++------ 3 files changed, 72 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e34015effeb..9ea40ff26f0 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -291,4 +291,6 @@ static inline void update_vsyscall_tz(void) } #endif +extern void timekeeping_notify(struct clocksource *clock); + #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f18c9a6bdcf..a1657b5fdeb 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -109,35 +109,17 @@ EXPORT_SYMBOL(timecounter_cyc2time); /*[Clocksource internal variables]--------- * curr_clocksource: * currently selected clocksource. - * next_clocksource: - * pending next selected clocksource. * clocksource_list: * linked list with the registered clocksources - * clocksource_lock: - * protects manipulations to curr_clocksource and next_clocksource - * and the clocksource_list + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list * override_name: * Name of the user-specified clocksource. */ static struct clocksource *curr_clocksource; -static struct clocksource *next_clocksource; static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); +static DEFINE_MUTEX(clocksource_mutex); static char override_name[32]; -static int finished_booting; - -/* clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. - */ -static int __init clocksource_done_booting(void) -{ - finished_booting = 1; - return 0; -} -fs_initcall(clocksource_done_booting); #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static LIST_HEAD(watchdog_list); @@ -356,18 +338,16 @@ static inline void clocksource_resume_watchdog(void) { } void clocksource_resume(void) { struct clocksource *cs; - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) { + list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); - } clocksource_resume_watchdog(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } /** @@ -383,28 +363,13 @@ void clocksource_touch_watchdog(void) } #ifdef CONFIG_GENERIC_TIME -/** - * clocksource_get_next - Returns the selected clocksource - * - */ -struct clocksource *clocksource_get_next(void) -{ - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); - if (next_clocksource && finished_booting) { - curr_clocksource = next_clocksource; - next_clocksource = NULL; - } - spin_unlock_irqrestore(&clocksource_lock, flags); - - return curr_clocksource; -} +static int finished_booting; /** * clocksource_select - Select the best clocksource available * - * Private function. Must hold clocksource_lock when called. + * Private function. Must hold clocksource_mutex when called. * * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. 
@@ -413,7 +378,7 @@ static void clocksource_select(void) { struct clocksource *best, *cs; - if (list_empty(&clocksource_list)) + if (!finished_booting || list_empty(&clocksource_list)) return; /* First clocksource on the list has the best rating. */ best = list_first_entry(&clocksource_list, struct clocksource, list); @@ -438,13 +403,31 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) - next_clocksource = best; + if (curr_clocksource != best) { + printk(KERN_INFO "Switching to clocksource %s\n", best->name); + curr_clocksource = best; + timekeeping_notify(curr_clocksource); + } } +/* + * clocksource_done_booting - Called near the end of core bootup + * + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + clocksource_select(); + return 0; +} +fs_initcall(clocksource_done_booting); + #else /* CONFIG_GENERIC_TIME */ -static void clocksource_select(void) { } +static inline void clocksource_select(void) { } #endif @@ -471,13 +454,11 @@ static void clocksource_enqueue(struct clocksource *cs) */ int clocksource_register(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); - spin_unlock_irqrestore(&clocksource_lock, flags); clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); return 0; } EXPORT_SYMBOL(clocksource_register); @@ -487,14 +468,12 @@ EXPORT_SYMBOL(clocksource_register); */ void clocksource_change_rating(struct clocksource *cs, int rating) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); clocksource_select(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -503,13 +482,11 @@ EXPORT_SYMBOL(clocksource_change_rating); */ void clocksource_unregister(struct clocksource *cs) { - unsigned long flags; - + mutex_lock(&clocksource_mutex); clocksource_dequeue_watchdog(cs); - spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); clocksource_select(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_unregister); @@ -527,9 +504,9 @@ sysfs_show_current_clocksources(struct sys_device *dev, { ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return count; } @@ -557,14 +534,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (buf[count-1] == '\n') count--; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; clocksource_select(); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return ret; } @@ -584,7 +561,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, struct clocksource *src; ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); list_for_each_entry(src, &clocksource_list, list) { /* * Don't show non-HRES clocksource if the tick code is @@ -596,7 +573,7 @@ 
sysfs_show_available_clocksources(struct sys_device *dev, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); @@ -651,11 +628,10 @@ device_initcall(init_clocksource_sysfs); */ static int __init boot_override_clocksource(char* str) { - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); if (str) strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 27ae01b596b..41579e7fcf9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Structure holding internal timekeeping values. */ struct timekeeper { @@ -179,6 +180,7 @@ void timekeeping_leap_insert(int leapsecond) } #ifdef CONFIG_GENERIC_TIME + /** * timekeeping_forward_now - update clock to the current time * @@ -351,31 +353,40 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static void change_clocksource(void) +static int change_clocksource(void *data) { struct clocksource *new, *old; - new = clocksource_get_next(); - - if (!new || timekeeper.clock == new) - return; + new = (struct clocksource *) data; timekeeping_forward_now(); + if (!new->enable || new->enable(new) == 0) { + old = timekeeper.clock; + timekeeper_setup_internals(new); + if (old->disable) + old->disable(old); + } + return 0; +} - if (new->enable && !new->enable(new)) +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +void timekeeping_notify(struct clocksource *clock) +{ + if (timekeeper.clock == clock) return; - - old = timekeeper.clock; - timekeeper_setup_internals(new); - - if (old->disable) - old->disable(old); - + stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); } + #else /* GENERIC_TIME */ + static inline void timekeeping_forward_now(void) { } -static inline void change_clocksource(void) { } /** * ktime_get - get the monotonic time in ktime_t format @@ -416,6 +427,7 @@ void ktime_get_ts(struct timespec *ts) ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); + #endif /* !GENERIC_TIME */ /** @@ -773,7 +785,6 @@ void update_wall_time(void) update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ - change_clocksource(); update_vsyscall(&xtime, timekeeper.clock); } -- cgit v1.2.3-70-g09d2 From d4f587c67fc39e0030ddd718675e252e208da4d7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:31 +0200 Subject: timekeeping: Increase granularity of read_persistent_clock() The persistent clock of some architectures (e.g. s390) have a better granularity than seconds. To reduce the delta between the host clock and the guest clock in a virtualized system change the read_persistent_clock function to return a struct timespec. 
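For an architecture whose persistent clock only keeps whole seconds, the conversion asked for here is mechanical; a sketch of a new-style implementation, where my_rtc_read_seconds() stands in for whatever RTC accessor the architecture really has:

    void read_persistent_clock(struct timespec *ts)
    {
            ts->tv_sec  = my_rtc_read_seconds();    /* hypothetical RTC helper */
            ts->tv_nsec = 0;                        /* no sub-second resolution available */
    }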
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.013873340@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/m68knommu/kernel/time.c | 5 ++-- arch/mips/dec/time.c | 5 ++-- arch/mips/lasat/ds1603.c | 5 ++-- arch/mips/lasat/sysctl.c | 8 ++++-- arch/mips/lemote/lm2e/setup.c | 5 ++-- arch/mips/mti-malta/malta-time.c | 5 ++-- arch/mips/pmc-sierra/yosemite/setup.c | 5 ++-- arch/mips/sibyte/swarm/setup.c | 15 +++++++--- arch/mips/sni/time.c | 5 ++-- arch/powerpc/kernel/time.c | 7 +++-- arch/s390/kernel/time.c | 22 +++------------ arch/sh/kernel/time.c | 6 ++-- arch/x86/kernel/rtc.c | 5 ++-- arch/xtensa/kernel/time.c | 5 ++-- include/linux/time.h | 2 +- kernel/time/timekeeping.c | 52 +++++++++++++++++++---------------- 16 files changed, 83 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d182b2f7221..68432248515 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -72,9 +72,10 @@ static unsigned long read_rtc_mmss(void) return mktime(year, mon, day, hour, min, sec);; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return read_rtc_mmss(); + ts->tv_sec = read_rtc_mmss(); + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c index 463136e6685..02f505f23c3 100644 --- a/arch/mips/dec/time.c +++ b/arch/mips/dec/time.c @@ -18,7 +18,7 @@ #include #include -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, mon, day, hour, min, sec, real_year; unsigned long flags; @@ -53,7 +53,8 @@ unsigned long read_persistent_clock(void) year += real_year - 72 + 2000; - return mktime(year, mon, day, hour, min, sec); + ts->tv_sec = mktime(year, mon, day, hour, min, sec); + ts->tv_nsec = 0; } /* diff --git a/arch/mips/lasat/ds1603.c b/arch/mips/lasat/ds1603.c index 52cb1436a12..c6fd96ff118 100644 --- a/arch/mips/lasat/ds1603.c +++ b/arch/mips/lasat/ds1603.c @@ -135,7 +135,7 @@ static void rtc_end_op(void) lasat_ndelay(1000); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long word; unsigned long flags; @@ -147,7 +147,8 @@ unsigned long read_persistent_clock(void) rtc_end_op(); spin_unlock_irqrestore(&rtc_lock, flags); - return word; + ts->tv_sec = word; + ts->tv_nsec = 0; } int rtc_mips_set_mmss(unsigned long time) diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 8f88886feb1..3f04d4c406b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -92,10 +92,12 @@ static int rtctmp; int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { + struct timespec ts; int r; if (!write) { - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; /* check for time < 0 and set to 0 */ if (rtctmp < 0) rtctmp = 0; @@ -134,9 +136,11 @@ int sysctl_lasat_rtc(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { + struct timespec ts; int r; - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; if (rtctmp < 0) rtctmp = 0; r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); diff --git a/arch/mips/lemote/lm2e/setup.c b/arch/mips/lemote/lm2e/setup.c index ebd6ceaef2f..24b355df612 100644 --- a/arch/mips/lemote/lm2e/setup.c +++ 
b/arch/mips/lemote/lm2e/setup.c @@ -54,9 +54,10 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_clock_freq / 2; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } void (*__wbflush)(void); diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 0b97d47691f..3c6f190aa61 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -100,9 +100,10 @@ static unsigned int __init estimate_cpu_frequency(void) return count; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } static void __init plat_perf_setup(void) diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c index 2d3c0dca275..3498ac9c35a 100644 --- a/arch/mips/pmc-sierra/yosemite/setup.c +++ b/arch/mips/pmc-sierra/yosemite/setup.c @@ -70,7 +70,7 @@ void __init bus_error_init(void) } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, month, day, hour, min, sec; unsigned long flags; @@ -92,7 +92,8 @@ unsigned long read_persistent_clock(void) m48t37_base->control = 0x00; spin_unlock_irqrestore(&rtc_lock, flags); - return mktime(year, month, day, hour, min, sec); + ts->tv_sec = mktime(year, month, day, hour, min, sec); + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long tim) diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 672e45d495a..623ffc933c4 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -87,19 +87,26 @@ enum swarm_rtc_type { enum swarm_rtc_type swarm_rtc_type; -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { + unsigned long sec; + switch (swarm_rtc_type) { case RTC_XICOR: - return xicor_get_time(); + sec = xicor_get_time(); + break; case RTC_M4LT81: - return m41t81_get_time(); + sec = m41t81_get_time(); + break; case RTC_NONE: default: - return mktime(2000, 1, 1, 0, 0, 0); + sec = mktime(2000, 1, 1, 0, 0, 0); + break; } + ts->tv_sec = sec; + tv->tv_nsec = 0; } int rtc_mips_set_time(unsigned long sec) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index 0d9ec1a5c24..62df6a598e0 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -182,7 +182,8 @@ void __init plat_time_init(void) setup_pit_timer(); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return -1; + ts->tv_sec = -1; + ts->tv_nsec = 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceea..ad63f30fe3d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -769,7 +769,7 @@ int update_persistent_clock(struct timespec now) return ppc_md.set_rtc_time(&tm); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { struct rtc_time tm; static int first = 1; @@ -787,8 +787,9 @@ unsigned long read_persistent_clock(void) if (!ppc_md.get_rtc_time) return 0; ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_nsec = 0; } /* clocksource code */ diff --git a/arch/s390/kernel/time.c 
b/arch/s390/kernel/time.c index e76c2e7a8b9..a94ec48587b 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -182,12 +182,9 @@ static void timing_alert_interrupt(__u16 code) static void etr_reset(void); static void stp_reset(void); -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec ts; - - tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, &ts); - return ts.tv_sec; + tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } static cycle_t read_tod_clock(struct clocksource *cs) @@ -248,7 +245,6 @@ void __init time_init(void) { struct timespec ts; unsigned long flags; - cycle_t now; /* Reset time synchronization interfaces. */ etr_reset(); @@ -266,20 +262,10 @@ void __init time_init(void) panic("Could not register TOD clock source"); /* - * The TOD clock is an accurate clock. The xtime should be - * initialized in a way that the difference between TOD and - * xtime is reasonably small. Too bad that timekeeping_init - * sets xtime.tv_nsec to zero. In addition the clock source - * change from the jiffies clock source to the TOD clock - * source add another error of up to 1/HZ second. The same - * function sets wall_to_monotonic to a value that is too - * small for /proc/uptime to be accurate. - * Reset xtime and wall_to_monotonic to sane values. + * Reset wall_to_monotonic to the initial timestamp created + * in head.S to get a precise value in /proc/uptime. */ write_seqlock_irqsave(&xtime_lock, flags); - now = get_clock(); - tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); - clocksource_tod.cycle_last = now; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 9b352a1e3fb..3f4706aa975 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -39,11 +39,9 @@ void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; #ifdef CONFIG_GENERIC_CMOS_UPDATE -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec tv; - rtc_sh_get_time(&tv); - return tv.tv_sec; + rtc_sh_get_time(&ts); } int update_persistent_clock(struct timespec now) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..bf67dcb4a44 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -178,7 +178,7 @@ static int set_rtc_mmss(unsigned long nowtime) } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; @@ -186,7 +186,8 @@ unsigned long read_persistent_clock(void) retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; + ts->tv_sec = retval; + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 8848120d291..19085ff0484 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -59,9 +59,8 @@ static struct irqaction timer_irqaction = { void __init time_init(void) { - xtime.tv_nsec = 0; - xtime.tv_sec = read_persistent_clock(); - + /* FIXME: xtime&wall_to_monotonic are set in timekeeping_init. 
*/ + read_persistent_clock(&xtime); set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); diff --git a/include/linux/time.h b/include/linux/time.h index e7c84455888..53a3216f0d1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -101,7 +101,7 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; -extern unsigned long read_persistent_clock(void); +extern void read_persistent_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 41579e7fcf9..f1a21ce491e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -154,7 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -487,17 +487,18 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -507,7 +508,9 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now; + + read_persistent_clock(&now); write_seqlock_irqsave(&xtime_lock, flags); @@ -518,19 +521,20 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. 
@@ -543,18 +547,19 @@ static unsigned long timekeeping_suspend_time; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ @@ -577,7 +582,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); @@ -801,9 +806,10 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime; + + boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time); + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -812,7 +818,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) -- cgit v1.2.3-70-g09d2 From 23970e389e9cee43c4b41023935e1417271708b2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:32 +0200 Subject: timekeeping: Introduce read_boot_clock Add the new function read_boot_clock to get the exact time the system has been started. For architectures without support for exact boot time a new weak function is added that returns 0. Use the exact boot time to initialize wall_to_monotonic, or xtime if the read_boot_clock returned 0. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.296703241@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 17 +++++------------ include/linux/time.h | 1 + kernel/time/timekeeping.c | 24 ++++++++++++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index a94ec48587b..6bff1a1d906 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -187,6 +187,11 @@ void read_persistent_clock(struct timespec *ts) tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } +void read_boot_clock(struct timespec *ts) +{ + tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts); +} + static cycle_t read_tod_clock(struct clocksource *cs) { return get_clock(); @@ -243,9 +248,6 @@ void update_vsyscall_tz(void) */ void __init time_init(void) { - struct timespec ts; - unsigned long flags; - /* Reset time synchronization interfaces. 
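A quick worked example of the arithmetic, with invented numbers and in kernel-style pseudocode (boot, xtime and wall_to_monotonic are the real variables from the diff below):

    boot.tv_sec = 1000000000;           /* read_boot_clock(): wall time at boot */
    set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec);

    /* one hour of uptime later: */
    xtime.tv_sec = 1000003600;
    /* xtime.tv_sec + wall_to_monotonic.tv_sec == 3600, which is exactly what
     * ktime_get()/ktime_get_ts() add up to form CLOCK_MONOTONIC. */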
*/ etr_reset(); stp_reset(); @@ -261,15 +263,6 @@ void __init time_init(void) if (clocksource_register(&clocksource_tod) != 0) panic("Could not register TOD clock source"); - /* - * Reset wall_to_monotonic to the initial timestamp created - * in head.S to get a precise value in /proc/uptime. - */ - write_seqlock_irqsave(&xtime_lock, flags); - tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); - set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); - /* Enable TOD clock interrupts on the boot cpu. */ init_cpu_timer(); diff --git a/include/linux/time.h b/include/linux/time.h index 53a3216f0d1..f505988398e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -102,6 +102,7 @@ extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; extern void read_persistent_clock(struct timespec *ts); +extern void read_boot_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f1a21ce491e..15e06defca5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -501,6 +501,21 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } +/** + * read_boot_clock - Return time of the system start. + * + * Weak dummy function for arches that do not yet support it. + * Function to read the exact time the system has been started. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __attribute__((weak)) read_boot_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ @@ -508,9 +523,10 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - struct timespec now; + struct timespec now, boot; read_persistent_clock(&now); + read_boot_clock(&boot); write_seqlock_irqsave(&xtime_lock, flags); @@ -525,8 +541,12 @@ void __init timekeeping_init(void) xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) { + boot.tv_sec = xtime.tv_sec; + boot.tv_nsec = xtime.tv_nsec; + } set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + -boot.tv_sec, -boot.tv_nsec); update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; -- cgit v1.2.3-70-g09d2 From 6ea41d252f35465a2308a4038a323b6b07de06f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Aug 2009 13:20:42 +0200 Subject: clocksource: Call clocksource_change_rating() outside of watchdog_lock The changes to the watchdog logic introduced a lock inversion between watchdog_lock and clocksource_mutex. Change the rating outside of watchdog_lock to avoid it. 
Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a1657b5fdeb..02dc22d888f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -307,16 +307,23 @@ static void clocksource_watchdog_work(struct work_struct *work) { struct clocksource *cs, *tmp; unsigned long flags; + LIST_HEAD(unstable); spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - clocksource_change_rating(cs, 0); + list_add(&cs->wd_list, &unstable); } /* Check if the watchdog timer needs to be stopped. */ clocksource_stop_watchdog(); - spin_unlock(&watchdog_lock); + spin_unlock_irqrestore(&watchdog_lock, flags); + + /* Needs to be done outside of watchdog lock */ + list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { + list_del_init(&cs->wd_list); + clocksource_change_rating(cs, 0); + } } #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ -- cgit v1.2.3-70-g09d2 From 27569620c748ec13f801b4683b448a2ac2adaae4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2009 09:53:46 -0700 Subject: rcu: Split hierarchical RCU initialization into boot-time and CPU-online pieces This patch divides the rcutree initialization into boot-time and hotplug-time components, so that the tree data structures are guaranteed to be fully linked at boot time regardless of what might happen in CPU hotplug operations. This makes RCU more resilient against CPU hotplug misbehavior (and vice versa), but more importantly, does a better job of compartmentalizing the code. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: josht@linux.vnet.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: hugh.dickins@tiscali.co.uk Cc: benh@kernel.crashing.org LKML-Reference: <1250355231152-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95c202..f3e43274ed5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1325,22 +1325,40 @@ int rcu_needs_cpu(int cpu) } /* - * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" - * approach so that we don't have to worry about how long the CPU has - * been gone, or whether it ever was online previously. We do trust the - * ->mynode field, as it is constant for a given struct rcu_data and - * initialized during early boot. - * - * Note that only one online or offline event can be happening at a given - * time. Note also that we can accept some slop in the rsp->completed - * access due to the fact that this CPU cannot possibly have any RCU - * callbacks in flight yet. + * Do boot-time initialization of a CPU's per-CPU RCU data. + */ +static void __init +rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + int i; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. 
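The fix follows a common pattern, visible in the hunk below: while watchdog_lock is held, the unstable clocksources are only moved onto a private list; the rating change, which needs clocksource_mutex, is done after the spinlock has been dropped, so the two locks are never taken in the inverted order. In outline (condensed from the diff, not new code):

    LIST_HEAD(unstable);

    spin_lock_irqsave(&watchdog_lock, flags);
    list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
            if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
                    list_del_init(&cs->wd_list);
                    list_add(&cs->wd_list, &unstable);      /* just park it for later */
            }
    clocksource_stop_watchdog();
    spin_unlock_irqrestore(&watchdog_lock, flags);

    /* clocksource_change_rating() takes clocksource_mutex; elsewhere that mutex
     * is held while watchdog_lock is acquired, so nesting it under watchdog_lock
     * here would be the ABBA inversion the changelog describes. */
    list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
            list_del_init(&cs->wd_list);
            clocksource_change_rating(cs, 0);
    }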
*/ + spin_lock_irqsave(&rnp->lock, flags); + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; +#ifdef CONFIG_NO_HZ + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); +#endif /* #ifdef CONFIG_NO_HZ */ + rdp->cpu = cpu; + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Initialize a CPU's per-CPU RCU data. Note that only one online or + * offline event can be happening at a given time. Note also that we + * can accept some slop in the rsp->completed access due to the fact + * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; @@ -1355,16 +1373,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->passed_quiesc_completed = lastcomp - 1; - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen = 0; rdp->blimit = blimit; -#ifdef CONFIG_NO_HZ - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ - rdp->cpu = cpu; spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* @@ -1539,8 +1548,12 @@ void __init __rcu_init(void) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_init_one(&rcu_state); RCU_DATA_PTR_INIT(&rcu_state, rcu_data); + for_each_possible_cpu(i) + rcu_boot_init_percpu_data(i, &rcu_state); rcu_init_one(&rcu_bh_state); RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); + for_each_possible_cpu(i) + rcu_boot_init_percpu_data(i, &rcu_bh_state); for_each_online_cpu(i) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); -- cgit v1.2.3-70-g09d2 From 2e597558086dec36d5c33521a36e0f6b1bc3f3a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2009 09:53:48 -0700 Subject: rcu: Simplify RCU CPU-hotplug notification Use the new cpu_notifier() API to simplify RCU's CPU-hotplug notifiers, collapsing down to a single such notifier. This makes it trivial to provide the notifier-ordering guarantee that rcu_barrier() depends on. Also remove redundant open_softirq() calls from Hierarchical RCU notifier. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: josht@linux.vnet.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: hugh.dickins@tiscali.co.uk Cc: benh@kernel.crashing.org LKML-Reference: <12503552312510-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 16 +++++++++++++++- kernel/rcupreempt.c | 25 ++----------------------- kernel/rcutree.c | 17 +++++------------ 3 files changed, 22 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index eae29c25fb1..8df115600c2 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -217,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused) wake_up(&rcu_migrate_wq); } +extern int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); + static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { + rcu_cpu_notify(self, action, hcpu); if (action == CPU_DYING) { /* * preempt_disable() in on_each_cpu() prevents stop_machine(), @@ -244,8 +248,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, void __init rcu_init(void) { + int i; + __rcu_init(); - hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); + cpu_notifier(rcu_barrier_cpu_hotplug, 0); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + for_each_online_cpu(i) + rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i); } void rcu_scheduler_starting(void) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index beb0e659adc..9b87f5134ed 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1417,8 +1417,8 @@ int rcu_pending(int cpu) return 0; } -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1439,10 +1439,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - void __init __rcu_init(void) { int cpu; @@ -1471,23 +1467,6 @@ void __init __rcu_init(void) rdp->waitschedtail = &rdp->waitschedlist; rdp->rcu_sched_sleeping = 0; } - register_cpu_notifier(&rcu_nb); - - /* - * We don't need protection against CPU-Hotplug here - * since - * a) If a CPU comes online while we are iterating over the - * cpu_online_mask below, we would only end up making a - * duplicate call to rcu_online_cpu() which sets the corresponding - * CPU's mask in the rcu_cpu_online_map. - * - * b) A CPU cannot go offline at this point in time since the user - * does not have access to the sysfs interface, nor do we - * suspend the system. - */ - for_each_online_cpu(cpu) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f3e43274ed5..75762cddbe0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1132,6 +1132,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; + WARN_ON_ONCE(rdp->beenonline == 0); + /* * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. 
@@ -1416,14 +1418,13 @@ static void __cpuinit rcu_online_cpu(int cpu) { rcu_init_percpu_data(cpu, &rcu_state); rcu_init_percpu_data(cpu, &rcu_bh_state); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } /* * Handle CPU online/offline notifcation events. */ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1532,10 +1533,6 @@ do { \ } \ } while (0) -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - void __init __rcu_init(void) { int i; /* All used by RCU_DATA_PTR_INIT(). */ @@ -1554,11 +1551,7 @@ void __init __rcu_init(void) RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); for_each_possible_cpu(i) rcu_boot_init_percpu_data(i, &rcu_bh_state); - - for_each_online_cpu(i) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } module_param(blimit, int, 0); -- cgit v1.2.3-70-g09d2 From 8064d54929f23613e649dc7e14f7a94454487d58 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2009 09:53:49 -0700 Subject: rcu: Make preemptable RCU scan all CPUs when summing RCU counters This patch eliminates the counter-moving during CPU-offline notifiers, eliminating potential confusion if counters are scanned during counter-movement process. This confusion could result in premature ending of an RCU grace period. For example, if there are two tasks in RCU read-side critical sections (so that the sum of the counters is two), and the counter for the CPU going offline is -2, then moving the count to another CPU can result in the sum momentarily appearing to be zero. Since there are no memory barriers in either case, many more such scenarios are possible. So just don't move the counts!!! Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: josht@linux.vnet.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: hugh.dickins@tiscali.co.uk Cc: benh@kernel.crashing.org LKML-Reference: <12503552312863-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 9b87f5134ed..2748b89910b 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -849,7 +849,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) + for_each_possible_cpu(cpu) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -1067,12 +1067,6 @@ void rcu_offline_cpu(int cpu) /* seen -after- acknowledgement. */ } - RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; - RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; - - RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; - RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); -- cgit v1.2.3-70-g09d2 From b612ba804b8a656333013ad2ee96fb2377df5dbb Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 15 Aug 2009 09:53:50 -0700 Subject: rcu: Make rcupreempt_trace.c look at offline CPUs Given that offline CPUs can now have non-zero counters, we need to dump counters for offline CPUs as well as for online CPUs. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: josht@linux.vnet.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: hugh.dickins@tiscali.co.uk Cc: benh@kernel.crashing.org LKML-Reference: <12503552313921-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupreempt_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 7c2665cac17..11640346a50 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -236,12 +236,13 @@ static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, "CPU last cur F M\n"); - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { long *flipctr = rcupreempt_flipctr(cpu); cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "%3d %4ld %3ld %d %d\n", + "%3d%c %4ld %3ld %d %d\n", cpu, + cpu_is_offline(cpu) ? '!' : ' ', flipctr[!f], flipctr[f], rcupreempt_flip_flag(cpu), -- cgit v1.2.3-70-g09d2 From 684ca5cc9a772532bc893cdc994bd89bf0773719 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 15 Aug 2009 09:53:51 -0700 Subject: rcu: Fix typo in rcu_irq_exit() comment header Signed-off-by: Josh Triplett Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: josht@linux.vnet.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: hugh.dickins@tiscali.co.uk Cc: benh@kernel.crashing.org LKML-Reference: <1250355231169-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 2748b89910b..510898a7bd6 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -548,7 +548,7 @@ void rcu_irq_enter(void) * rcu_irq_exit - Called from exiting Hard irq context. * * If the CPU was idle with dynamic ticks active, update the - * rcu_dyntick_sched.dynticks to put let the RCU handling be + * rcu_dyntick_sched.dynticks to let the RCU handling be * aware that the CPU is going back to idle with no ticks. */ void rcu_irq_exit(void) -- cgit v1.2.3-70-g09d2 From 84bc4af59081ee974dd80210e694ab59ebe51ce8 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 13 Aug 2009 17:36:53 -0700 Subject: futex: Detect mismatched requeue targets There is currently no check to ensure that userspace uses the same futex requeue target (uaddr2) in futex_requeue() that the waiter used in futex_wait_requeue_pi(). A mismatch here could very unexpected results as the waiter assumes it either wakes on uaddr1 or uaddr2. We could detect this on wakeup in the waiter, but the cleanup is more intense after the improper requeue has occured. This patch stores the waiter's expected requeue target in a new requeue_pi_key pointer in the futex_q which futex_requeue() checks prior to attempting to do a proxy lock acquistion or a requeue when requeue_pi=1. 
If they don't match, return -EINVAL from futex_requeue, aborting the requeue of any remaining waiters. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Eric Dumazet Cc: John Kacur Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <20090814003650.14634.63916.stgit@Aeon> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d077201b393..f0dea283e77 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -115,6 +115,9 @@ struct futex_q { /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; + /* The expected requeue pi target futex key: */ + union futex_key *requeue_pi_key; + /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -1080,6 +1083,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (!top_waiter) return 0; + /* Ensure we requeue to the expected futex. */ + if (!match_futex(top_waiter->requeue_pi_key, key2)) + return -EINVAL; + /* * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in * the contended case or if set_waiters is 1. The pi_state is returned @@ -1260,6 +1267,12 @@ retry_private: continue; } + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } + /* * Requeue nr_requeue waiters and possibly one more in the case * of requeue_pi if we couldn't acquire the lock atomically. @@ -1735,6 +1748,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; q.rt_waiter = NULL; + q.requeue_pi_key = NULL; if (abs_time) { to = &timeout; @@ -1842,6 +1856,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.rt_waiter = NULL; + q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); @@ -2153,15 +2168,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - q.pi_state = NULL; - q.bitset = bitset; - q.rt_waiter = &rt_waiter; - key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) -- cgit v1.2.3-70-g09d2 From 788084aba2ab7348257597496befcbccabdc98a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: seperate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a seperate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. 
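Purely as an illustration of what the tunable governs (this program is not part of the patch), a userspace probe of the protected range might look like the sketch below; it assumes a 4 KiB page size and an effective mmap_min_addr larger than 4096. The mapping is expected to fail for a process lacking CAP_SYS_RAWIO, and an SELinux-confined domain is additionally subject to the MEMPROTECT check even when it has the capability.

#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	/* Try to map the lowest usable page with a fixed, very low hint. */
	void *p = mmap((void *)4096, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	if (p == MAP_FAILED)
		perror("low mapping rejected");
	else
		printf("mapped low page at %p\n", p);
	return 0;
}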
Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/mm.h | 15 --------------- include/linux/security.h | 17 +++++++++++++++++ kernel/sysctl.c | 7 ++++--- mm/Kconfig | 6 +++--- mm/mmap.c | 3 --- mm/nommu.c | 3 --- security/Kconfig | 16 ++++++++++++++++ security/Makefile | 2 +- security/commoncap.c | 2 +- security/min_addr.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ security/selinux/hooks.c | 2 +- 11 files changed, 92 insertions(+), 30 deletions(-) create mode 100644 security/min_addr.c (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index ba3a7cb1eaa..9a72cc78e6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,8 +34,6 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif -extern unsigned long mmap_min_addr; - #include #include #include @@ -574,19 +572,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, set_page_section(page, pfn_to_section_nr(pfn)); } -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/security.h b/include/linux/security.h index ac4bc3760b4..dc3472c1f78 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -28,6 +28,7 @@ #include #include #include +#include /* PAGE_ALIGN */ #include #include #include @@ -95,6 +96,7 @@ extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; /* * Values used in the task_security_ops calls */ @@ -147,6 +149,21 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + +extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); /** * struct security_operations - main security structure * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67..58be76017fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bd..fe5f674d7a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. 
- Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd..8101de490c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece..28754c40be9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); diff --git a/security/Kconfig b/security/Kconfig index d23c839038f..9c60c346a91 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,22 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config LSM_MMAP_MIN_ADDR + int "Low address space for LSM to from user allocation" + depends on SECURITY && SECURITY_SELINUX + default 65535 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need the permission specific to the + systems running LSM. 
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig diff --git a/security/Makefile b/security/Makefile index c67557cdaa8..b56e7f9ecbc 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,7 +8,7 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo # always enable default capabilities -obj-y += commoncap.o +obj-y += commoncap.o min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o diff --git a/security/commoncap.c b/security/commoncap.c index 6bcf6e81e54..e3097c0a131 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1005,7 +1005,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, { int ret = 0; - if (addr < mmap_min_addr) { + if (addr < dac_mmap_min_addr) { ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO, SECURITY_CAP_AUDIT); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ diff --git a/security/min_addr.c b/security/min_addr.c new file mode 100644 index 00000000000..14cc7b3b8d0 --- /dev/null +++ b/security/min_addr.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +/* amount of vm to protect from userspace access by both DAC and the LSM*/ +unsigned long mmap_min_addr; +/* amount of vm to protect from userspace using CAP_SYS_RAWIO (DAC) */ +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +/* amount of vm to protect from userspace using the LSM = CONFIG_LSM_MMAP_MIN_ADDR */ + +/* + * Update mmap_min_addr = max(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR) + */ +static void update_mmap_min_addr(void) +{ +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + mmap_min_addr = dac_mmap_min_addr; + else + mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + mmap_min_addr = dac_mmap_min_addr; +#endif +} + +/* + * sysctl handler which just sets dac_mmap_min_addr = the new value and then + * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly + */ +int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + update_mmap_min_addr(); + + return ret; +} + +int __init init_mmap_min_addr(void) +{ + update_mmap_min_addr(); + + return 0; +} +pure_initcall(init_mmap_min_addr); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e6d1432b080..8d8b69c5664 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3036,7 +3036,7 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, * at bad behaviour/exploit that we always want to get the AVC, even * if DAC would have also denied the operation. */ - if (addr < mmap_min_addr) { + if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); if (rc) -- cgit v1.2.3-70-g09d2 From 212274347fc4d2a7c56bf6c953b02c809e7e0be1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 13:39:33 +0800 Subject: lockdep: Fix missing entry in /proc/lock_stat One entry is missing in the output of /proc/lock_stat. The cause is, when ls_start() is called the 2nd time, we should start from stats[@pos-1] but not stats[@pos], because pos == 0 is the header. 
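In seq_file terms the off-by-one looks like this minimal sketch (items[] and nr_items are hypothetical stand-ins for the lock_stat arrays): position 0 is reserved for the header token, so array element N has to be returned for *pos == N + 1.

static void *example_start(struct seq_file *m, loff_t *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;		/* position 0 is the header */

	if (*pos - 1 < nr_items)		/* element N lives at position N + 1 */
		return &items[*pos - 1];

	return NULL;
}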
Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A88ED15.20800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index fba81f16e34..5dbe30b4e59 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -634,7 +634,7 @@ static void *ls_start(struct seq_file *m, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - data->iter = data->stats + *pos; + data->iter = data->stats + (*pos - 1); if (data->iter >= data->iter_end) data->iter = NULL; -- cgit v1.2.3-70-g09d2 From e9d65725bdf5954283625ca4d770bfc34f2ae56a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 13:39:49 +0800 Subject: lockdep: Fix missing entries in /proc/lock_chains Two entries are missing in the output of /proc/lock_chains. One is chains[1]. When lc_next() is called the 1st time, chains[0] is returned. And when it's called the 2nd time, chains[2] is returned. The other missing ons is, when lc_start() is called the 2nd time, we should start from chains[@pos-1] but not chains[@pos], because pos == 0 is the header. Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A88ED25.2040306@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 5dbe30b4e59..9a7996e371f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -160,8 +160,8 @@ static void *lc_next(struct seq_file *m, void *v, loff_t *pos) else { chain = v; - if (*pos < nr_lock_chains) - chain = lock_chains + *pos; + if (*pos - 1 < nr_lock_chains) + chain = lock_chains + (*pos - 1); else chain = NULL; } @@ -174,8 +174,8 @@ static void *lc_start(struct seq_file *m, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - if (*pos < nr_lock_chains) - return lock_chains + *pos; + if (*pos - 1 < nr_lock_chains) + return lock_chains + (*pos - 1); return NULL; } -- cgit v1.2.3-70-g09d2 From 8109e1de8502421f9efff1359f2779b1adcc0724 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 13:40:17 +0800 Subject: lockdep: Simplify lockdep seqfile code Use seq_list_start_head() and seq_list_next(). 
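For reference, a minimal iterator built on those helpers looks roughly like the sketch below (my_list and struct my_item are hypothetical): seq_list_start_head() returns the list head itself for position 0, which the show routine treats as the header line, so no private iterator state is needed.

static void *my_start(struct seq_file *m, loff_t *pos)
{
	return seq_list_start_head(&my_list, *pos);
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &my_list, pos);
}

static int my_show(struct seq_file *m, void *v)
{
	struct my_item *item;

	if (v == &my_list) {			/* position 0: print the header */
		seq_puts(m, "all items:\n");
		return 0;
	}

	item = list_entry(v, struct my_item, node);
	seq_printf(m, "%s\n", item->name);
	return 0;
}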
Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A88ED41.5000000@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 46 +++++----------------------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9a7996e371f..05fb5fde9b1 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -25,38 +25,12 @@ static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct lock_class *class; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - class = m->private; - else { - class = v; - - if (class->lock_entry.next != &all_lock_classes) - class = list_entry(class->lock_entry.next, - struct lock_class, lock_entry); - else - class = NULL; - } - - return class; + return seq_list_next(v, &all_lock_classes, pos); } static void *l_start(struct seq_file *m, loff_t *pos) { - struct lock_class *class; - loff_t i = 0; - - if (*pos == 0) - return SEQ_START_TOKEN; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (++i == *pos) - return class; - } - return NULL; + return seq_list_start_head(&all_lock_classes, *pos); } static void l_stop(struct seq_file *m, void *v) @@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = v; + struct lock_class *class = list_entry(v, struct lock_class, lock_entry); struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; - if (v == SEQ_START_TOKEN) { + if (v == &all_lock_classes) { seq_printf(m, "all lock classes:\n"); return 0; } @@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = { static int lockdep_open(struct inode *inode, struct file *file) { - int res = seq_open(file, &lockdep_ops); - if (!res) { - struct seq_file *m = file->private_data; - - if (!list_empty(&all_lock_classes)) - m->private = list_entry(all_lock_classes.next, - struct lock_class, lock_entry); - else - m->private = NULL; - } - return res; + return seq_open(file, &lockdep_ops); } static const struct file_operations proc_lockdep_operations = { -- cgit v1.2.3-70-g09d2 From 12aac19d4ba41019a1748f49d3c5d259b1bfb26d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 13:40:39 +0800 Subject: lockdep: Simplify lockdep_chains seqfile code - make lc_next() call lc_start() - use lock_chains directly instead of storing it in m->private Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A88ED57.5060609@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 05fb5fde9b1..2bbe25f55d0 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -113,26 +113,6 @@ static const struct file_operations proc_lockdep_operations = { }; #ifdef CONFIG_PROVE_LOCKING -static void *lc_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct lock_chain *chain; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - chain = m->private; - else { - chain = v; - - if (*pos - 1 < nr_lock_chains) - chain = lock_chains + (*pos - 1); - else - chain = NULL; - } - - return chain; -} - static void *lc_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) @@ -144,6 +124,12 @@ static void *lc_start(struct seq_file *m, loff_t *pos) return NULL; } +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return lc_start(m, pos); +} + 
static void lc_stop(struct seq_file *m, void *v) { } @@ -184,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = { static int lockdep_chains_open(struct inode *inode, struct file *file) { - int res = seq_open(file, &lockdep_chains_ops); - if (!res) { - struct seq_file *m = file->private_data; - - if (nr_lock_chains) - m->private = lock_chains; - else - m->private = NULL; - } - return res; + return seq_open(file, &lockdep_chains_ops); } static const struct file_operations proc_lockdep_chains_operations = { -- cgit v1.2.3-70-g09d2 From 96004bb2a1e4ccad2b1eeb92e51031d1e8e609e3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 13:40:59 +0800 Subject: lockdep: Simplify lock_stat seqfile code - make ls_next() call ls_start() - remove redundant code in lock_stat_release() Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A88ED6B.6030602@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 2bbe25f55d0..8dac8402e07 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -383,7 +383,6 @@ struct lock_stat_data { }; struct lock_stat_seq { - struct lock_stat_data *iter; struct lock_stat_data *iter_end; struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; }; @@ -571,34 +570,22 @@ static void seq_header(struct seq_file *m) static void *ls_start(struct seq_file *m, loff_t *pos) { struct lock_stat_seq *data = m->private; + struct lock_stat_data *iter; if (*pos == 0) return SEQ_START_TOKEN; - data->iter = data->stats + (*pos - 1); - if (data->iter >= data->iter_end) - data->iter = NULL; + iter = data->stats + (*pos - 1); + if (iter >= data->iter_end) + iter = NULL; - return data->iter; + return iter; } static void *ls_next(struct seq_file *m, void *v, loff_t *pos) { - struct lock_stat_seq *data = m->private; - (*pos)++; - - if (v == SEQ_START_TOKEN) - data->iter = data->stats; - else { - data->iter = v; - data->iter++; - } - - if (data->iter == data->iter_end) - data->iter = NULL; - - return data->iter; + return ls_start(m, pos); } static void ls_stop(struct seq_file *m, void *v) @@ -636,7 +623,6 @@ static int lock_stat_open(struct inode *inode, struct file *file) struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; - data->iter = iter; list_for_each_entry(class, &all_lock_classes, lock_entry) { iter->class = class; iter->stats = lock_stats(class); @@ -644,7 +630,7 @@ static int lock_stat_open(struct inode *inode, struct file *file) } data->iter_end = iter; - sort(data->stats, data->iter_end - data->iter, + sort(data->stats, data->iter_end - data->stats, sizeof(struct lock_stat_data), lock_stat_cmp, NULL); @@ -679,7 +665,6 @@ static int lock_stat_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; vfree(seq->private); - seq->private = NULL; return seq_release(inode, file); } -- cgit v1.2.3-70-g09d2 From b25c340c195447afb1860da580fe2a85a6b652c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Aug 2009 12:17:22 +0200 Subject: genirq: Add oneshot support For threaded interrupt handlers we expect the hard interrupt handler part to mask the interrupt on the originating device. The interrupt line itself is reenabled after the hard interrupt handler has executed. This requires access to the originating device from hard interrupt context which is not always possible. 
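As the remainder of this changelog spells out, the usage this enables for a device sitting behind a slow bus (the case described next) ends up looking roughly like the following sketch (device and handler names are hypothetical): no primary handler is passed, so the default one is installed and the interrupt line stays masked until the thread function has completed.

ret = request_threaded_irq(client->irq, NULL, my_dev_thread_fn,
			   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
			   "my-i2c-dev", my_dev);
if (ret)
	dev_err(&client->dev, "failed to request IRQ: %d\n", ret);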
There are devices which can only be accessed via a bus (i2c, spi, ...). The bus access requires thread context. For such devices we need to keep the interrupt line masked until the threaded handler has executed. Add a new flag IRQF_ONESHOT which allows drivers to request that the interrupt is not unmasked after the hard interrupt context handler has been executed and the thread has been woken. The interrupt line is unmasked after the thread handler function has been executed. Note that for now IRQF_ONESHOT cannot be used with IRQF_SHARED to avoid complex accounting mechanisms. For oneshot interrupts the primary handler simply returns IRQ_WAKE_THREAD and does nothing else. A generic implementation irq_default_primary_handler() is provided to avoid useless copies all over the place. It is automatically installed when request_threaded_irq() is called with handler=NULL and thread_fn!=NULL. Signed-off-by: Thomas Gleixner Cc: Mark Brown Cc: Dmitry Torokhov Cc: Trilok Soni Cc: Pavel Machek Cc: Brian Swetland Cc: Joonyoung Shim Cc: m.szyprowski@samsung.com Cc: t.fujak@samsung.com Cc: kyungmin.park@samsung.com, Cc: David Brownell Cc: Daniel Ribeiro Cc: arve@android.com Cc: Barry Song <21cnbao@gmail.com> --- include/linux/interrupt.h | 4 ++++ include/linux/irq.h | 1 + kernel/irq/chip.c | 14 +++++++++++--- kernel/irq/manage.c | 49 +++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 61 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 35e7df1e9f3..1ac57e522a1 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -50,6 +50,9 @@ * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in an shared interrupt is considered for * performance reasons) + * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. + * Used by threaded interrupts which need to keep the + * irq line disabled until the threaded handler has been run. 
*/ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 @@ -59,6 +62,7 @@ #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +#define IRQF_ONESHOT 0x00002000 /* * Bits used by threaded handlers: diff --git a/include/linux/irq.h b/include/linux/irq.h index cb2e77a3f7f..5e7c6ee8c35 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -69,6 +69,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ +#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 13c68e71b72..b08c0d24f20 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -382,7 +382,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + + if (unlikely(desc->status & IRQ_ONESHOT)) + desc->status |= IRQ_MASKED; + else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(irq); out_unlock: spin_unlock(&desc->lock); @@ -478,8 +481,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (desc->chip->ack) - desc->chip->ack(irq); + if (unlikely(desc->status & IRQ_ONESHOT)) { + desc->status |= IRQ_MASKED; + mask_ack_irq(desc, irq); + } else { + if (desc->chip->ack) + desc->chip->ack(irq); + } /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d222515a5a0..d7f7b5fd247 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -436,6 +436,16 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +/* + * Default primary interrupt handler for threaded interrupts. Is + * assigned as primary handler when request_threaded_irq is called + * with handler == NULL. Useful for oneshot interrupts. + */ +static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) +{ + return IRQ_WAKE_THREAD; +} + static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { @@ -451,6 +461,21 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +/* + * Oneshot interrupts keep the irq line masked until the threaded + * handler finished. unmask if the interrupt has not been disabled and + * is marked MASKED. + */ +static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +{ + spin_lock_irq(&desc->lock); + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { + desc->status &= ~IRQ_MASKED; + desc->chip->unmask(irq); + } + spin_unlock_irq(&desc->lock); +} + #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. 
@@ -492,7 +517,7 @@ static int irq_thread(void *data) struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake; + int wake, oneshot = desc->status & IRQ_ONESHOT; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -518,6 +543,9 @@ static int irq_thread(void *data) spin_unlock_irq(&desc->lock); action->thread_fn(action->irq, action->dev_id); + + if (oneshot) + irq_finalize_oneshot(action->irq, desc); } wake = atomic_dec_and_test(&desc->threads_active); @@ -590,6 +618,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + /* Oneshot interrupts are not allowed with shared */ + if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) + return -EINVAL; + /* * Threaded handler ? */ @@ -663,9 +695,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | + desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + if (new->flags & IRQF_ONESHOT) + desc->status |= IRQ_ONESHOT; + if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; @@ -878,6 +913,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts + * If NULL and thread_fn != NULL the default + * primary handler is installed * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags @@ -957,8 +994,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (desc->status & IRQ_NOREQUEST) return -EINVAL; - if (!handler) - return -EINVAL; + + if (!handler) { + if (!thread_fn) + return -EINVAL; + handler = irq_default_primary_handler; + } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) -- cgit v1.2.3-70-g09d2 From 70aedd24d20e75198f5a0b11750faabbb56924e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Aug 2009 12:17:48 +0200 Subject: genirq: Add buslock support Some interrupt chips are connected to a "slow" bus (i2c, spi ...). The bus access needs to sleep and therefor cannot be called in atomic contexts. Some of the generic interrupt management functions like disable_irq(), enable_irq() ... call interrupt chip functions with the irq_desc->lock held and interrupts disabled. This does not work for such devices. Provide a separate synchronization mechanism for such interrupt chips. The irq_chip structure is extended by two optional functions (bus_lock and bus_sync_and_unlock). The idea is to serialize the bus access for those operations in the core code so that drivers which are behind that bus operated interrupt controller do not have to worry about it and just can use the normal interfaces. To achieve this we add two function pointers to the irq_chip: bus_lock and bus_sync_unlock. bus_lock() is called to serialize access to the interrupt controller bus. Now the core code can issue chip->mask/unmask ... commands without changing the fast path code at all. The chip implementation merily stores that information in a chip private data structure and returns. No bus interaction as these functions are called from atomic context. After that bus_sync_unlock() is called outside the atomic context. 
Now the chip implementation issues the bus commands, waits for completion and unlocks the interrupt controller bus. The irq_chip implementation as pseudo code: struct irq_chip_data { struct mutex mutex; unsigned int irq_offset; unsigned long mask; unsigned long mask_status; } static void bus_lock(unsigned int irq) { struct irq_chip_data *data = get_irq_desc_chip_data(irq); mutex_lock(&data->mutex); } static void mask(unsigned int irq) { struct irq_chip_data *data = get_irq_desc_chip_data(irq); irq -= data->irq_offset; data->mask |= (1 << irq); } static void unmask(unsigned int irq) { struct irq_chip_data *data = get_irq_desc_chip_data(irq); irq -= data->irq_offset; data->mask &= ~(1 << irq); } static void bus_sync_unlock(unsigned int irq) { struct irq_chip_data *data = get_irq_desc_chip_data(irq); if (data->mask != data->mask_status) { do_bus_magic_to_set_mask(data->mask); data->mask_status = data->mask; } mutex_unlock(&data->mutex); } The device drivers can use request_threaded_irq, free_irq, disable_irq and enable_irq as usual with the only restriction that the calls need to come from non atomic context. Signed-off-by: Thomas Gleixner Cc: Mark Brown Cc: Dmitry Torokhov Cc: Trilok Soni Cc: Pavel Machek Cc: Brian Swetland Cc: Joonyoung Shim Cc: m.szyprowski@samsung.com Cc: t.fujak@samsung.com Cc: kyungmin.park@samsung.com, Cc: David Brownell Cc: Daniel Ribeiro Cc: arve@android.com Cc: Barry Song <21cnbao@gmail.com> --- include/linux/irq.h | 6 ++++++ kernel/irq/chip.c | 2 ++ kernel/irq/internals.h | 13 +++++++++++++ kernel/irq/manage.c | 19 ++++++++++++++++++- 4 files changed, 39 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 5e7c6ee8c35..ce8171bc6fa 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -101,6 +101,9 @@ struct msi_desc; * @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @set_wake: enable/disable power-management wake-on of an IRQ * + * @bus_lock: function to lock access to slow bus (i2c) chips + * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips + * * @release: release function solely used by UML * @typename: obsoleted by name, kept as migration helper */ @@ -124,6 +127,9 @@ struct irq_chip { int (*set_type)(unsigned int irq, unsigned int flow_type); int (*set_wake)(unsigned int irq, unsigned int on); + void (*bus_lock)(unsigned int irq); + void (*bus_sync_unlock)(unsigned int irq); + /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b08c0d24f20..f856330e684 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -580,6 +580,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->chip = &dummy_irq_chip; } + chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); /* Uninstall? 
*/ @@ -599,6 +600,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->chip->startup(irq); } spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e70ed5592eb..1b5d742c6a7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +/* Inline functions for support of irq chips on slow busses */ +static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +{ + if (unlikely(desc->chip->bus_lock)) + desc->chip->bus_lock(irq); +} + +static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +{ + if (unlikely(desc->chip->bus_sync_unlock)) + desc->chip->bus_sync_unlock(irq); +} + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d7f7b5fd247..0a3fd5b524c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; + chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. * - * This function may be called from IRQ context. + * This function may be called from IRQ context only when + * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { @@ -304,9 +307,11 @@ void enable_irq(unsigned int irq) if (!desc) return; + chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(enable_irq); @@ -468,12 +473,14 @@ static int irq_wait_for_interrupt(struct irqaction *action) */ static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { + chip_bus_lock(irq, desc); spin_lock_irq(&desc->lock); if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); } spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); } #ifdef CONFIG_SMP @@ -904,7 +911,14 @@ EXPORT_SYMBOL_GPL(remove_irq); */ void free_irq(unsigned int irq, void *dev_id) { + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return; + + chip_bus_lock(irq, desc); kfree(__free_irq(irq, dev_id)); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(free_irq); @@ -1011,7 +1025,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + chip_bus_lock(irq, desc); retval = __setup_irq(irq, desc, action); + chip_bus_sync_unlock(irq, desc); + if (retval) kfree(action); -- cgit v1.2.3-70-g09d2 From 399b5da29b9f851eb7b96e2882097127f003e87c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Aug 2009 13:21:38 +0200 Subject: genirq: Support nested threaded irq handling Interrupt chips which are behind a slow bus (i2c, spi ...) and demultiplex other interrupt sources need to run their interrupt handler in a thread. 
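A sketch of the demultiplexing thread handler this targets is shown below (the chip structure and register access are hypothetical); it uses the handle_nested_irq() helper introduced further down, so each pending sub-interrupt's threaded handler runs nested in this thread's context.

static irqreturn_t my_demux_thread_fn(int irq, void *data)
{
	struct my_chip *chip = data;
	unsigned long pending;
	int bit;

	pending = my_chip_read_pending(chip);	/* slow bus access, may sleep */

	for (bit = 0; bit < chip->num_irqs; bit++)
		if (pending & (1UL << bit))
			handle_nested_irq(chip->irq_base + bit);

	return IRQ_HANDLED;
}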
The demultiplexed interrupt handlers need to run in thread context as well and need to finish before the demux handler thread can reenable the interrupt line. So the easiest way is to run the sub device handlers in the context of the demultiplexing handler thread. To avoid that a separate thread is created for the subdevices the function set_nested_irq_thread() is provided which sets the IRQ_NESTED_THREAD flag in the interrupt descriptor. A driver which calls request_threaded_irq() must not be aware of the fact that the threaded handler is called in the context of the demultiplexing handler thread. The setup code checks the IRQ_NESTED_THREAD flag which was set from the irq chip setup code and does not setup a separate thread for the interrupt. The primary function which is provided by the device driver is replaced by an internal dummy function which warns when it is called. For the demultiplexing handler a helper function handle_nested_irq() is provided which calls the demux interrupt thread function in the context of the caller and does the proper interrupt accounting and takes the interrupt disabled status of the demultiplexed subdevice into account. Signed-off-by: Thomas Gleixner Cc: Mark Brown Cc: Dmitry Torokhov Cc: Trilok Soni Cc: Pavel Machek Cc: Brian Swetland Cc: Joonyoung Shim Cc: m.szyprowski@samsung.com Cc: t.fujak@samsung.com Cc: kyungmin.park@samsung.com, Cc: David Brownell Cc: Daniel Ribeiro Cc: arve@android.com Cc: Barry Song <21cnbao@gmail.com> --- include/linux/irq.h | 3 +++ kernel/irq/chip.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 34 ++++++++++++++++++++++++--- 3 files changed, 101 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index ce8171bc6fa..8778ee99393 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -70,6 +70,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ #define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ +#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -386,6 +387,8 @@ set_irq_chained_handler(unsigned int irq, __set_irq_handler(irq, handle, 1, NULL); } +extern void set_irq_nested_thread(unsigned int irq, int nest); + extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f856330e684..5765aad9499 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data) } EXPORT_SYMBOL(set_irq_chip_data); +/** + * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq + * + * @irq: Interrupt number + * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag + * + * The IRQ_NESTED_THREAD flag indicates that on + * request_threaded_irq() no separate interrupt thread should be + * created for the irq as the handler are called nested in the + * context of a demultiplexing interrupt handler thread. 
+ */ +void set_irq_nested_thread(unsigned int irq, int nest) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return; + + spin_lock_irqsave(&desc->lock, flags); + if (nest) + desc->status |= IRQ_NESTED_THREAD; + else + desc->status &= ~IRQ_NESTED_THREAD; + spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(set_irq_nested_thread); + /* * default enable function */ @@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) } } +/* + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number + * + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling + * threads context. + */ +void handle_nested_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; + + might_sleep(); + + spin_lock_irq(&desc->lock); + + kstat_incr_irqs_this_cpu(irq, desc); + + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out_unlock; + + desc->status |= IRQ_INPROGRESS; + spin_unlock_irq(&desc->lock); + + action_ret = action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + + spin_lock_irq(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; + +out_unlock: + spin_unlock_irq(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + /** * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a3fd5b524c..e485cea04bf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -451,6 +451,16 @@ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) return IRQ_WAKE_THREAD; } +/* + * Primary handler for nested threaded interrupts. Should never be + * called. + */ +static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) +{ + WARN(1, "Primary handler called for nested irq %d\n", irq); + return IRQ_NONE; +} + static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { @@ -600,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; - int shared = 0; + int nested, shared = 0; int ret; if (!desc) @@ -630,9 +640,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) return -EINVAL; /* - * Threaded handler ? + * Check whether the interrupt nests into another interrupt + * thread. + */ + nested = desc->status & IRQ_NESTED_THREAD; + if (nested) { + if (!new->thread_fn) + return -EINVAL; + /* + * Replace the primary handler which was provided from + * the driver for non nested interrupt handling by the + * dummy function which warns when called. + */ + new->handler = irq_nested_primary_handler; + } + + /* + * Create a handler thread when a thread function is supplied + * and the interrupt does not nest into another interrupt + * thread. */ - if (new->thread_fn) { + if (new->thread_fn && !nested) { struct task_struct *t; t = kthread_create(irq_thread, new, "irq/%d-%s", irq, -- cgit v1.2.3-70-g09d2 From 7ead8b8313d92b3a69a1a61b0dcbc4cd66c960dc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:56:28 +0800 Subject: tracing/events: Add module tracepoints Add trace points to trace module_load, module_free, module_get, module_put and module_request, and use trace_event facility to get the trace output. 
Here's the sample output: TASK-PID CPU# TIMESTAMP FUNCTION | | | | | <...>-42 [000] 1.758380: module_request: fb0 wait=1 call_site=fb_open ... <...>-60 [000] 3.269403: module_load: scsi_wait_scan <...>-60 [000] 3.269432: module_put: scsi_wait_scan call_site=sys_init_module refcnt=0 <...>-61 [001] 3.273168: module_free: scsi_wait_scan ... <...>-1021 [000] 13.836081: module_load: sunrpc <...>-1021 [000] 13.840589: module_put: sunrpc call_site=sys_init_module refcnt=-1 <...>-1027 [000] 13.848098: module_get: sunrpc call_site=try_module_get refcnt=0 <...>-1027 [000] 13.848308: module_get: sunrpc call_site=get_filesystem refcnt=1 <...>-1027 [000] 13.848692: module_put: sunrpc call_site=put_filesystem refcnt=0 ... modprobe-2587 [001] 1088.437213: module_load: trace_events_sample F modprobe-2587 [001] 1088.437786: module_put: trace_events_sample call_site=sys_init_module refcnt=0 Note: - the taints flag can be 'F', 'C' and/or 'P' if mod->taints != 0 - the module refcnt is percpu, so it can be negative in a specific cpu Signed-off-by: Li Zefan Acked-by: Rusty Russell Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Rusty Russell LKML-Reference: <4A891B3C.5030608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/module.h | 14 ++++- include/trace/events/module.h | 126 ++++++++++++++++++++++++++++++++++++++++++ kernel/kmod.c | 4 ++ kernel/module.c | 11 ++++ 4 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 include/trace/events/module.h (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 098bdb7bfac..f8f92d015ef 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -17,10 +17,12 @@ #include #include #include -#include +#include #include +#include + /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) @@ -462,7 +464,10 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu) static inline void __module_get(struct module *module) { if (module) { - local_inc(__module_ref_addr(module, get_cpu())); + unsigned int cpu = get_cpu(); + local_inc(__module_ref_addr(module, cpu)); + trace_module_get(module, _THIS_IP_, + local_read(__module_ref_addr(module, cpu))); put_cpu(); } } @@ -473,8 +478,11 @@ static inline int try_module_get(struct module *module) if (module) { unsigned int cpu = get_cpu(); - if (likely(module_is_live(module))) + if (likely(module_is_live(module))) { local_inc(__module_ref_addr(module, cpu)); + trace_module_get(module, _THIS_IP_, + local_read(__module_ref_addr(module, cpu))); + } else ret = 0; put_cpu(); diff --git a/include/trace/events/module.h b/include/trace/events/module.h new file mode 100644 index 00000000000..84160fb1847 --- /dev/null +++ b/include/trace/events/module.h @@ -0,0 +1,126 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM module + +#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MODULE_H + +#include + +#ifdef CONFIG_MODULES + +struct module; + +#define show_module_flags(flags) __print_flags(flags, "", \ + { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ + { (1UL << TAINT_FORCED_MODULE), "F" }, \ + { (1UL << TAINT_CRAP), "C" }) + +TRACE_EVENT(module_load, + + TP_PROTO(struct module *mod), + + TP_ARGS(mod), + + TP_STRUCT__entry( + __field( unsigned int, taints ) + __string( name, mod->name ) + ), + + TP_fast_assign( + __entry->taints = mod->taints; + __assign_str(name, mod->name); + ), + + TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) +); + +TRACE_EVENT(module_free, + + TP_PROTO(struct module *mod), + + 
TP_ARGS(mod), + + TP_STRUCT__entry( + __string( name, mod->name ) + ), + + TP_fast_assign( + __assign_str(name, mod->name); + ), + + TP_printk("%s", __get_str(name)) +); + +TRACE_EVENT(module_get, + + TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + + TP_ARGS(mod, ip, refcnt), + + TP_STRUCT__entry( + __field( unsigned long, ip ) + __field( int, refcnt ) + __string( name, mod->name ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->refcnt = refcnt; + __assign_str(name, mod->name); + ), + + TP_printk("%s call_site=%pf refcnt=%d", + __get_str(name), (void *)__entry->ip, __entry->refcnt) +); + +TRACE_EVENT(module_put, + + TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + + TP_ARGS(mod, ip, refcnt), + + TP_STRUCT__entry( + __field( unsigned long, ip ) + __field( int, refcnt ) + __string( name, mod->name ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->refcnt = refcnt; + __assign_str(name, mod->name); + ), + + TP_printk("%s call_site=%pf refcnt=%d", + __get_str(name), (void *)__entry->ip, __entry->refcnt) +); + +TRACE_EVENT(module_request, + + TP_PROTO(char *name, bool wait, unsigned long ip), + + TP_ARGS(name, wait, ip), + + TP_STRUCT__entry( + __field( bool, wait ) + __field( unsigned long, ip ) + __string( name, name ) + ), + + TP_fast_assign( + __entry->wait = wait; + __entry->ip = ip; + __assign_str(name, name); + ), + + TP_printk("%s wait=%d call_site=%pf", + __get_str(name), (int)__entry->wait, (void *)__entry->ip) +); + +#endif /* CONFIG_MODULES */ + +#endif /* _TRACE_MODULE_H */ + +/* This part must be outside protection */ +#include + diff --git a/kernel/kmod.c b/kernel/kmod.c index 385c31a1bdb..a92280870e3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,8 @@ #include #include +#include + extern int max_threads; static struct workqueue_struct *khelper_wq; @@ -108,6 +110,8 @@ int __request_module(bool wait, const char *fmt, ...) return -ENOMEM; } + trace_module_request(module_name, wait, _RET_IP_); + ret = call_usermodehelper(modprobe_path, argv, envp, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); diff --git a/kernel/module.c b/kernel/module.c index fd141140355..b1821438694 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -55,6 +55,11 @@ #include #include +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL(module_get); + #if 0 #define DEBUGP printk #else @@ -940,6 +945,8 @@ void module_put(struct module *module) if (module) { unsigned int cpu = get_cpu(); local_dec(__module_ref_addr(module, cpu)); + trace_module_put(module, _RET_IP_, + local_read(__module_ref_addr(module, cpu))); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1491,6 +1498,8 @@ static int __unlink_module(void *_mod) /* Free a module, remove from lists, etc (must hold module_mutex). */ static void free_module(struct module *mod) { + trace_module_free(mod); + /* Delete from various lists */ stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); @@ -2358,6 +2367,8 @@ static noinline struct module *load_module(void __user *umod, /* Get rid of temporary copy */ vfree(hdr); + trace_module_load(mod); + /* Done! 
*/ + return mod; -- cgit v1.2.3-70-g09d2 From ba8b3a40ba7e06d00c27508f090803af90e8dbbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:55:18 +0800 Subject: tracing/syscalls: Fix to print parameter types When syscall tracing was implemented as a tracer, the "syscall_arg_type" trace option could be set to enable the display of syscall parameter types. Now this option is gone since it's no longer a tracer, but the code is still there, albeit dead. So we remove the dead code and re-enable the printing of parameter types via the verbose option: # echo verbose > trace_options # echo syscalls > set_event # cat trace ... bash-3331 [000] 95.348937: sys_fcntl64 -> 0x1 bash-3331 [000] 95.348942: sys_close(unsigned int fd: a) ... Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron LKML-Reference: <4A891AF6.5050102@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f837cccabcf..f130dacfeef 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -13,21 +13,6 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); -/* Option to display the parameters types */ -enum { - TRACE_SYSCALLS_OPT_TYPES = 0x1, -}; - -static struct tracer_opt syscalls_opts[] = { - { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, - { } -}; - -static struct tracer_flags syscalls_flags = { - .val = 0, /* By default: no parameters types */ - .opts = syscalls_opts -}; - enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -55,7 +40,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) for (i = 0; i < entry->nb_args; i++) { /* parameter types */ - if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + if (trace_flags & TRACE_ITER_VERBOSE) { ret = trace_seq_printf(s, "%s ", entry->types[i]); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-70-g09d2 From 97d53202a5670a08b79c8ef2e4fff1c1ee21317c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:52:53 +0800 Subject: trace_stat: Fix missing entry in stat file One entry is missing in the output of a stat file. The cause is, when stat_seq_start() is called the 2nd time, we should start from the (pos-1)th element in the rbtree, not the pos-th, because pos == 0 is the header.
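In seq_file terms, ->start() can be re-invoked mid-walk with a non-zero *pos after the output buffer has been flushed, and when position 0 is taken by the header, record N of the underlying rbtree lives at *pos == N + 1. A hedged, generic sketch of that accounting (made-up names, not the actual trace_stat code):

#include <linux/rbtree.h>
#include <linux/seq_file.h>

/* Hypothetical tree; stands in for the tracer's stat rbtree. */
static struct rb_root example_root = RB_ROOT;

static void *example_stat_start(struct seq_file *m, loff_t *pos)
{
        struct rb_node *node;
        loff_t n = *pos;

        if (n == 0)
                return SEQ_START_TOKEN; /* the header occupies position 0 */
        n--;                            /* so real records start at position 1 */

        /* Walk to the (pos-1)th element, not the pos-th. */
        for (node = rb_first(&example_root); node && n > 0; n--)
                node = rb_next(node);
        return node;
}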
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A891A65.70009@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 07c60b09258..a4bb239eb98 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -203,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; struct rb_node *node; + int n = *pos; int i; /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) - return SEQ_START_TOKEN; + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; + } node = rb_first(&session->stat_root); - for (i = 0; node && i < *pos; i++) + for (i = 0; node && i < n; i++) node = rb_next(node); return node; -- cgit v1.2.3-70-g09d2 From 2fc5f0cff4cf1c4cd336d0f61f11bca6eeee1d84 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:53:37 +0800 Subject: trace_stack: Simplify seqfile code Extract duplicate code in t_start() and t_next(). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A891A91.4030602@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0da1cff08d6..0f6facb050a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = { }; static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +__next(struct seq_file *m, loff_t *pos) { - long i; + long n = *pos - 1; - (*pos)++; - - if (v == SEQ_START_TOKEN) - i = 0; - else { - i = *(long *)v; - i++; - } - - if (i >= max_stack_trace.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; - m->private = (void *)i; - + m->private = (void *)n; return &m->private; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) { - void *t = SEQ_START_TOKEN; - loff_t l = 0; + (*pos)++; + return __next(m, pos); +} +static void *t_start(struct seq_file *m, loff_t *pos) +{ local_irq_disable(); __raw_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; + return __next(m, pos); } static void t_stop(struct seq_file *m, void *p) -- cgit v1.2.3-70-g09d2 From 3be04b471b95b870bd129a138463756629e86f3f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:54:03 +0800 Subject: ftrace: Simplify seqfile code Use seq_release_private(). 
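For reference, seq_release_private() is expected to be roughly equivalent to the open-coded release it replaces here: free the iterator hung off the seq_file's ->private and then do the normal seq_release(). A hedged sketch modeled on fs/seq_file.c (illustrative, not a verbatim copy):

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

int seq_release_private_sketch(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;

        kfree(seq->private);    /* what ftrace_avail_release() kfree'd by hand */
        seq->private = NULL;
        return seq_release(inode, file);
}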
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <4A891AAB.8090701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 094863416b2..1993b7186cd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1556,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -int ftrace_avail_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter = m->private; - - seq_release(inode, file); - kfree(iter); - - return 0; -} - static int ftrace_failures_open(struct inode *inode, struct file *file) { @@ -2427,14 +2416,14 @@ static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_failures_fops = { .open = ftrace_failures_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_filter_fops = { -- cgit v1.2.3-70-g09d2 From e1ac3614ff606ae03677f47459113f98a19af63c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 14 Aug 2009 15:39:10 +1000 Subject: perf_counter: Check task on counter read IPI In general, code in perf_counter.c that is called through an IPI checks, for per-task counters, that the counter's task is still the current task. This is to handle the race condition where the cpu switches from the task we want to another task in the interval between sending the IPI and the IPI arriving and being handled on the target CPU. For some reason, __perf_counter_read is missing this check, yet there is no reason why the race condition can't occur. This adds a check that the current task is the one we want. If it isn't, we just return. In that case the counter->count value should be up to date, since it will have been updated when the counter was scheduled out, which must have happened since the IPI was sent. I don't have an example of an actual failure due to this race, but it seems obvious that it could occur and we need to guard against it. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <19076.63614.277861.368125@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 534e20d14d6..b8fe7397b90 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,10 +1503,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task) */ static void __perf_counter_read(void *info) { + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * counter->count would have been updated to a recent sample + * when the counter was scheduled out. 
+ */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); -- cgit v1.2.3-70-g09d2 From de809347aeef0a68c04576c464414d0e4dce59fc Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 17 Aug 2009 05:43:01 -0400 Subject: timers: Drop write permission on /proc/timer_list /proc/timer_list and /proc/slabinfo are not supposed to be written, so there should be no write permissions on it. Signed-off-by: WANG Cong Cc: Pekka Enberg Cc: Vegard Nossum Cc: Eduard - Gabriel Munteanu Cc: linux-mm@kvack.org Cc: Christoph Lameter Cc: David Rientjes Cc: Amerigo Wang Cc: Matt Mackall Cc: Arjan van de Ven LKML-Reference: <20090817094525.6355.88682.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/time/timer_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a999b92a127..fddd69d16e0 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); + pe = proc_create("timer_list", 0444, NULL, &timer_list_fops); if (!pe) return -ENOMEM; return 0; -- cgit v1.2.3-70-g09d2 From 3a5209e3b26386fd174820ee6e8e27479b24869a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 17 Aug 2009 16:00:30 -0700 Subject: trace_skb: fix build when CONFIG_NET is not enabled Fix trace_skb_sources build when CONFIG_NET is not enabled: kernel/built-in.o: In function `probe_skb_dequeue': trace_skb_sources.c:(.text+0xd9152): undefined reference to `init_net' trace_skb_sources.c:(.text+0xd9188): undefined reference to `dev_get_by_index' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f09d7635c0e..9a2f19bf051 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -236,6 +236,7 @@ config BOOT_TRACER config SKB_SOURCES_TRACER bool "Trace skb source information" + depends on NET select GENERIC_TRACER help This tracer helps developers/sysadmins correlate skb allocation and -- cgit v1.2.3-70-g09d2 From f2d84b65b9778e8a35dd904f7d3993f0a60c9756 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 7 Aug 2009 18:55:48 +0800 Subject: ftrace: Unify effect of writing to trace_options and option/* "echo noglobal-clock > trace_options" can be used to change trace clock but "echo 0 > options/global-clock" can't. The flag toggling will be silently accepted without actually changing the clock callback. We can fix it by using set_tracer_flags() in trace_options_core_write(). 
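The essence of the fix is that both write paths funnel through a single helper, so any per-flag side effect (such as switching the clock callback for the global-clock flag) happens no matter which file was written. A hedged sketch of that pattern with made-up names (the real set_tracer_flags() in kernel/trace/trace.c differs in detail):

#include <linux/errno.h>

/* One setter owns both the flag bit and its side effects. */
static unsigned long example_trace_flags;

static void example_set_tracer_flags(unsigned int mask, int enabled)
{
        if (enabled)
                example_trace_flags |= mask;
        else
                example_trace_flags &= ~mask;
        /*
         * Per-flag side effects (e.g. switching the trace clock) belong
         * here, so they run for writes to trace_options and to the
         * per-option files alike.
         */
}

/* Both trace_options and the per-option file writers end up calling this. */
static int example_option_write(unsigned int index, unsigned long val)
{
        if (val != 0 && val != 1)
                return -EINVAL;
        example_set_tracer_flags(1 << index, val);
        return 0;
}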
Changelog: v1->v2: Simplified switch() after Li Zefan 's suggestion Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c22b40f8f57..8c358395d33 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3896,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - trace_flags &= ~(1 << index); - break; - case 1: - trace_flags |= 1 << index; - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } + set_tracer_flags(1 << index, val); *ppos += cnt; -- cgit v1.2.3-70-g09d2 From f738eb1b63edf664da1b4ac76895d988749b2f07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Aug 2009 11:32:24 +0200 Subject: perf_counter: Fix the PARISC build PARISC does not build: /home/mingo/tip/kernel/perf_counter.c: In function 'perf_counter_index': /home/mingo/tip/kernel/perf_counter.c:2016: error: 'PERF_COUNTER_INDEX_OFFSET' undeclared (first use in this function) /home/mingo/tip/kernel/perf_counter.c:2016: error: (Each undeclared identifier is reported only once /home/mingo/tip/kernel/perf_counter.c:2016: error: for each function it appears in.) As PERF_COUNTER_INDEX_OFFSET is not defined. Now, we could define it in the architecture - but lets also provide a core default of 0 (which happens to be what all but one architecture uses at the moment). Architectures that need a different index offset should set this value in their asm/perf_counter.h files. Cc: Kyle McMartin Cc: Helge Deller Cc: linux-parisc@vger.kernel.org Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8fe7397b90..36f65e2b8b5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2019,6 +2019,10 @@ int perf_counter_task_disable(void) return 0; } +#ifndef PERF_COUNTER_INDEX_OFFSET +# define PERF_COUNTER_INDEX_OFFSET 0 +#endif + static int perf_counter_index(struct perf_counter *counter) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) -- cgit v1.2.3-70-g09d2 From 69ab849439b506cd8dd2879527fdb64d95dd5211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Aug 2009 14:07:16 +0200 Subject: genirq: Wake up irq thread after action has been installed The wake_up_process() of the new irq thread in __setup_irq() is too early as the irqaction is not yet fully initialized especially action->irq is not yet set. The interrupt thread might dereference the wrong irq descriptor. Move the wakeup after the action is installed and action->irq has been set. 
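To see why the early wakeup is harmful: the irq thread resolves its descriptor from action->irq almost immediately, so waking it before new->irq has been assigned can hand it the wrong descriptor. A hedged, heavily simplified sketch of the thread side (the real irq_thread() in kernel/irq/manage.c differs):

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kthread.h>
#include <linux/sched.h>

int example_irq_thread(void *data)
{
        struct irqaction *action = data;
        /* If woken before __setup_irq() sets action->irq, this looks up
         * the wrong (typically irq 0's) descriptor. */
        struct irq_desc *desc = irq_to_desc(action->irq);

        if (!desc)
                return -EINVAL;

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();     /* normally woken by the hard irq handler */
                /* ... run action->thread_fn(action->irq, action->dev_id),
                 *     then handle any pending unmask on desc ... */
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}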
Reported-by: Michael Buesch Signed-off-by: Thomas Gleixner Tested-by: Michael Buesch --- kernel/irq/manage.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d222515a5a0..0ec9ed83173 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -607,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ get_task_struct(t); new->thread = t; - wake_up_process(t); } /* @@ -690,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) (int)(new->flags & IRQF_TRIGGER_MASK)); } + new->irq = irq; *old_ptr = new; /* Reset broken irq detection when installing new handler */ @@ -707,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); - new->irq = irq; + /* + * Strictly no need to wake it up, but hung_task complains + * when no hard interrupt wakes the thread up. + */ + if (new->thread) + wake_up_process(new->thread); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); -- cgit v1.2.3-70-g09d2 From 49a02c514d967921a908ac64e9c0ec0f0fc17fd8 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:51:52 +0200 Subject: sched: Use structure to store local data in __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105152.GB29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 165 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273..565ff775fcd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8091,6 +8091,22 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct s_data { +#ifdef CONFIG_NUMA + int sd_allnodes; + cpumask_var_t domainspan; + cpumask_var_t covered; + cpumask_var_t notcovered; +#endif + cpumask_var_t nodemask; + cpumask_var_t this_sibling_map; + cpumask_var_t this_core_map; + cpumask_var_t send_covered; + cpumask_var_t tmpmask; + struct sched_group **sched_group_nodes; + struct root_domain *rd; +}; + /* * SMT sched-domains: */ @@ -8385,54 +8401,49 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { + struct s_data d; int i, err = -ENOMEM; - struct root_domain *rd; - cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, - tmpmask; #ifdef CONFIG_NUMA - cpumask_var_t domainspan, covered, notcovered; - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + d.sd_allnodes = 0; + if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.covered, GFP_KERNEL)) goto free_domainspan; - if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL)) goto free_covered; #endif - if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL)) goto free_notcovered; - if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL)) goto free_nodemask; - if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL)) goto free_this_sibling_map; - if 
(!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL)) goto free_this_core_map; - if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL)) goto free_send_covered; #ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { + d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), + GFP_KERNEL); + if (!d.sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); goto free_tmpmask; } #endif - rd = alloc_rootdomain(); - if (!rd) { + d.rd = alloc_rootdomain(); + if (!d.rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); goto free_sched_groups; } #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes; #endif /* @@ -8441,18 +8452,20 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); + cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), + cpu_map); #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) { sd = &per_cpu(allnodes_domains, i).sd; SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, + d.tmpmask); p = sd; - sd_allnodes = 1; + d.sd_allnodes = 1; } else p = NULL; @@ -8471,11 +8484,11 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), nodemask); + cpumask_copy(sched_domain_span(sd), d.nodemask); sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask); #ifdef CONFIG_SCHED_MC p = sd; @@ -8486,7 +8499,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask); #endif #ifdef CONFIG_SCHED_SMT @@ -8498,54 +8511,54 @@ static int __build_sched_domains(const struct cpumask *cpu_map, topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_sibling_map, + cpumask_and(d.this_sibling_map, topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(this_sibling_map)) + if (i != cpumask_first(d.this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, + init_sched_build_groups(d.this_sibling_map, cpu_map, &cpu_to_cpu_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(this_core_map)) + cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map); + if (i != cpumask_first(d.this_core_map)) continue; - 
init_sched_build_groups(this_core_map, cpu_map, + init_sched_build_groups(d.this_core_map, cpu_map, &cpu_to_core_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) + cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(d.nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, + init_sched_build_groups(d.nodemask, cpu_map, &cpu_to_phys_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) { + if (d.sd_allnodes) { init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } for (i = 0; i < nr_node_ids; i++) { @@ -8553,15 +8566,15 @@ static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_group *sg, *prev; int j; - cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) { - sched_group_nodes[i] = NULL; + cpumask_clear(d.covered); + cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(d.nodemask)) { + d.sched_group_nodes[i] = NULL; continue; } - sched_domain_node_span(i, domainspan); - cpumask_and(domainspan, domainspan, cpu_map); + sched_domain_node_span(i, d.domainspan); + cpumask_and(d.domainspan, d.domainspan, cpu_map); sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, i); @@ -8570,30 +8583,30 @@ static int __build_sched_domains(const struct cpumask *cpu_map, "node %d\n", i); goto error; } - sched_group_nodes[i] = sg; - for_each_cpu(j, nodemask) { + d.sched_group_nodes[i] = sg; + for_each_cpu(j, d.nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j).sd; sd->groups = sg; } sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), nodemask); + cpumask_copy(sched_group_cpus(sg), d.nodemask); sg->next = sg; - cpumask_or(covered, covered, nodemask); + cpumask_or(d.covered, d.covered, d.nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; - cpumask_complement(notcovered, covered); - cpumask_and(tmpmask, notcovered, cpu_map); - cpumask_and(tmpmask, tmpmask, domainspan); - if (cpumask_empty(tmpmask)) + cpumask_complement(d.notcovered, d.covered); + cpumask_and(d.tmpmask, d.notcovered, cpu_map); + cpumask_and(d.tmpmask, d.tmpmask, d.domainspan); + if (cpumask_empty(d.tmpmask)) break; - cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); - if (cpumask_empty(tmpmask)) + cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d.tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group) + @@ -8605,9 +8618,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, goto error; } sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), tmpmask); + cpumask_copy(sched_group_cpus(sg), d.tmpmask); sg->next = prev->next; - cpumask_or(covered, covered, tmpmask); + cpumask_or(d.covered, d.covered, d.tmpmask); prev->next = sg; prev = sg; } @@ -8638,13 +8651,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); + init_numa_sched_groups_power(d.sched_group_nodes[i]); - if (sd_allnodes) { + if (d.sd_allnodes) { struct sched_group *sg; cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - tmpmask); + d.tmpmask); init_numa_sched_groups_power(sg); } #endif 
@@ -8659,42 +8672,42 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #else sd = &per_cpu(phys_domains, i).sd; #endif - cpu_attach_domain(sd, rd, i); + cpu_attach_domain(sd, d.rd, i); } err = 0; free_tmpmask: - free_cpumask_var(tmpmask); + free_cpumask_var(d.tmpmask); free_send_covered: - free_cpumask_var(send_covered); + free_cpumask_var(d.send_covered); free_this_core_map: - free_cpumask_var(this_core_map); + free_cpumask_var(d.this_core_map); free_this_sibling_map: - free_cpumask_var(this_sibling_map); + free_cpumask_var(d.this_sibling_map); free_nodemask: - free_cpumask_var(nodemask); + free_cpumask_var(d.nodemask); free_notcovered: #ifdef CONFIG_NUMA - free_cpumask_var(notcovered); + free_cpumask_var(d.notcovered); free_covered: - free_cpumask_var(covered); + free_cpumask_var(d.covered); free_domainspan: - free_cpumask_var(domainspan); + free_cpumask_var(d.domainspan); out: #endif return err; free_sched_groups: #ifdef CONFIG_NUMA - kfree(sched_group_nodes); + kfree(d.sched_group_nodes); #endif goto free_tmpmask; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, tmpmask); - free_rootdomain(rd); + free_sched_groups(cpu_map, d.tmpmask); + free_rootdomain(d.rd); goto free_tmpmask; #endif } -- cgit v1.2.3-70-g09d2 From 2109b99ee192764b407dc7f52babb74740eea6f9 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:53:00 +0200 Subject: sched: Separate out allocation/free/goto-hell from __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105300.GC29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 171 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 99 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 565ff775fcd..c5d1fee4236 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8107,6 +8107,23 @@ struct s_data { struct root_domain *rd; }; +enum s_alloc { + sa_sched_groups = 0, + sa_rootdomain, + sa_tmpmask, + sa_send_covered, + sa_this_core_map, + sa_this_sibling_map, + sa_nodemask, + sa_sched_group_nodes, +#ifdef CONFIG_NUMA + sa_notcovered, + sa_covered, + sa_domainspan, +#endif + sa_none, +}; + /* * SMT sched-domains: */ @@ -8394,6 +8411,77 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_sched_groups: + free_sched_groups(cpu_map, d->tmpmask); /* fall through */ + d->sched_group_nodes = NULL; + case sa_rootdomain: + free_rootdomain(d->rd); /* fall through */ + case sa_tmpmask: + free_cpumask_var(d->tmpmask); /* fall through */ + case sa_send_covered: + free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_core_map: + free_cpumask_var(d->this_core_map); /* fall through */ + case sa_this_sibling_map: + free_cpumask_var(d->this_sibling_map); /* fall through */ + case sa_nodemask: + free_cpumask_var(d->nodemask); /* fall through */ + case sa_sched_group_nodes: +#ifdef CONFIG_NUMA + kfree(d->sched_group_nodes); /* fall through */ + case sa_notcovered: + free_cpumask_var(d->notcovered); /* fall through */ + case sa_covered: + free_cpumask_var(d->covered); /* fall through */ + case sa_domainspan: + free_cpumask_var(d->domainspan); /* fall through */ +#endif + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ +#ifdef CONFIG_NUMA + if 
(!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) + return sa_none; + if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) + return sa_domainspan; + if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) + return sa_covered; + /* Allocate the per-node list of sched groups */ + d->sched_group_nodes = kcalloc(nr_node_ids, + sizeof(struct sched_group *), GFP_KERNEL); + if (!d->sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return sa_notcovered; + } + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; +#endif + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) + return sa_sched_group_nodes; + if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) + return sa_nodemask; + if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) + return sa_this_sibling_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_core_map; + if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) + return sa_send_covered; + d->rd = alloc_rootdomain(); + if (!d->rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return sa_tmpmask; + } + return sa_rootdomain; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8401,50 +8489,17 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { + enum s_alloc alloc_state = sa_none; struct s_data d; - int i, err = -ENOMEM; + int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; - if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&d.covered, GFP_KERNEL)) - goto free_domainspan; - if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL)) - goto free_covered; -#endif - - if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL)) - goto free_notcovered; - if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL)) - goto free_nodemask; - if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL)) - goto free_this_sibling_map; - if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL)) - goto free_this_core_map; - if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL)) - goto free_send_covered; - -#ifdef CONFIG_NUMA - /* - * Allocate the per-node list of sched groups - */ - d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!d.sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - goto free_tmpmask; - } #endif - d.rd = alloc_rootdomain(); - if (!d.rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - goto free_sched_groups; - } - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes; -#endif + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + alloc_state = sa_sched_groups; /* * Set up domains for cpus specified by the cpu_map. 
@@ -8675,41 +8730,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_attach_domain(sd, d.rd, i); } - err = 0; - -free_tmpmask: - free_cpumask_var(d.tmpmask); -free_send_covered: - free_cpumask_var(d.send_covered); -free_this_core_map: - free_cpumask_var(d.this_core_map); -free_this_sibling_map: - free_cpumask_var(d.this_sibling_map); -free_nodemask: - free_cpumask_var(d.nodemask); -free_notcovered: -#ifdef CONFIG_NUMA - free_cpumask_var(d.notcovered); -free_covered: - free_cpumask_var(d.covered); -free_domainspan: - free_cpumask_var(d.domainspan); -out: -#endif - return err; - -free_sched_groups: -#ifdef CONFIG_NUMA - kfree(d.sched_group_nodes); -#endif - goto free_tmpmask; + d.sched_group_nodes = NULL; /* don't free this we still need it */ + __free_domain_allocs(&d, sa_tmpmask, cpu_map); + return 0; -#ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, d.tmpmask); - free_rootdomain(d.rd); - goto free_tmpmask; -#endif + __free_domain_allocs(&d, alloc_state, cpu_map); + return -ENOMEM; } static int build_sched_domains(const struct cpumask *cpu_map) -- cgit v1.2.3-70-g09d2 From 7f4588f3aa395632fec9ba2e15a1920f0682fda0 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:54:06 +0200 Subject: sched: Separate out build of NUMA sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105406.GD29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c5d1fee4236..dd95a470837 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8482,6 +8482,37 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_rootdomain; } +static struct sched_domain *__build_numa_sched_domains(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +{ + struct sched_domain *sd = NULL; +#ifdef CONFIG_NUMA + struct sched_domain *parent; + + d->sd_allnodes = 0; + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); + d->sd_allnodes = 1; + } + parent = sd; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = parent; + if (parent) + parent->child = sd; + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8510,31 +8541,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); -#ifdef CONFIG_NUMA - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, - d.tmpmask); - p = sd; - d.sd_allnodes = 1; - } else - p = NULL; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - 
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = p; - if (p) - p->child = sd; - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); -#endif - + sd = __build_numa_sched_domains(&d, cpu_map, attr, i); p = sd; sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); -- cgit v1.2.3-70-g09d2 From 87cce6622c2ab2f0e96ecc2a37133378a7db3177 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:54:55 +0200 Subject: sched: Separate out build of CPU sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105455.GE29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dd95a470837..3d0666c5e02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8513,6 +8513,22 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, return sd; } +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), d->nodemask); + sd->parent = parent; + if (parent) + parent->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8542,15 +8558,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_map); sd = __build_numa_sched_domains(&d, cpu_map, attr, i); - p = sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d.nodemask); - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask); + sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); #ifdef CONFIG_SCHED_MC p = sd; -- cgit v1.2.3-70-g09d2 From 410c408108bb85f32fe132aaf448388af0b6da64 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:56:14 +0200 Subject: sched: Separate out build of MC sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). 
Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105614.GF29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3d0666c5e02..5c829d4ba8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8529,6 +8529,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_mc_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8559,18 +8576,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - -#ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, - cpu_coregroup_mask(i)); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask); -#endif + sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); #ifdef CONFIG_SCHED_SMT p = sd; -- cgit v1.2.3-70-g09d2 From d81735355533cd4b2bce9508d86fcad24a38cf47 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:57:03 +0200 Subject: sched: Separate out build of SMT sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105703.GG29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5c829d4ba8f..2ecec06e3f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8546,6 +8546,23 @@ static struct sched_domain *__build_mc_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_smt_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8569,7 +8586,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, * Set up domains for cpus specified by the cpu_map. 
*/ for_each_cpu(i, cpu_map) { - struct sched_domain *sd = NULL, *p; + struct sched_domain *sd; cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -8577,18 +8594,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - topology_thread_cpumask(i), cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask); -#endif + sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } #ifdef CONFIG_SCHED_SMT -- cgit v1.2.3-70-g09d2 From 0e8e85c941d8f1b43bcc2e3b8b7026cdae476c53 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:57:51 +0200 Subject: sched: Separate out build of SMT sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105751.GH29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2ecec06e3f0..43cfc6e54e9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8563,6 +8563,25 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } +static void build_sched_groups(struct s_data *d, enum sched_domain_level l, + const struct cpumask *cpu_map, int cpu) +{ + switch (l) { +#ifdef CONFIG_SCHED_SMT + case SD_LV_SIBLING: /* set up CPU (sibling) groups */ + cpumask_and(d->this_sibling_map, cpu_map, + topology_thread_cpumask(cpu)); + if (cpu == cpumask_first(d->this_sibling_map)) + init_sched_build_groups(d->this_sibling_map, cpu_map, + &cpu_to_cpu_group, + d->send_covered, d->tmpmask); + break; +#endif + default: + break; + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8597,19 +8616,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - cpumask_and(d.this_sibling_map, - topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(d.this_sibling_map)) - continue; - - init_sched_build_groups(d.this_sibling_map, cpu_map, - &cpu_to_cpu_group, - d.send_covered, d.tmpmask); + build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); } -#endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ -- cgit v1.2.3-70-g09d2 From a2af04cdbb748158043e31799b28c48272081600 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:58:38 +0200 Subject: sched: Separate out build of MC sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). 
Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105838.GI29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 43cfc6e54e9..f2c202f6629 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8576,6 +8576,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_cpu_group, d->send_covered, d->tmpmask); break; +#endif +#ifdef CONFIG_SCHED_MC + case SD_LV_MC: /* set up multi-core groups */ + cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); + if (cpu == cpumask_first(d->this_core_map)) + init_sched_build_groups(d->this_core_map, cpu_map, + &cpu_to_core_group, + d->send_covered, d->tmpmask); + break; #endif default: break; @@ -8618,21 +8627,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_MC, cpu_map, i); } -#ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(d.this_core_map)) - continue; - - init_sched_build_groups(d.this_core_map, cpu_map, - &cpu_to_core_group, - d.send_covered, d.tmpmask); - } -#endif - /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); -- cgit v1.2.3-70-g09d2 From 86548096f252bfe2065f1ea2d301e7319a16375d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:59:28 +0200 Subject: sched: Separate out build of CPU sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105928.GJ29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f2c202f6629..b09a41c93ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8586,6 +8586,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif + case SD_LV_CPU: /* set up physical groups */ + cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); + if (!cpumask_empty(d->nodemask)) + init_sched_build_groups(d->nodemask, cpu_map, + &cpu_to_phys_group, + d->send_covered, d->tmpmask); + break; default: break; } @@ -8631,15 +8638,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(d.nodemask)) - continue; - - init_sched_build_groups(d.nodemask, cpu_map, - &cpu_to_phys_group, - d.send_covered, d.tmpmask); - } + for (i = 0; i < nr_node_ids; i++) + build_sched_groups(&d, SD_LV_CPU, cpu_map, i); #ifdef CONFIG_NUMA /* Set up node groups */ -- cgit v1.2.3-70-g09d2 From de616e36c700dc312d9021dd75f769c463f85122 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:00:13 +0200 Subject: sched: Separate out build of ALLNODES sched groups from __build_sched_domains For the sake of completeness. Now all calls to init_sched_build_groups() are contained in build_sched_groups(). 
Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110013.GK29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b09a41c93ae..52c1953bc41 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8593,6 +8593,12 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_phys_group, d->send_covered, d->tmpmask); break; +#ifdef CONFIG_NUMA + case SD_LV_ALLNODES: + init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + d->send_covered, d->tmpmask); + break; +#endif default: break; } @@ -8643,11 +8649,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ - if (d.sd_allnodes) { - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - d.send_covered, d.tmpmask); - } + if (d.sd_allnodes) + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ -- cgit v1.2.3-70-g09d2 From 0601a88d8fa4508eaa49a6d96c6685e1dece38e3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:01:11 +0200 Subject: sched: Separate out build of NUMA sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110111.GL29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 130 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 52c1953bc41..c1ce884a016 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8246,6 +8246,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } + +static int build_numa_sched_groups(struct s_data *d, + const struct cpumask *cpu_map, int num) +{ + struct sched_domain *sd; + struct sched_group *sg, *prev; + int n, j; + + cpumask_clear(d->covered); + cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); + if (cpumask_empty(d->nodemask)) { + d->sched_group_nodes[num] = NULL; + goto out; + } + + sched_domain_node_span(num, d->domainspan); + cpumask_and(d->domainspan, d->domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); + return -ENOMEM; + } + d->sched_group_nodes[num] = sg; + + for_each_cpu(j, d->nodemask) { + sd = &per_cpu(node_domains, j).sd; + sd->groups = sg; + } + + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->nodemask); + sg->next = sg; + cpumask_or(d->covered, d->covered, d->nodemask); + + prev = sg; + for (j = 0; j < nr_node_ids; j++) { + n = (num + j) % nr_node_ids; + cpumask_complement(d->notcovered, d->covered); + cpumask_and(d->tmpmask, d->notcovered, cpu_map); + cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); + if (cpumask_empty(d->tmpmask)) + break; + cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d->tmpmask)) + continue; + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + return -ENOMEM; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->tmpmask); + sg->next = prev->next; + 
cpumask_or(d->covered, d->covered, d->tmpmask); + prev->next = sg; + prev = sg; + } +out: + return 0; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA @@ -8652,70 +8717,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (d.sd_allnodes) build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - int j; - - cpumask_clear(d.covered); - cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(d.nodemask)) { - d.sched_group_nodes[i] = NULL; - continue; - } - - sched_domain_node_span(i, d.domainspan); - cpumask_and(d.domainspan, d.domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); + for (i = 0; i < nr_node_ids; i++) + if (build_numa_sched_groups(&d, cpu_map, i)) goto error; - } - d.sched_group_nodes[i] = sg; - for_each_cpu(j, d.nodemask) { - struct sched_domain *sd; - - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d.nodemask); - sg->next = sg; - cpumask_or(d.covered, d.covered, d.nodemask); - prev = sg; - - for (j = 0; j < nr_node_ids; j++) { - int n = (i + j) % nr_node_ids; - - cpumask_complement(d.notcovered, d.covered); - cpumask_and(d.tmpmask, d.notcovered, cpu_map); - cpumask_and(d.tmpmask, d.tmpmask, d.domainspan); - if (cpumask_empty(d.tmpmask)) - break; - - cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d.tmpmask)) - continue; - - sg = kmalloc_node(sizeof(struct sched_group) + - cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d.tmpmask); - sg->next = prev->next; - cpumask_or(d.covered, d.covered, d.tmpmask); - prev->next = sg; - prev = sg; - } - } #endif /* Calculate CPU power for physical packages and nodes */ -- cgit v1.2.3-70-g09d2 From 294b0c9619a0469a3b385b6fc47e79f64222a692 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:02:29 +0200 Subject: sched: Consolidate definition of variable sd in __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110229.GM29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1ce884a016..cf4c953d648 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8678,6 +8678,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, { enum s_alloc alloc_state = sa_none; struct s_data d; + struct sched_domain *sd; int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; @@ -8692,8 +8693,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, * Set up domains for cpus specified by the cpu_map. 
*/ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -8725,22 +8724,19 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - + sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i).sd; - + sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i).sd; - + sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -8759,7 +8755,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) -- cgit v1.2.3-70-g09d2 From b08dc3eba0c34027010caeda258f495074ae3a54 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Tue, 18 Aug 2009 21:57:13 +0800 Subject: Security/SELinux: remove duplicated #include Remove duplicated #include('s) in kernel/sysctl.c Signed-off-by: Huang Weiyi Acked-by: Eric Paris Signed-off-by: James Morris --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 58be76017fd..71d8dc7f992 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. 
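To make the failure mode concrete, here is a minimal, self-contained userspace sketch of that fork-setting-exec idiom (illustrative only: the set_oom_adj() helper, its error handling and the "foo-bar-cmd" binary are filled in for the example, they are not taken from the report):

[snip]
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write <value> to /proc/self/oom_adj; -17 (OOM_DISABLE) exempts the
 * calling process from the OOM killer. */
static void set_oom_adj(int value)
{
        char buf[16];
        int fd = open("/proc/self/oom_adj", O_WRONLY);

        if (fd < 0)
                return;
        write(fd, buf, snprintf(buf, sizeof(buf), "%d\n", value));
        close(fd);
}

int main(void)
{
        set_oom_adj(-17);               /* job scheduler: never OOM-killed */

        if (vfork() == 0) {
                set_oom_adj(0);         /* only the child should become killable */
                execlp("foo-bar-cmd", "foo-bar-cmd", (char *)NULL);
                _exit(1);
        }

        /* With oom_adj kept in the shared mm_struct, the child's write
         * above also cleared the parent's OOM_DISABLE, so the scheduler
         * itself became an OOM-kill candidate. */
        for (;;)
                pause();
}
[snip]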
Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 +++------ fs/proc/base.c | 19 ++--------- include/linux/mm_types.h | 2 -- include/linux/sched.h | 1 + kernel/fork.c | 1 - mm/oom_kill.c | 64 +++++++++++++++++++++++--------------- 6 files changed, 48 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fad18f9456e..ffead13f944 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes should -be killed in an out-of-memory situation. The oom_adj value is a characteristic -of the task's mm, so all threads that share an mm with pid will have the same -oom_adj value. A high value will increase the likelihood of this process being -killed by the oom-killer. Valid values are in the range -16 to +15 as -explained below and a special value of -17, which disables oom-killing -altogether for threads sharing pid's mm. +This file can be used to adjust the score used to select which processes +should be killed in an out-of-memory situation. Giving it a high score will +increase the likelihood of this process being killed by the oom-killer. Valid +values are in the range -16 to +15, plus the special value -17, which disables +oom-killing altogether for this process. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. -/proc//oom_adj cannot be changed for kthreads since they are immune from -oom-killing already. - /proc//oom_score shows process' current badness score. 
The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 175db258942..6f742f6658a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7acc8439d9b..0042090a4d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,8 +240,6 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - s8 oom_adj; /* OOM kill score adjustment (bit shift) */ - cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b..0f1ea4a6695 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1198,6 +1198,7 @@ struct task_struct { * a short time */ unsigned char fpu_counter; + s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556..144326b7af5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a9..a7b2460e922 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. 
*/ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } -- cgit v1.2.3-70-g09d2 From eda1e328556565e211b7450250e40d6de751563a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 11 Aug 2009 17:29:04 +0200 Subject: tracing: handle broken names in ftrace filter If one filter item (for set_ftrace_filter and set_ftrace_notrace) is being setup by more than 1 consecutive writes (FTRACE_ITER_CONT flag), it won't be handled corretly. 
I used following program to test/verify: [snip] #include #include #include #include #include int main(int argc, char **argv) { int fd, i; char *file = argv[1]; if (-1 == (fd = open(file, O_WRONLY))) { perror("open failed"); return -1; } for(i = 0; i < (argc - 2); i++) { int len = strlen(argv[2+i]); int cnt, off = 0; while(len) { cnt = write(fd, argv[2+i] + off, len); len -= cnt; off += cnt; } } close(fd); return 0; } [snip] before change: sh-4.0# echo > ./set_ftrace_filter sh-4.0# /test ./set_ftrace_filter "sys" "_open " sh-4.0# cat ./set_ftrace_filter #### all functions enabled #### sh-4.0# after change: sh-4.0# echo > ./set_ftrace_notrace sh-4.0# test ./set_ftrace_notrace "sys" "_open " sh-4.0# cat ./set_ftrace_notrace sys_open sh-4.0# Signed-off-by: Jiri Olsa LKML-Reference: <20090811152904.GA26065@jolsa.lab.eng.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e1d23c2630..25edd5cc593 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2278,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, read++; cnt--; - if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* + * If the parser haven't finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!(iter->flags & FTRACE_ITER_CONT)) { /* skip white space */ while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); @@ -2288,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, cnt--; } + /* only spaces were written */ if (isspace(ch)) { - file->f_pos += read; + *ppos += read; ret = read; goto out; } @@ -2319,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (ret) goto out; iter->buffer_idx = 0; - } else + } else { iter->flags |= FTRACE_ITER_CONT; + iter->buffer[iter->buffer_idx++] = ch; + } - - file->f_pos += read; - + *ppos += read; ret = read; out: mutex_unlock(&ftrace_regex_lock); -- cgit v1.2.3-70-g09d2 From de481560eb0bd9d940b90311eba85711e4b1150b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 12:32:04 -0400 Subject: kconfig: keep config.gz around even if CONFIG_IKCONFIG_PROC is not set If CONFIG_IKCONFIG is set but CONFIG_IKCONFIG_PROC is not, then gcc will optimize the config.gz out, because nobody uses it. This patch adds "__used" to the config.gz data to keep it around so that code like extract-ikconfig can still find it. 
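The net effect can be sketched in isolation (the array name mirrors the generated header; the marker strings here are placeholders, not the kernel's actual magic values):

/* Nothing references this array when CONFIG_IKCONFIG_PROC is off, so
 * without __attribute__((used)), which the kernel spells __used, gcc is
 * free to drop it at -O2.  With the attribute the bytes stay in the
 * object file, and extract-ikconfig can still locate them by scanning
 * the image for the surrounding magic markers. */
static const char kernel_config_data[] __attribute__((used)) =
        "MAGIC_START" "...gzipped .config bytes..." "MAGIC_END";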
[ Impact: allow extract-ikconfig to find config.gz ] Signed-off-by: Steven Rostedt --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2093a691f1c..d0c84e6bf50 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -119,7 +119,7 @@ $(obj)/config_data.gz: .config FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) -- cgit v1.2.3-70-g09d2 From d0981a1b21a03866c8da7f44e35e389c2e0d6061 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 11:26:09 +0200 Subject: clocksource: Protect the watchdog rating changes with clocksource_mutex Martin pointed out that commit 6ea41d2529 (clocksource: Call clocksource_change_rating() outside of watchdog_lock) has a theoretical reference count problem. The calls to clocksource_change_rating() are now done outside of the clocksource mutex and outside of the watchdog lock. A concurrent clocksource_unregister() could remove the clock. Split out the code which changes the rating from clocksource_change_rating() into __clocksource_change_rating(). Protect the clocksource_watchdog_work() code sequence with the clocksource_mutex() and call __clocksource_change_rating(). LKML-Reference: Signed-off-by: Thomas Gleixner Cc: Martin Schwidefsky --- kernel/time/clocksource.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 02dc22d888f..c6bff11f795 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -131,6 +131,7 @@ static cycle_t watchdog_last; static int watchdog_running; static void clocksource_watchdog_work(struct work_struct *work); +static void __clocksource_change_rating(struct clocksource *cs, int rating); /* * Interval: 0.5sec Threshold: 0.0625s @@ -309,6 +310,7 @@ static void clocksource_watchdog_work(struct work_struct *work) unsigned long flags; LIST_HEAD(unstable); + mutex_lock(&clocksource_mutex); spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) if (cs->flags & CLOCK_SOURCE_UNSTABLE) { @@ -322,8 +324,9 @@ static void clocksource_watchdog_work(struct work_struct *work) /* Needs to be done outside of watchdog lock */ list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { list_del_init(&cs->wd_list); - clocksource_change_rating(cs, 0); + __clocksource_change_rating(cs, 0); } + mutex_unlock(&clocksource_mutex); } #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -470,16 +473,21 @@ int clocksource_register(struct clocksource *cs) } EXPORT_SYMBOL(clocksource_register); +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); + clocksource_select(); +} + /** * clocksource_change_rating - Change the rating of a registered clocksource */ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); - clocksource_select(); + __clocksource_change_rating(cs, rating); mutex_unlock(&clocksource_mutex); } 
EXPORT_SYMBOL(clocksource_change_rating); -- cgit v1.2.3-70-g09d2 From 01548f4d3e8e94caf323a4f664eb347fd34a34ab Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 18 Aug 2009 17:09:42 +0200 Subject: clocksource: Avoid clocksource watchdog circular locking dependency stop_machine from a multithreaded workqueue is not allowed because of a circular locking dependency between cpu_down and the workqueue execution. Use a kernel thread to do the clocksource downgrade. Signed-off-by: Martin Schwidefsky Cc: Peter Zijlstra Cc: john stultz LKML-Reference: <20090818170942.3ab80c91@skybase> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c6bff11f795..e0c86ad6e9f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -29,6 +29,7 @@ #include #include /* for spin_unlock_irq() using preempt_count() m68k */ #include +#include void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -130,7 +131,7 @@ static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; static int watchdog_running; -static void clocksource_watchdog_work(struct work_struct *work); +static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); /* @@ -139,6 +140,15 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating); #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void clocksource_unstable(struct clocksource *cs, int64_t delta) { printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", @@ -167,8 +177,10 @@ static void clocksource_watchdog(unsigned long data) list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ - if (cs->flags & CLOCK_SOURCE_UNSTABLE) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + schedule_work(&watchdog_work); continue; + } csnow = cs->read(cs); @@ -304,7 +316,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { struct clocksource *cs, *tmp; unsigned long flags; @@ -327,6 +339,7 @@ static void clocksource_watchdog_work(struct work_struct *work) __clocksource_change_rating(cs, 0); } mutex_unlock(&clocksource_mutex); + return 0; } #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ -- cgit v1.2.3-70-g09d2 From 1423cc033df017c762a9155eec470da77a460141 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2009 23:06:14 -0700 Subject: rcu: Delay rcu_barrier() wait until beginning of next CPU-hotunplug operation. Ingo Molnar reported this lockup: [ 200.380003] Hangcheck: hangcheck value past margin! [ 248.192003] INFO: task S99local:2974 blocked for more than 120 seconds. [ 248.194532] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
[ 248.202330] S99local D 0000000c 6256 2974 2687 0x00000000 [ 248.208929] 9c7ebe90 00000086 6b67ef8b 0000000c 9f25a610 81a69869 00000001 820b6990 [ 248.216123] 820b6990 820b6990 9c6e4c20 9c6e4eb4 82c78990 00000000 6b993559 0000000c [ 248.220616] 9c7ebe90 8105f22a 9c6e4eb4 9c6e4c20 00000001 9c7ebe98 9c7ebeb4 81a65cb3 [ 248.229990] Call Trace: [ 248.234049] [<81a69869>] ? _spin_unlock_irqrestore+0x22/0x37 [ 248.239769] [<8105f22a>] ? prepare_to_wait+0x48/0x4e [ 248.244796] [<81a65cb3>] rcu_barrier_cpu_hotplug+0xaa/0xc9 [ 248.250343] [<8105f029>] ? autoremove_wake_function+0x0/0x38 [ 248.256063] [<81062cf2>] notifier_call_chain+0x49/0x71 [ 248.261263] [<81062da0>] raw_notifier_call_chain+0x11/0x13 [ 248.266809] [<81a0b475>] _cpu_down+0x272/0x288 [ 248.271316] [<81a0b4d5>] cpu_down+0x4a/0xa2 [ 248.275563] [<81a0c48a>] store_online+0x2a/0x5e [ 248.280156] [<81a0c460>] ? store_online+0x0/0x5e [ 248.284836] [<814ddc35>] sysdev_store+0x20/0x28 [ 248.289429] [<8112e403>] sysfs_write_file+0xb8/0xe3 [ 248.294369] [<8112e34b>] ? sysfs_write_file+0x0/0xe3 [ 248.299396] [<810e4c8f>] vfs_write+0x91/0x120 [ 248.303817] [<810e4dc1>] sys_write+0x40/0x65 [ 248.308150] [<81002d73>] sysenter_do_call+0x12/0x28 This change moves an RCU grace period delay off of the critical path for CPU-hotunplug operations. Since RCU callback migration is only performed on CPU-hotunplug operations, and since the rcu_barrier() race is provoked only by consecutive CPU-hotunplug operations, it is not necessary to delay the end of a given CPU-hotunplug operation. We can instead choose to delay the beginning of the next CPU-hotunplug operation. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: Josh Triplett Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: hugh.dickins@tiscali.co.uk Cc: benh@kernel.crashing.org LKML-Reference: <20090819060614.GA14383@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 8df115600c2..bd5d5c8e514 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -238,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_POST_DEAD) { + } else if (action == CPU_DOWN_PREPARE) { + /* Don't need to wait until next removal operation. */ /* rcu_migrate_head is protected by cpu_add_remove_lock */ wait_migrated_callbacks(); } -- cgit v1.2.3-70-g09d2 From e6971969c331caa5c3c88cbd1be4f465b3355452 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:52:25 +0800 Subject: tracing/syscalls: Fix fields format for enter events The "format" file of a trace event is originally for parsers to parse ftrace binary output. But the "format" file of a syscall event can only be used by perfcounter, because it describes the format of struct syscall_enter_record not struct syscall_trace_enter. To fix this, we remove struct syscall_enter_record, and then struct syscall_trace_enter will be used by both perf profile and ftrace. 
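For reference, the shared record has roughly this shape (a sketch inferred from the patch below, not a verbatim copy of the tracing headers):

struct syscall_trace_enter {
        struct trace_entry      ent;    /* common fields: type, flags, pid, ... */
        int                     nr;     /* syscall number */
        unsigned long           args[]; /* raw syscall arguments, nb_args entries */
};

Both the ftrace ring-buffer path and prof_syscall_enter() now fill in this one layout, so the "format" file describes the bytes that are actually recorded.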
Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAF39.1030404@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 51 ++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f130dacfeef..d10daf0922b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -90,26 +90,39 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } +extern char *__bad_type_size(void); + +#define SYSCALL_FIELD(type, name) \ + sizeof(type) != sizeof(trace.name) ? \ + __bad_type_size() : \ + #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) { int i; int nr; - int ret = 0; + int ret; struct syscall_metadata *entry; - int offset = sizeof(struct trace_entry); + struct syscall_trace_enter trace; + int offset = offsetof(struct syscall_trace_enter, args); - nr = syscall_name_to_nr((char *)call->data); + nr = syscall_name_to_nr(call->data); entry = syscall_nr_to_meta(nr); if (!entry) - return ret; + return 0; + + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + SYSCALL_FIELD(int, nr)); + if (!ret) + return 0; for (i = 0; i < entry->nb_args; i++) { ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%lu;\n", offset, + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, sizeof(unsigned long)); if (!ret) return 0; @@ -118,7 +131,7 @@ int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) trace_seq_printf(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%lulx%s", entry->args[i], + ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], sizeof(unsigned long), i == entry->nb_args - 1 ? 
"\", " : ", "); if (!ret) @@ -287,16 +300,6 @@ struct trace_event event_syscall_exit = { #ifdef CONFIG_EVENT_PROFILE -struct syscall_enter_record { - struct trace_entry entry; - unsigned long args[0]; -}; - -struct syscall_exit_record { - struct trace_entry entry; - unsigned long ret; -}; - static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); static int sys_prof_refcount_enter; @@ -304,7 +307,7 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { - struct syscall_enter_record *rec; + struct syscall_trace_enter *rec; struct syscall_metadata *sys_data; int syscall_nr; int size; @@ -328,9 +331,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - rec = (struct syscall_enter_record *) raw_data; - tracing_generic_entry_update(&rec->entry, 0, 0); - rec->entry.type = sys_data->enter_id; + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); @@ -379,7 +383,7 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; - struct syscall_exit_record rec; + struct syscall_trace_exit rec; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); @@ -390,8 +394,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - tracing_generic_entry_update(&rec.entry, 0, 0); - rec.entry.type = sys_data->exit_id; + tracing_generic_entry_update(&rec.ent, 0, 0); + rec.ent.type = sys_data->exit_id; + rec.nr = syscall_nr; rec.ret = syscall_get_return_value(current, regs); perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); -- cgit v1.2.3-70-g09d2 From 10a5b66f625904ad5a2867cf7a28073e1236ff32 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:53:05 +0800 Subject: tracing/syscalls: Add fields format for exit events Add "format" file for syscall exit events: # cat events/syscalls/sys_exit_open/format name: sys_exit_open ID: 344 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field:int nr; offset:12; size:4; field:unsigned long ret; offset:16; size:4; Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAF61.3060307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/syscalls.h | 3 ++- include/trace/syscall.h | 6 ++++-- kernel/trace/trace_syscalls.c | 18 +++++++++++++++++- 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 87d06c173dd..8d57f77794e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -189,7 +189,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ .system = "syscalls", \ .event = &event_syscall_enter, \ .raw_init = init_enter_##sname, \ - .show_format = ftrace_format_syscall, \ + .show_format = syscall_enter_format, \ .regfunc = 
reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ .data = "sys"#sname, \ @@ -225,6 +225,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ .system = "syscalls", \ .event = &event_syscall_exit, \ .raw_init = init_exit_##sname, \ + .show_format = syscall_exit_format, \ .regfunc = reg_event_syscall_exit, \ .unregfunc = unreg_event_syscall_exit, \ .data = "sys"#sname, \ diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 0cb03625edd..5ce85d75d31 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -55,8 +55,10 @@ extern int reg_event_syscall_enter(void *ptr); extern void unreg_event_syscall_enter(void *ptr); extern int reg_event_syscall_exit(void *ptr); extern void unreg_event_syscall_exit(void *ptr); -extern int -ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); +extern int syscall_enter_format(struct ftrace_event_call *call, + struct trace_seq *s); +extern int syscall_exit_format(struct ftrace_event_call *call, + struct trace_seq *s); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d10daf0922b..7336b6c265d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -97,7 +97,7 @@ extern char *__bad_type_size(void); __bad_type_size() : \ #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) -int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) +int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { int i; int nr; @@ -149,6 +149,22 @@ int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) return ret; } +int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) +{ + int ret; + struct syscall_trace_exit trace; + + ret = trace_seq_printf(s, + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + SYSCALL_FIELD(int, nr), + SYSCALL_FIELD(unsigned long, ret)); + if (!ret) + return 0; + + return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); +} + void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; -- cgit v1.2.3-70-g09d2 From 14be96c9716cb8c46dca94bd890defd7856e0734 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:53:52 +0800 Subject: tracing/events: Add ftrace_event_call param to define_fields() This parameter is needed by syscall events to add define_fields() handler. 
Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAF90.6060801@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- include/trace/ftrace.h | 3 +-- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_export.c | 5 ++--- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 189806b6e69..35b3a4a5ba8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -118,7 +118,7 @@ struct ftrace_event_call { int (*raw_init)(void); int (*show_format)(struct ftrace_event_call *call, struct trace_seq *s); - int (*define_fields)(void); + int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; struct event_filter *filter; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b250b061657..4e81c9b3751 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -294,10 +294,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ -ftrace_define_fields_##call(void) \ +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ __common_field(int, type, 1); \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b568ade8f45..af8fb8ebef0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -941,7 +941,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(); + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 956d4bc675e..cf2c752a25b 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -119,7 +119,7 @@ ftrace_format_##call(struct ftrace_event_call *unused, \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(void); \ +int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \ static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ @@ -184,9 +184,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ int \ -ftrace_define_fields_##call(void) \ +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ - struct ftrace_event_call *event_call = &event_##call; \ struct args field; \ int ret; \ \ -- cgit v1.2.3-70-g09d2 From e647d6b314266adb904d4b84973eda0afa856946 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:54:32 +0800 Subject: tracing/events: Add trace_define_common_fields() Extract duplicate code. Also prepare for the later patch. 
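Together with the previous patch this lets a define_fields() callback be written generically; a sketch with a hypothetical event record (not code from this series):

static int example_define_fields(struct ftrace_event_call *call)
{
        struct example_record {                 /* hypothetical record */
                struct trace_entry      ent;
                unsigned long           value;
        } field;
        int ret;

        /* common_type, common_flags, common_pid, ... in one call */
        ret = trace_define_common_fields(call);
        if (ret)
                return ret;

        /* only the event-specific fields are left to describe */
        return trace_define_field(call, "unsigned long", "value",
                                  offsetof(struct example_record, value),
                                  sizeof(field.value), 0);
}

The syscall enter/exit events in the filtering patch below use handlers of exactly this shape.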
Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAFB8.1010304@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 8 +------- include/trace/ftrace.h | 8 +++----- kernel/trace/trace_events.c | 22 ++++++++++++++++++++++ kernel/trace/trace_export.c | 8 +++----- 4 files changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 35b3a4a5ba8..427cbae47f8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -142,6 +142,7 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, extern int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size, int is_signed); +extern int trace_define_common_fields(struct ftrace_event_call *call); #define is_signed_type(type) (((type)(-1)) < 0) @@ -166,11 +167,4 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#define __common_field(type, item, is_signed) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item), is_signed); \ - if (ret) \ - return ret; - #endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 4e81c9b3751..127400255e4 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -299,11 +299,9 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ struct ftrace_raw_##call field; \ int ret; \ \ - __common_field(int, type, 1); \ - __common_field(unsigned char, flags, 0); \ - __common_field(unsigned char, preempt_count, 0); \ - __common_field(int, pid, 1); \ - __common_field(int, tgid, 1); \ + ret = trace_define_common_fields(event_call); \ + if (ret) \ + return ret; \ \ tstruct; \ \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index af8fb8ebef0..9c7ecfb3416 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -62,6 +62,28 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#define __common_field(type, item) \ + ret = trace_define_field(call, #type, "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type)); \ + if (ret) \ + return ret; + +int trace_define_common_fields(struct ftrace_event_call *call) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + __common_field(int, tgid); + + return ret; +} + #ifdef CONFIG_MODULES static void trace_destroy_fields(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index cf2c752a25b..70875303ae4 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -189,11 +189,9 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type, 0); \ - __common_field(unsigned char, flags, 0); \ - __common_field(unsigned char, preempt_count, 0); \ - __common_field(int, pid, 1); \ - __common_field(int, tgid, 1); \ + ret = trace_define_common_fields(event_call); \ + if (ret) \ + return ret; \ \ tstruct; \ \ -- cgit v1.2.3-70-g09d2 From 540b7b8d65575c80162f2a0f38e1d313c92a6042 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:54:51 +0800 Subject: tracing/syscalls: Add filtering support Add filtering support for syscall events: # echo 'mode == 
0666' > events/syscalls/sys_enter_open # echo 'ret == 0' > events/syscalls/sys_exit_open # echo 1 > events/syscalls/sys_enter_open # echo 1 > events/syscalls/sys_exit_open # cat trace ... modprobe-3084 [001] 117.463140: sys_open(filename: 917d3e8, flags: 0, mode: 1b6) modprobe-3084 [001] 117.463176: sys_open -> 0x0 less-3086 [001] 117.510455: sys_open(filename: 9c6bdb8, flags: 8000, mode: 1b6) sendmail-2574 [001] 122.145840: sys_open(filename: b807a365, flags: 0, mode: 1b6) ... Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAFCB.1040006@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 5 +++-- include/linux/syscalls.h | 16 +++++++++----- include/trace/syscall.h | 7 ++++++ kernel/trace/trace_events.c | 5 +++-- kernel/trace/trace_syscalls.c | 51 +++++++++++++++++++++++++++++++++++++++---- 5 files changed, 71 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 427cbae47f8..df5b085c415 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -140,8 +140,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event); -extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed); +extern int trace_define_field(struct ftrace_event_call *call, + const char *type, const char *name, + int offset, int size, int is_signed); extern int trace_define_common_fields(struct ftrace_event_call *call); #define is_signed_type(type) (((type)(-1)) < 0) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8d57f77794e..f124c899555 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -190,6 +190,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ .event = &event_syscall_enter, \ .raw_init = init_enter_##sname, \ .show_format = syscall_enter_format, \ + .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ .data = "sys"#sname, \ @@ -226,6 +227,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ .event = &event_syscall_exit, \ .raw_init = init_exit_##sname, \ .show_format = syscall_exit_format, \ + .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ .unregfunc = unreg_event_syscall_exit, \ .data = "sys"#sname, \ @@ -233,6 +235,8 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ } #define SYSCALL_METADATA(sname, nb) \ + SYSCALL_TRACE_ENTER_EVENT(sname); \ + SYSCALL_TRACE_EXIT_EVENT(sname); \ static const struct syscall_metadata __used \ __attribute__((__aligned__(4))) \ __attribute__((section("__syscalls_metadata"))) \ @@ -241,20 +245,22 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ .nb_args = nb, \ .types = types_##sname, \ .args = args_##sname, \ - }; \ - SYSCALL_TRACE_ENTER_EVENT(sname); \ - SYSCALL_TRACE_EXIT_EVENT(sname); + .enter_event = &event_enter_##sname, \ + .exit_event = &event_exit_##sname, \ + }; #define SYSCALL_DEFINE0(sname) \ + SYSCALL_TRACE_ENTER_EVENT(_##sname); \ + SYSCALL_TRACE_EXIT_EVENT(_##sname); \ static const struct syscall_metadata __used \ __attribute__((__aligned__(4))) \ __attribute__((section("__syscalls_metadata"))) \ __syscall_meta_##sname = { \ .name = "sys_"#sname, \ .nb_args = 0, \ + .enter_event = 
&event_enter__##sname, \ + .exit_event = &event_exit__##sname, \ }; \ - SYSCALL_TRACE_ENTER_EVENT(_##sname); \ - SYSCALL_TRACE_EXIT_EVENT(_##sname); \ asmlinkage long sys_##sname(void) #else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 5ce85d75d31..9661dd406b9 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -34,6 +34,8 @@ DECLARE_TRACE_WITH_CALLBACK(syscall_exit, * @args: list of args as strings (args[i] matches types[i]) * @enter_id: associated ftrace enter event id * @exit_id: associated ftrace exit event id + * @enter_event: associated syscall_enter trace event + * @exit_event: associated syscall_exit trace event */ struct syscall_metadata { const char *name; @@ -42,6 +44,9 @@ struct syscall_metadata { const char **args; int enter_id; int exit_id; + + struct ftrace_event_call *enter_event; + struct ftrace_event_call *exit_event; }; #ifdef CONFIG_FTRACE_SYSCALLS @@ -59,6 +64,8 @@ extern int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s); extern int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s); +extern int syscall_enter_define_fields(struct ftrace_event_call *call); +extern int syscall_exit_define_fields(struct ftrace_event_call *call); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9c7ecfb3416..79d352027a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,8 +27,8 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed) +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -83,6 +83,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) return ret; } +EXPORT_SYMBOL_GPL(trace_define_common_fields); #ifdef CONFIG_MODULES diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7336b6c265d..28e4dae4af2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -165,6 +165,49 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); } +int syscall_enter_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_enter trace; + struct syscall_metadata *meta; + int ret; + int nr; + int i; + int offset = offsetof(typeof(trace), args); + + nr = syscall_name_to_nr(call->data); + meta = syscall_nr_to_meta(nr); + + if (!meta) + return 0; + + ret = trace_define_common_fields(call); + if (ret) + return ret; + + for (i = 0; i < meta->nb_args; i++) { + ret = trace_define_field(call, meta->types[i], + meta->args[i], offset, + sizeof(unsigned long), 0); + offset += sizeof(unsigned long); + } + + return ret; +} + +int syscall_exit_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_exit trace; + int ret; + + ret = trace_define_common_fields(call); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0); + + return ret; +} + void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; @@ -192,8 +235,8 @@ void ftrace_syscall_enter(struct pt_regs 
*regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(sys_data->enter_event, entry, event)) + trace_current_buffer_unlock_commit(event, 0, 0); } void ftrace_syscall_exit(struct pt_regs *regs, long ret) @@ -220,8 +263,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(sys_data->exit_event, entry, event)) + trace_current_buffer_unlock_commit(event, 0, 0); } int reg_event_syscall_enter(void *ptr) -- cgit v1.2.3-70-g09d2 From f833bab87fca5c3ce13778421b1365845843b976 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 17 Aug 2009 14:34:59 -0700 Subject: clockevent: Prevent dead lock on clockevents_lock Currently clockevents_notify() is called with interrupts enabled at some places and interrupts disabled at some other places. This results in a deadlock in this scenario. cpu A holds clockevents_lock in clockevents_notify() with irqs enabled cpu B waits for clockevents_lock in clockevents_notify() with irqs disabled cpu C doing set_mtrr() which will try to rendezvous of all the cpus. This will result in C and A come to the rendezvous point and waiting for B. B is stuck forever waiting for the spinlock and thus not reaching the rendezvous point. Fix the clockevents code so that clockevents_lock is taken with interrupts disabled and thus avoid the above deadlock. Also call lapic_timer_propagate_broadcast() on the destination cpu so that we avoid calling smp_call_function() in the clockevents notifier chain. This issue left us wondering if we need to change the MTRR rendezvous logic to use stop machine logic (instead of smp_call_function) or add a check in spinlock debug code to see if there are other spinlocks which gets taken under both interrupts enabled/disabled conditions. Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Cc: "Pallipadi Venkatesh" Cc: "Brown Len" LKML-Reference: <1250544899.2709.210.camel@sbs-t61.sc.intel.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 6 +----- drivers/acpi/processor_idle.c | 6 ++++-- kernel/time/clockevents.c | 16 ++++++++++------ kernel/time/tick-broadcast.c | 7 +++---- 4 files changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994dd6a4a2a..071166a4ba8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -519,16 +519,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. 
*/ - local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); - local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 0efa59e7e3a..66393d5c4c7 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -162,8 +162,9 @@ static void lapic_timer_check_state(int state, struct acpi_processor *pr, pr->power.timer_broadcast_on_state = state; } -static void lapic_timer_propagate_broadcast(struct acpi_processor *pr) +static void lapic_timer_propagate_broadcast(void *arg) { + struct acpi_processor *pr = (struct acpi_processor *) arg; unsigned long reason; reason = pr->power.timer_broadcast_on_state < INT_MAX ? @@ -635,7 +636,8 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) working++; } - lapic_timer_propagate_broadcast(pr); + smp_call_function_single(pr->id, lapic_timer_propagate_broadcast, + pr, 1); return (working); } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a6dcd67b041..620b58abdc3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, */ int clockevents_register_notifier(struct notifier_block *nb) { + unsigned long flags; int ret; - spin_lock(&clockevents_lock); + spin_lock_irqsave(&clockevents_lock, flags); ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock(&clockevents_lock); + spin_unlock_irqrestore(&clockevents_lock, flags); return ret; } @@ -178,16 +179,18 @@ static void clockevents_notify_released(void) */ void clockevents_register_device(struct clock_event_device *dev) { + unsigned long flags; + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - spin_lock(&clockevents_lock); + spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); clockevents_notify_released(); - spin_unlock(&clockevents_lock); + spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old, void clockevents_notify(unsigned long reason, void *arg) { struct list_head *node, *tmp; + unsigned long flags; - spin_lock(&clockevents_lock); + spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); switch (reason) { @@ -251,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg) default: break; } - spin_unlock(&clockevents_lock); + spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); #endif diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 877dbedc311..c2ec25087a3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) * Powerstate information: The system enters/leaves a state, where * affected devices might stop */ -static void tick_do_broadcast_on_off(void *why) +static void tick_do_broadcast_on_off(unsigned long *reason) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags, *reason = why; + unsigned long flags; int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int 
*oncpu) printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else - smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1); + tick_do_broadcast_on_off(&reason); } /* -- cgit v1.2.3-70-g09d2 From a15098c90df1ac2b1bfe1d33dd1c47063213aa9a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 9 Aug 2009 19:02:51 +0000 Subject: powerpc: Enable GCOV Make it possible to enable GCOV code coverage measurement on powerpc. Lightly tested on 64-bit, seems to work as expected. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/Makefile | 7 +++++++ arch/powerpc/kernel/vdso32/Makefile | 1 + arch/powerpc/kernel/vdso64/Makefile | 2 ++ arch/powerpc/xmon/Makefile | 2 ++ kernel/gcov/Kconfig | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 035946f9d5f..720e8c3b8e9 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -115,6 +115,13 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) obj-y += ppc_save_regs.o endif +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_prom_init.o := n +GCOV_PROFILE_ftrace.o := n +GCOV_PROFILE_machine_kexec_64.o := n +GCOV_PROFILE_machine_kexec_32.o := n +GCOV_PROFILE_kprobes.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index c3d57bd01a8..b54b8168813 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -12,6 +12,7 @@ endif targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) +GCOV_PROFILE := n EXTRA_CFLAGS := -shared -fno-common -fno-builtin EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index fa7f1b8f3e5..dd0c8e93677 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -7,6 +7,8 @@ obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +GCOV_PROFILE := n + EXTRA_CFLAGS := -shared -fno-common -fno-builtin EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ $(call ld-option, -Wl$(comma)--hash-style=sysv) diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 85ab97ab840..faa81b6a661 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -2,6 +2,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +GCOV_PROFILE := n + ifdef CONFIG_PPC64 EXTRA_CFLAGS += -mno-minimal-toc endif diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcfaa3d..654efd09f6a 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3-70-g09d2 From 4539f07701b3f743580d19dc5d655fb8d21b0a3c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 20 Aug 2009 16:13:35 +0800 Subject: tracing/syscalls: Fix the output of syscalls with no arguments Before: # echo 1 > events/syscalls/sys_enter_sync/enable # cat events/syscalls/sys_enter_sync/format ... 
field:int nr; offset:12; size:4; print fmt: "# sync # cat trace ... sync-8950 [000] 2366.087670: sys_sync( After: # echo 1 > events/syscalls/sys_enter_sync/enable # cat events/syscalls/sys_enter_sync/format ... field:int nr; offset:12; size:4; print fmt: "" # sync # cat trace sync-2134 [001] 136.780735: sys_sync() Reported-by: Masami Hiramatsu Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Jason Baron Cc: Masami Hiramatsu LKML-Reference: <4A8D05AF.20103@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 28e4dae4af2..46c1b977a2c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -46,15 +46,22 @@ print_syscall_enter(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], trace->args[i], - i == entry->nb_args - 1 ? ")" : ","); + i == entry->nb_args - 1 ? "" : ", "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + ret = trace_seq_putc(s, ')'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + end: - trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } @@ -129,24 +136,24 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) offset += sizeof(unsigned long); } - trace_seq_printf(s, "\nprint fmt: \""); + trace_seq_puts(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? "\", " : ", "); + i == entry->nb_args - 1 ? "" : ", "); if (!ret) return 0; } + trace_seq_putc(s, '"'); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "((unsigned long)(REC->%s))%s", - entry->args[i], - i == entry->nb_args - 1 ? "\n" : ", "); + ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", + entry->args[i]); if (!ret) return 0; } - return ret; + return trace_seq_putc(s, '\n'); } int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) -- cgit v1.2.3-70-g09d2 From cde7e5ca4e329a157108769d1f752d191cbb71c6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 18 Aug 2009 13:01:01 +0900 Subject: sched: Use for_each_class macro in move_one_task() Replace for loop with the macro for_each_class to cleanup. 
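The macro is presumably just the same walk, packaged; roughly (a sketch, not necessarily the exact definition in sched.c):

/* Iterate the scheduling classes from the highest-priority class down
 * the singly linked ->next chain. */
#define for_each_class(class) \
        for (class = sched_class_highest; class; class = class->next)

Using it keeps move_one_task() consistent with the other class walks in the scheduler.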
Signed-off-by: Hiroshi Shimamoto LKML-Reference: <4A8A277D.4090304@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7f83be35d65..1b529efe887 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3461,9 +3461,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, { const struct sched_class *class; - for (class = sched_class_highest; class; class = class->next) + for_each_class(class) { if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; + } return 0; } -- cgit v1.2.3-70-g09d2 From a8af7246c114bfd939e539f9566b872c06f6225c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 13:58:54 +0200 Subject: sched: Avoid division by zero Patch a5004278f0525dcb9aa43703ef77bf371ea837cd (sched: Fix cgroup smp fairness) introduced the possibility of a divide-by-zero because load-balancing is not synchronized between sched_domains. This can cause the state of cpus to change between the first and second loop over the sched domain in tg_shares_up(). Reported-by: Yinghai Lu Signed-off-by: Peter Zijlstra Cc: Jes Sorensen Cc: Jens Axboe Cc: Linus Torvalds LKML-Reference: <1250855934.7538.30.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1b529efe887..8f8a98eab9d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1522,7 +1522,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); */ static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) + unsigned long sd_shares, unsigned long sd_rq_weight, + unsigned long sd_eff_weight) { unsigned long rq_weight; unsigned long shares; @@ -1535,13 +1536,15 @@ update_group_shares_cpu(struct task_group *tg, int cpu, if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; + if (sd_rq_weight == sd_eff_weight) + sd_eff_weight += NICE_0_LOAD; + sd_rq_weight = sd_eff_weight; } /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * + * \Sum_j shares_j * rq_weight_i + * shares_i = ----------------------------- + * \Sum_j rq_weight_j */ shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); @@ -1593,14 +1596,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) { - unsigned long sd_rq_weight = rq_weight; - - if (!tg->cfs_rq[i]->rq_weight) - sd_rq_weight = eff_weight; - - update_group_shares_cpu(tg, i, shares, sd_rq_weight); - } + for_each_cpu(i, sched_domain_span(sd)) + update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight); return 0; } -- cgit v1.2.3-70-g09d2 From 4464fcaa9cbfc9c551956b48af203e2f775ca892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 17:19:36 +0200 Subject: perf_counter: Fix typo in read() output generation When you iterate a list, using the iterator is useful. 
Before: ID: 5 ID: 5 ID: 5 ID: 5 EVNT: 0x40088b scale: nan ID: 5 CNT: 1006252 ID: 6 CNT: 1011090 ID: 7 CNT: 1011196 ID: 8 CNT: 1011095 EVNT: 0x40088c scale: 1.000000 ID: 5 CNT: 2003065 ID: 6 CNT: 2011671 ID: 7 CNT: 2012620 ID: 8 CNT: 2013479 EVNT: 0x40088c scale: 1.000000 ID: 5 CNT: 3002390 ID: 6 CNT: 3015996 ID: 7 CNT: 3018019 ID: 8 CNT: 3020006 EVNT: 0x40088b scale: 1.000000 ID: 5 CNT: 4002406 ID: 6 CNT: 4021120 ID: 7 CNT: 4024241 ID: 8 CNT: 4027059 After: ID: 1 ID: 2 ID: 3 ID: 4 EVNT: 0x400889 scale: nan ID: 1 CNT: 1005270 ID: 2 CNT: 1009833 ID: 3 CNT: 1010065 ID: 4 CNT: 1010088 EVNT: 0x400898 scale: nan ID: 1 CNT: 2001531 ID: 2 CNT: 2022309 ID: 3 CNT: 2022470 ID: 4 CNT: 2022627 EVNT: 0x400888 scale: 0.489467 ID: 1 CNT: 3001261 ID: 2 CNT: 3027088 ID: 3 CNT: 3027941 ID: 4 CNT: 3028762 Reported-by: stephane eranian Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey J Ashford Cc: perfmon2-devel LKML-Reference: <1250867976.7538.73.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 36f65e2b8b5..f274e195988 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1791,7 +1791,7 @@ static int perf_counter_read_group(struct perf_counter *counter, size += err; list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(counter, read_format, + err = perf_counter_read_entry(sub, read_format, buf + size); if (err < 0) return err; -- cgit v1.2.3-70-g09d2 From da15cfdae03351c689736f8d142618592e3cebc3 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 19 Aug 2009 19:13:34 -0700 Subject: time: Introduce CLOCK_REALTIME_COARSE After talking with some application writers who want very fast, but not fine-grained timestamps, I decided to try to implement new clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This is very fast as we don't have to access any hardware (which can be very painful if you're using something like the acpi_pm clocksource), and we can even use the vdso clock_gettime() method to avoid the syscall. The only trade off is you only get low-res tick grained time resolution. This isn't a new idea, I know Ingo has a patch in the -rt tree that made the vsyscall gettimeofday() return coarse grained time when the vsyscall64 sysctrl was set to 2. However this affects all applications on a system. With this method, applications can choose the proper speed/granularity trade-off for themselves. 
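As a rough user-space sketch of how an application might use the new clock ids (the local defines are only a stop-gap until libc headers pick up the values added to include/linux/time.h below):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_REALTIME_COARSE
#define CLOCK_REALTIME_COARSE	5	/* matches the values added below */
#define CLOCK_MONOTONIC_COARSE	6
#endif

int main(void)
{
	struct timespec ts;

	/* Tick-granularity wall-clock time; no clocksource access needed. */
	if (clock_gettime(CLOCK_REALTIME_COARSE, &ts) == 0)
		printf("coarse realtime: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);

	/* The reported resolution is the tick length, not the clocksource's. */
	if (clock_getres(CLOCK_REALTIME_COARSE, &ts) == 0)
		printf("coarse resolution: %ld ns\n", ts.tv_nsec);

	return 0;
}

(Link with -lrt on older glibc versions.)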
Signed-off-by: John Stultz Cc: Andi Kleen Cc: nikolag@ca.ibm.com Cc: Darren Hart Cc: arjan@infradead.org Cc: jonathan@jonmasters.org LKML-Reference: <1250734414.6897.5.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/vgtod.h | 1 + arch/x86/kernel/vsyscall_64.c | 1 + arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++--- include/linux/time.h | 4 ++++ kernel/posix-timers.c | 35 +++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69e5d2..3d61e204826 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -21,6 +21,7 @@ struct vsyscall_gtod_data { u32 shift; } clock; struct timespec wall_to_monotonic; + struct timespec wall_time_coarse; }; extern struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..cf53a78e2dc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6a40b78b46a..ee55754cc3c 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts) return 0; } +notrace static noinline int do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = read_seqbegin(>od->lock); + ts->tv_sec = gtod->wall_time_coarse.tv_sec; + ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + return 0; +} + +notrace static noinline int do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(>od->lock); + secs = gtod->wall_time_coarse.tv_sec; + ns = gtod->wall_time_coarse.tv_nsec; + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + if (likely(gtod->sysctl_enabled)) switch (clock) { case CLOCK_REALTIME: - return do_realtime(ts); + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; case CLOCK_MONOTONIC: - return do_monotonic(ts); + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } diff --git a/include/linux/time.h b/include/linux/time.h index f505988398e..256232f7e5e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,6 +110,8 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); +struct timespec __current_kernel_time(void); /* does not hold xtime_lock */ +struct timespec 
get_monotonic_coarse(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) @@ -243,6 +245,8 @@ struct itimerval { #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 #define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d052c4a..495440779ce 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 15e06defca5..03cbeb34d14 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -847,6 +847,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -862,3 +866,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} -- cgit v1.2.3-70-g09d2 From 269c861baa2fe7c114c3bc7831292758d29eb336 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:35 -0700 Subject: generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled Because of deadlock possiblities smp_call_function() is not allowed to be called with interrupts disabled. Add an exception for the cpu not yet online, as no one else can send smp call function interrupt to this cpu that is not yet online and as such deadlock condition is not possible. 
Signed-off-by: Suresh Siddha Acked-by: Nick Piggin Signed-off-by: H. Peter Anvin --- kernel/smp.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d850120..2accdf6712e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -176,6 +176,11 @@ void generic_smp_call_function_interrupt(void) struct call_function_data *data; int cpu = get_cpu(); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. @@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); -- cgit v1.2.3-70-g09d2 From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a section titled "MTRR considerations in MP systems" specifies the need for synchronizing the logical cpu's while initializing/updating MTRR. Currently Linux kernel does the synchronization of all cpu's only when a single MTRR register is programmed/updated. During an AP online (during boot/cpu-online/resume) where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm. 
This can lead to scenarios where during a dynamic cpu online, that logical cpu is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc while other logical HT sibling continue to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations) and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in system crash. Fix the MTRR initialization by doing rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpu's come online and the rendezvous process at the end of AP's bringup, will initialize the MTRR/PAT for all AP's. For dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mtrr.h | 7 +++++++ arch/x86/kernel/cpu/mtrr/main.c | 46 +++++++++++++++++++++++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++++++++ arch/x86/power/cpu.c | 2 +- kernel/cpu.c | 14 +++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..d5366ec5cb8 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,8 +121,12 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a416..7339be0aa58 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +u32 mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs inaddition to the synchronisation. 
+ */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = 1; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = 0; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..d720b7e0cf3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1153,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..417c9f5b4af 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); + mtrr_bp_restore(); #ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..f5f9485b8c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); -- cgit v1.2.3-70-g09d2 From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Aug 2009 23:38:32 +0200 Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 
17) Introduce a core framework for run-time power management of I/O devices. Add device run-time PM fields to 'struct dev_pm_info' and device run-time PM callbacks to 'struct dev_pm_ops'. Introduce a run-time PM workqueue and define some device run-time PM helper functions at the core level. Document all these things. Special thanks to Alan Stern for his help with the design and multiple detailed reviews of the pereceding versions of this patch and to Magnus Damm for testing feedback. Signed-off-by: Rafael J. Wysocki Acked-by: Magnus Damm --- Documentation/power/runtime_pm.txt | 378 ++++++++++++++ drivers/base/dd.c | 11 + drivers/base/power/Makefile | 1 + drivers/base/power/main.c | 22 +- drivers/base/power/power.h | 31 +- drivers/base/power/runtime.c | 1011 ++++++++++++++++++++++++++++++++++++ include/linux/pm.h | 101 +++- include/linux/pm_runtime.h | 114 ++++ kernel/power/Kconfig | 14 + kernel/power/main.c | 17 + 10 files changed, 1689 insertions(+), 11 deletions(-) create mode 100644 Documentation/power/runtime_pm.txt create mode 100644 drivers/base/power/runtime.c create mode 100644 include/linux/pm_runtime.h (limited to 'kernel') diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt new file mode 100644 index 00000000000..f49a33b704d --- /dev/null +++ b/Documentation/power/runtime_pm.txt @@ -0,0 +1,378 @@ +Run-time Power Management Framework for I/O Devices + +(C) 2009 Rafael J. Wysocki , Novell Inc. + +1. Introduction + +Support for run-time power management (run-time PM) of I/O devices is provided +at the power management core (PM core) level by means of: + +* The power management workqueue pm_wq in which bus types and device drivers can + put their PM-related work items. It is strongly recommended that pm_wq be + used for queuing all work items related to run-time PM, because this allows + them to be synchronized with system-wide power transitions (suspend to RAM, + hibernation and resume from system sleep states). pm_wq is declared in + include/linux/pm_runtime.h and defined in kernel/power/main.c. + +* A number of run-time PM fields in the 'power' member of 'struct device' (which + is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can + be used for synchronizing run-time PM operations with one another. + +* Three device run-time PM callbacks in 'struct dev_pm_ops' (defined in + include/linux/pm.h). + +* A set of helper functions defined in drivers/base/power/runtime.c that can be + used for carrying out run-time PM operations in such a way that the + synchronization between them is taken care of by the PM core. Bus types and + device drivers are encouraged to use these functions. + +The run-time PM callbacks present in 'struct dev_pm_ops', the device run-time PM +fields of 'struct dev_pm_info' and the core helper functions provided for +run-time PM are described below. + +2. Device Run-time PM Callbacks + +There are three device run-time PM callbacks defined in 'struct dev_pm_ops': + +struct dev_pm_ops { + ... + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + void (*runtime_idle)(struct device *dev); + ... +}; + +The ->runtime_suspend() callback is executed by the PM core for the bus type of +the device being suspended. 
The bus type's callback is then _entirely_ +_responsible_ for handling the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_suspend() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_suspend() +callback in a device driver as long as the bus type's ->runtime_suspend() knows +what to do to handle the device). + + * Once the bus type's ->runtime_suspend() callback has completed successfully + for given device, the PM core regards the device as suspended, which need + not mean that the device has been put into a low power state. It is + supposed to mean, however, that the device will not process data and will + not communicate with the CPU(s) and RAM until its bus type's + ->runtime_resume() callback is executed for it. The run-time PM status of + a device after successful execution of its bus type's ->runtime_suspend() + callback is 'suspended'. + + * If the bus type's ->runtime_suspend() callback returns -EBUSY or -EAGAIN, + the device's run-time PM status is supposed to be 'active', which means that + the device _must_ be fully operational afterwards. + + * If the bus type's ->runtime_suspend() callback returns an error code + different from -EBUSY or -EAGAIN, the PM core regards this as a fatal + error and will refuse to run the helper functions described in Section 4 + for the device, until the status of it is directly set either to 'active' + or to 'suspended' (the PM core provides special helper functions for this + purpose). + +In particular, if the driver requires remote wakeup capability for proper +functioning and device_may_wakeup() returns 'false' for the device, then +->runtime_suspend() should return -EBUSY. On the other hand, if +device_may_wakeup() returns 'true' for the device and the device is put +into a low power state during the execution of its bus type's +->runtime_suspend(), it is expected that remote wake-up (i.e. hardware mechanism +allowing the device to request a change of its power state, such as PCI PME) +will be enabled for the device. Generally, remote wake-up should be enabled +for all input devices put into a low power state at run time. + +The ->runtime_resume() callback is executed by the PM core for the bus type of +the device being woken up. The bus type's callback is then _entirely_ +_responsible_ for handling the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_resume() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_resume() +callback in a device driver as long as the bus type's ->runtime_resume() knows +what to do to handle the device). + + * Once the bus type's ->runtime_resume() callback has completed successfully, + the PM core regards the device as fully operational, which means that the + device _must_ be able to complete I/O operations as needed. The run-time + PM status of the device is then 'active'. + + * If the bus type's ->runtime_resume() callback returns an error code, the PM + core regards this as a fatal error and will refuse to run the helper + functions described in Section 4 for the device, until its status is + directly set either to 'active' or to 'suspended' (the PM core provides + special helper functions for this purpose). 
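+As an illustration of where these callbacks live (this is not part of the
+framework itself; the foo_* names and the power_up/power_down helpers are
+invented for the example), a bus type that forwards run-time PM to its
+drivers might look roughly like this:
+
+	#include <linux/device.h>
+	#include <linux/pm.h>
+
+	static void foo_bus_power_down(struct device *dev);	/* invented helpers */
+	static void foo_bus_power_up(struct device *dev);
+
+	/*
+	 * The PM core only calls the bus type's callbacks, so it is up to
+	 * the bus whether to involve the driver's own dev_pm_ops.
+	 */
+	static int foo_bus_runtime_suspend(struct device *dev)
+	{
+		const struct dev_pm_ops *ops = dev->driver ? dev->driver->pm : NULL;
+		int error = 0;
+
+		if (ops && ops->runtime_suspend)
+			error = ops->runtime_suspend(dev);
+		if (!error)
+			foo_bus_power_down(dev);	/* put the device in a low power state */
+
+		/* 0 -> 'suspended'; -EBUSY/-EAGAIN -> stays 'active'; anything else is fatal */
+		return error;
+	}
+
+	static int foo_bus_runtime_resume(struct device *dev)
+	{
+		const struct dev_pm_ops *ops = dev->driver ? dev->driver->pm : NULL;
+
+		foo_bus_power_up(dev);		/* make the device operational again */
+		return (ops && ops->runtime_resume) ? ops->runtime_resume(dev) : 0;
+	}
+
+	static const struct dev_pm_ops foo_bus_pm_ops = {
+		.runtime_suspend = foo_bus_runtime_suspend,
+		.runtime_resume  = foo_bus_runtime_resume,
+	};
+
+	static struct bus_type foo_bus_type = {
+		.name	= "foo",
+		.pm	= &foo_bus_pm_ops,
+	};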
+ +The ->runtime_idle() callback is executed by the PM core for the bus type of +given device whenever the device appears to be idle, which is indicated to the +PM core by two counters, the device's usage counter and the counter of 'active' +children of the device. + + * If any of these counters is decreased using a helper function provided by + the PM core and it turns out to be equal to zero, the other counter is + checked. If that counter also is equal to zero, the PM core executes the + device bus type's ->runtime_idle() callback (with the device as an + argument). + +The action performed by a bus type's ->runtime_idle() callback is totally +dependent on the bus type in question, but the expected and recommended action +is to check if the device can be suspended (i.e. if all of the conditions +necessary for suspending the device are satisfied) and to queue up a suspend +request for the device in that case. + +The helper functions provided by the PM core, described in Section 4, guarantee +that the following constraints are met with respect to the bus type's run-time +PM callbacks: + +(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute + ->runtime_suspend() in parallel with ->runtime_resume() or with another + instance of ->runtime_suspend() for the same device) with the exception that + ->runtime_suspend() or ->runtime_resume() can be executed in parallel with + ->runtime_idle() (although ->runtime_idle() will not be started while any + of the other callbacks is being executed for the same device). + +(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' + devices (i.e. the PM core will only execute ->runtime_idle() or + ->runtime_suspend() for the devices the run-time PM status of which is + 'active'). + +(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device + the usage counter of which is equal to zero _and_ either the counter of + 'active' children of which is equal to zero, or the 'power.ignore_children' + flag of which is set. + +(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the + PM core will only execute ->runtime_resume() for the devices the run-time + PM status of which is 'suspended'). + +Additionally, the helper functions provided by the PM core obey the following +rules: + + * If ->runtime_suspend() is about to be executed or there's a pending request + to execute it, ->runtime_idle() will not be executed for the same device. + + * A request to execute or to schedule the execution of ->runtime_suspend() + will cancel any pending requests to execute ->runtime_idle() for the same + device. + + * If ->runtime_resume() is about to be executed or there's a pending request + to execute it, the other callbacks will not be executed for the same device. + + * A request to execute ->runtime_resume() will cancel any pending or + scheduled requests to execute the other callbacks for the same device. + +3. Run-time PM Device Fields + +The following device run-time PM fields are present in 'struct dev_pm_info', as +defined in include/linux/pm.h: + + struct timer_list suspend_timer; + - timer used for scheduling (delayed) suspend request + + unsigned long timer_expires; + - timer expiration time, in jiffies (if this is different from zero, the + timer is running and will expire at that time, otherwise the timer is not + running) + + struct work_struct work; + - work structure used for queuing up requests (i.e. 
work items in pm_wq) + + wait_queue_head_t wait_queue; + - wait queue used if any of the helper functions needs to wait for another + one to complete + + spinlock_t lock; + - lock used for synchronisation + + atomic_t usage_count; + - the usage counter of the device + + atomic_t child_count; + - the count of 'active' children of the device + + unsigned int ignore_children; + - if set, the value of child_count is ignored (but still updated) + + unsigned int disable_depth; + - used for disabling the helper funcions (they work normally if this is + equal to zero); the initial value of it is 1 (i.e. run-time PM is + initially disabled for all devices) + + unsigned int runtime_error; + - if set, there was a fatal error (one of the callbacks returned error code + as described in Section 2), so the helper funtions will not work until + this flag is cleared; this is the error code returned by the failing + callback + + unsigned int idle_notification; + - if set, ->runtime_idle() is being executed + + unsigned int request_pending; + - if set, there's a pending request (i.e. a work item queued up into pm_wq) + + enum rpm_request request; + - type of request that's pending (valid if request_pending is set) + + unsigned int deferred_resume; + - set if ->runtime_resume() is about to be run while ->runtime_suspend() is + being executed for that device and it is not practical to wait for the + suspend to complete; means "start a resume as soon as you've suspended" + + enum rpm_status runtime_status; + - the run-time PM status of the device; this field's initial value is + RPM_SUSPENDED, which means that each device is initially regarded by the + PM core as 'suspended', regardless of its real hardware status + +All of the above fields are members of the 'power' member of 'struct device'. + +4. 
Run-time PM Device Helper Functions + +The following run-time PM helper functions are defined in +drivers/base/power/runtime.c and include/linux/pm_runtime.h: + + void pm_runtime_init(struct device *dev); + - initialize the device run-time PM fields in 'struct dev_pm_info' + + void pm_runtime_remove(struct device *dev); + - make sure that the run-time PM of the device will be disabled after + removing the device from device hierarchy + + int pm_runtime_idle(struct device *dev); + - execute ->runtime_idle() for the device's bus type; returns 0 on success + or error code on failure, where -EINPROGRESS means that ->runtime_idle() + is already being executed + + int pm_runtime_suspend(struct device *dev); + - execute ->runtime_suspend() for the device's bus type; returns 0 on + success, 1 if the device's run-time PM status was already 'suspended', or + error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt + to suspend the device again in future + + int pm_runtime_resume(struct device *dev); + - execute ->runtime_resume() for the device's bus type; returns 0 on + success, 1 if the device's run-time PM status was already 'active' or + error code on failure, where -EAGAIN means it may be safe to attempt to + resume the device again in future, but 'power.runtime_error' should be + checked additionally + + int pm_request_idle(struct device *dev); + - submit a request to execute ->runtime_idle() for the device's bus type + (the request is represented by a work item in pm_wq); returns 0 on success + or error code if the request has not been queued up + + int pm_schedule_suspend(struct device *dev, unsigned int delay); + - schedule the execution of ->runtime_suspend() for the device's bus type + in future, where 'delay' is the time to wait before queuing up a suspend + work item in pm_wq, in milliseconds (if 'delay' is zero, the work item is + queued up immediately); returns 0 on success, 1 if the device's PM + run-time status was already 'suspended', or error code if the request + hasn't been scheduled (or queued up if 'delay' is 0); if the execution of + ->runtime_suspend() is already scheduled and not yet expired, the new + value of 'delay' will be used as the time to wait + + int pm_request_resume(struct device *dev); + - submit a request to execute ->runtime_resume() for the device's bus type + (the request is represented by a work item in pm_wq); returns 0 on + success, 1 if the device's run-time PM status was already 'active', or + error code if the request hasn't been queued up + + void pm_runtime_get_noresume(struct device *dev); + - increment the device's usage counter + + int pm_runtime_get(struct device *dev); + - increment the device's usage counter, run pm_request_resume(dev) and + return its result + + int pm_runtime_get_sync(struct device *dev); + - increment the device's usage counter, run pm_runtime_resume(dev) and + return its result + + void pm_runtime_put_noidle(struct device *dev); + - decrement the device's usage counter + + int pm_runtime_put(struct device *dev); + - decrement the device's usage counter, run pm_request_idle(dev) and return + its result + + int pm_runtime_put_sync(struct device *dev); + - decrement the device's usage counter, run pm_runtime_idle(dev) and return + its result + + void pm_runtime_enable(struct device *dev); + - enable the run-time PM helper functions to run the device bus type's + run-time PM callbacks described in Section 2 + + int pm_runtime_disable(struct device *dev); + - prevent the run-time PM helper functions from 
running the device bus + type's run-time PM callbacks, make sure that all of the pending run-time + PM operations on the device are either completed or canceled; returns + 1 if there was a resume request pending and it was necessary to execute + ->runtime_resume() for the device's bus type to satisfy that request, + otherwise 0 is returned + + void pm_suspend_ignore_children(struct device *dev, bool enable); + - set/unset the power.ignore_children flag of the device + + int pm_runtime_set_active(struct device *dev); + - clear the device's 'power.runtime_error' flag, set the device's run-time + PM status to 'active' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero); it will fail and return error code if the device has a parent + which is not active and the 'power.ignore_children' flag of which is unset + + void pm_runtime_set_suspended(struct device *dev); + - clear the device's 'power.runtime_error' flag, set the device's run-time + PM status to 'suspended' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero) + +It is safe to execute the following helper functions from interrupt context: + +pm_request_idle() +pm_schedule_suspend() +pm_request_resume() +pm_runtime_get_noresume() +pm_runtime_get() +pm_runtime_put_noidle() +pm_runtime_put() +pm_suspend_ignore_children() +pm_runtime_set_active() +pm_runtime_set_suspended() +pm_runtime_enable() + +5. Run-time PM Initialization, Device Probing and Removal + +Initially, the run-time PM is disabled for all devices, which means that the +majority of the run-time PM helper funtions described in Section 4 will return +-EAGAIN until pm_runtime_enable() is called for the device. + +In addition to that, the initial run-time PM status of all devices is +'suspended', but it need not reflect the actual physical state of the device. +Thus, if the device is initially active (i.e. it is able to process I/O), its +run-time PM status must be changed to 'active', with the help of +pm_runtime_set_active(), before pm_runtime_enable() is called for the device. + +However, if the device has a parent and the parent's run-time PM is enabled, +calling pm_runtime_set_active() for the device will affect the parent, unless +the parent's 'power.ignore_children' flag is set. Namely, in that case the +parent won't be able to suspend at run time, using the PM core's helper +functions, as long as the child's status is 'active', even if the child's +run-time PM is still disabled (i.e. pm_runtime_enable() hasn't been called for +the child yet or pm_runtime_disable() has been called for it). For this reason, +once pm_runtime_set_active() has been called for the device, pm_runtime_enable() +should be called for it too as soon as reasonably possible or its run-time PM +status should be changed back to 'suspended' with the help of +pm_runtime_set_suspended(). + +If the default initial run-time PM status of the device (i.e. 'suspended') +reflects the actual state of the device, its bus type's or its driver's +->probe() callback will likely need to wake it up using one of the PM core's +helper functions described in Section 4. In that case, pm_runtime_resume() +should be used. Of course, for this purpose the device's run-time PM has to be +enabled earlier by calling pm_runtime_enable(). 
+ +If the device bus type's or driver's ->probe() or ->remove() callback runs +pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts, +they will fail returning -EAGAIN, because the device's usage counter is +incremented by the core before executing ->probe() and ->remove(). Still, it +may be desirable to suspend the device as soon as ->probe() or ->remove() has +finished, so the PM core uses pm_runtime_idle_sync() to invoke the device bus +type's ->runtime_idle() callback at that time. diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f0106875f01..7b34b3a48f6 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -202,7 +203,10 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); + pm_runtime_get_noresume(dev); + pm_runtime_barrier(dev); ret = really_probe(dev, drv); + pm_runtime_put_sync(dev); return ret; } @@ -245,7 +249,9 @@ int device_attach(struct device *dev) ret = 0; } } else { + pm_runtime_get_noresume(dev); ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach); + pm_runtime_put_sync(dev); } up(&dev->sem); return ret; @@ -306,6 +312,9 @@ static void __device_release_driver(struct device *dev) drv = dev->driver; if (drv) { + pm_runtime_get_noresume(dev); + pm_runtime_barrier(dev); + driver_sysfs_remove(dev); if (dev->bus) @@ -324,6 +333,8 @@ static void __device_release_driver(struct device *dev) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, dev); + + pm_runtime_put_sync(dev); } } diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 911208b8925..3ce3519e8f3 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_PM) += sysfs.o obj-$(CONFIG_PM_SLEEP) += main.o +obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1b1a786b7de..86990011277 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,16 @@ static DEFINE_MUTEX(dpm_list_mtx); */ static bool transition_started; +/** + * device_pm_init - Initialize the PM-related part of a device object + * @dev: Device object being initialized. + */ +void device_pm_init(struct device *dev) +{ + dev->power.status = DPM_ON; + pm_runtime_init(dev); +} + /** * device_pm_lock - lock the list of active devices used by the PM core */ @@ -105,6 +116,7 @@ void device_pm_remove(struct device *dev) mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); mutex_unlock(&dpm_list_mtx); + pm_runtime_remove(dev); } /** @@ -512,6 +524,7 @@ static void dpm_complete(pm_message_t state) mutex_unlock(&dpm_list_mtx); device_complete(dev, state); + pm_runtime_put_noidle(dev); mutex_lock(&dpm_list_mtx); } @@ -757,7 +770,14 @@ static int dpm_prepare(pm_message_t state) dev->power.status = DPM_PREPARING; mutex_unlock(&dpm_list_mtx); - error = device_prepare(dev, state); + pm_runtime_get_noresume(dev); + if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) { + /* Wake-up requested during system sleep transition. 
*/ + pm_runtime_put_noidle(dev); + error = -EBUSY; + } else { + error = device_prepare(dev, state); + } mutex_lock(&dpm_list_mtx); if (error) { diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index c7cb4fc3735..b8fa1aa5225 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -1,7 +1,14 @@ -static inline void device_pm_init(struct device *dev) -{ - dev->power.status = DPM_ON; -} +#ifdef CONFIG_PM_RUNTIME + +extern void pm_runtime_init(struct device *dev); +extern void pm_runtime_remove(struct device *dev); + +#else /* !CONFIG_PM_RUNTIME */ + +static inline void pm_runtime_init(struct device *dev) {} +static inline void pm_runtime_remove(struct device *dev) {} + +#endif /* !CONFIG_PM_RUNTIME */ #ifdef CONFIG_PM_SLEEP @@ -16,23 +23,33 @@ static inline struct device *to_device(struct list_head *entry) return container_of(entry, struct device, power.entry); } +extern void device_pm_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); -#else /* CONFIG_PM_SLEEP */ +#else /* !CONFIG_PM_SLEEP */ + +static inline void device_pm_init(struct device *dev) +{ + pm_runtime_init(dev); +} + +static inline void device_pm_remove(struct device *dev) +{ + pm_runtime_remove(dev); +} static inline void device_pm_add(struct device *dev) {} -static inline void device_pm_remove(struct device *dev) {} static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} -#endif +#endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c new file mode 100644 index 00000000000..38556f6cc22 --- /dev/null +++ b/drivers/base/power/runtime.c @@ -0,0 +1,1011 @@ +/* + * drivers/base/power/runtime.c - Helper functions for device run-time PM + * + * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include + +static int __pm_runtime_resume(struct device *dev, bool from_wq); +static int __pm_request_idle(struct device *dev); +static int __pm_request_resume(struct device *dev); + +/** + * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. + * @dev: Device to handle. + */ +static void pm_runtime_deactivate_timer(struct device *dev) +{ + if (dev->power.timer_expires > 0) { + del_timer(&dev->power.suspend_timer); + dev->power.timer_expires = 0; + } +} + +/** + * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. + * @dev: Device to handle. + */ +static void pm_runtime_cancel_pending(struct device *dev) +{ + pm_runtime_deactivate_timer(dev); + /* + * In case there's a request pending, make sure its work function will + * return without doing anything. + */ + dev->power.request = RPM_REQ_NONE; +} + +/** + * __pm_runtime_idle - Notify device bus type if the device can be suspended. + * @dev: Device to notify the bus type about. + * + * This function must be called under dev->power.lock with interrupts disabled. 
+ */ +static int __pm_runtime_idle(struct device *dev) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + int retval = 0; + + dev_dbg(dev, "__pm_runtime_idle()!\n"); + + if (dev->power.runtime_error) + retval = -EINVAL; + else if (dev->power.idle_notification) + retval = -EINPROGRESS; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0 + || dev->power.runtime_status != RPM_ACTIVE) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + if (dev->power.request_pending) { + /* + * If an idle notification request is pending, cancel it. Any + * other pending request takes precedence over us. + */ + if (dev->power.request == RPM_REQ_IDLE) { + dev->power.request = RPM_REQ_NONE; + } else if (dev->power.request != RPM_REQ_NONE) { + retval = -EAGAIN; + goto out; + } + } + + dev->power.idle_notification = true; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_idle) { + spin_unlock_irq(&dev->power.lock); + + dev->bus->pm->runtime_idle(dev); + + spin_lock_irq(&dev->power.lock); + } + + dev->power.idle_notification = false; + wake_up_all(&dev->power.wait_queue); + + out: + dev_dbg(dev, "__pm_runtime_idle() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_idle - Notify device bus type if the device can be suspended. + * @dev: Device to notify the bus type about. + */ +int pm_runtime_idle(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_idle(dev); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_idle); + +/** + * __pm_runtime_suspend - Carry out run-time suspend of given device. + * @dev: Device to suspend. + * @from_wq: If set, the function has been called via pm_wq. + * + * Check if the device can be suspended and run the ->runtime_suspend() callback + * provided by its bus type. If another suspend has been started earlier, wait + * for it to finish. If an idle notification or suspend request is pending or + * scheduled, cancel it. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +int __pm_runtime_suspend(struct device *dev, bool from_wq) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + struct device *parent = NULL; + bool notify = false; + int retval = 0; + + dev_dbg(dev, "__pm_runtime_suspend()%s!\n", + from_wq ? " from workqueue" : ""); + + repeat: + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + /* Pending resume requests take precedence over us. */ + if (dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + retval = -EAGAIN; + goto out; + } + + /* Other scheduled or pending requests need to be canceled. */ + pm_runtime_cancel_pending(dev); + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (dev->power.runtime_status == RPM_RESUMING + || dev->power.disable_depth > 0 + || atomic_read(&dev->power.usage_count) > 0) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + if (dev->power.runtime_status == RPM_SUSPENDING) { + DEFINE_WAIT(wait); + + if (from_wq) { + retval = -EINPROGRESS; + goto out; + } + + /* Wait for the other suspend running in parallel with us. 
*/ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_SUSPENDING) + break; + + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + goto repeat; + } + + dev->power.runtime_status = RPM_SUSPENDING; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { + spin_unlock_irq(&dev->power.lock); + + retval = dev->bus->pm->runtime_suspend(dev); + + spin_lock_irq(&dev->power.lock); + dev->power.runtime_error = retval; + } else { + retval = -ENOSYS; + } + + if (retval) { + dev->power.runtime_status = RPM_ACTIVE; + pm_runtime_cancel_pending(dev); + dev->power.deferred_resume = false; + + if (retval == -EAGAIN || retval == -EBUSY) { + notify = true; + dev->power.runtime_error = 0; + } + } else { + dev->power.runtime_status = RPM_SUSPENDED; + + if (dev->parent) { + parent = dev->parent; + atomic_add_unless(&parent->power.child_count, -1, 0); + } + } + wake_up_all(&dev->power.wait_queue); + + if (dev->power.deferred_resume) { + dev->power.deferred_resume = false; + __pm_runtime_resume(dev, false); + retval = -EAGAIN; + goto out; + } + + if (notify) + __pm_runtime_idle(dev); + + if (parent && !parent->power.ignore_children) { + spin_unlock_irq(&dev->power.lock); + + pm_request_idle(parent); + + spin_lock_irq(&dev->power.lock); + } + + out: + dev_dbg(dev, "__pm_runtime_suspend() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_suspend - Carry out run-time suspend of given device. + * @dev: Device to suspend. + */ +int pm_runtime_suspend(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_suspend(dev, false); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_suspend); + +/** + * __pm_runtime_resume - Carry out run-time resume of given device. + * @dev: Device to resume. + * @from_wq: If set, the function has been called via pm_wq. + * + * Check if the device can be woken up and run the ->runtime_resume() callback + * provided by its bus type. If another resume has been started earlier, wait + * for it to finish. If there's a suspend running in parallel with this + * function, wait for it to finish and resume the device. Cancel any scheduled + * or pending requests. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +int __pm_runtime_resume(struct device *dev, bool from_wq) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + struct device *parent = NULL; + int retval = 0; + + dev_dbg(dev, "__pm_runtime_resume()%s!\n", + from_wq ? " from workqueue" : ""); + + repeat: + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + pm_runtime_cancel_pending(dev); + + if (dev->power.runtime_status == RPM_ACTIVE) + retval = 1; + else if (dev->power.disable_depth > 0) + retval = -EAGAIN; + if (retval) + goto out; + + if (dev->power.runtime_status == RPM_RESUMING + || dev->power.runtime_status == RPM_SUSPENDING) { + DEFINE_WAIT(wait); + + if (from_wq) { + if (dev->power.runtime_status == RPM_SUSPENDING) + dev->power.deferred_resume = true; + retval = -EINPROGRESS; + goto out; + } + + /* Wait for the operation carried out in parallel with us. 
*/ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_RESUMING + && dev->power.runtime_status != RPM_SUSPENDING) + break; + + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + goto repeat; + } + + if (!parent && dev->parent) { + /* + * Increment the parent's resume counter and resume it if + * necessary. + */ + parent = dev->parent; + spin_unlock_irq(&dev->power.lock); + + pm_runtime_get_noresume(parent); + + spin_lock_irq(&parent->power.lock); + /* + * We can resume if the parent's run-time PM is disabled or it + * is set to ignore children. + */ + if (!parent->power.disable_depth + && !parent->power.ignore_children) { + __pm_runtime_resume(parent, false); + if (parent->power.runtime_status != RPM_ACTIVE) + retval = -EBUSY; + } + spin_unlock_irq(&parent->power.lock); + + spin_lock_irq(&dev->power.lock); + if (retval) + goto out; + goto repeat; + } + + dev->power.runtime_status = RPM_RESUMING; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { + spin_unlock_irq(&dev->power.lock); + + retval = dev->bus->pm->runtime_resume(dev); + + spin_lock_irq(&dev->power.lock); + dev->power.runtime_error = retval; + } else { + retval = -ENOSYS; + } + + if (retval) { + dev->power.runtime_status = RPM_SUSPENDED; + pm_runtime_cancel_pending(dev); + } else { + dev->power.runtime_status = RPM_ACTIVE; + if (parent) + atomic_inc(&parent->power.child_count); + } + wake_up_all(&dev->power.wait_queue); + + if (!retval) + __pm_request_idle(dev); + + out: + if (parent) { + spin_unlock_irq(&dev->power.lock); + + pm_runtime_put(parent); + + spin_lock_irq(&dev->power.lock); + } + + dev_dbg(dev, "__pm_runtime_resume() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_resume - Carry out run-time resume of given device. + * @dev: Device to suspend. + */ +int pm_runtime_resume(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_resume(dev, false); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_resume); + +/** + * pm_runtime_work - Universal run-time PM work function. + * @work: Work structure used for scheduling the execution of this function. + * + * Use @work to get the device object the work is to be done for, determine what + * is to be done and execute the appropriate run-time PM function. + */ +static void pm_runtime_work(struct work_struct *work) +{ + struct device *dev = container_of(work, struct device, power.work); + enum rpm_request req; + + spin_lock_irq(&dev->power.lock); + + if (!dev->power.request_pending) + goto out; + + req = dev->power.request; + dev->power.request = RPM_REQ_NONE; + dev->power.request_pending = false; + + switch (req) { + case RPM_REQ_NONE: + break; + case RPM_REQ_IDLE: + __pm_runtime_idle(dev); + break; + case RPM_REQ_SUSPEND: + __pm_runtime_suspend(dev, true); + break; + case RPM_REQ_RESUME: + __pm_runtime_resume(dev, true); + break; + } + + out: + spin_unlock_irq(&dev->power.lock); +} + +/** + * __pm_request_idle - Submit an idle notification request for given device. + * @dev: Device to handle. + * + * Check if the device's run-time PM status is correct for suspending the device + * and queue up a request to run __pm_runtime_idle() for it. + * + * This function must be called under dev->power.lock with interrupts disabled. 
+ */ +static int __pm_request_idle(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + retval = -EINVAL; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0 + || dev->power.runtime_status == RPM_SUSPENDED + || dev->power.runtime_status == RPM_SUSPENDING) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + return retval; + + if (dev->power.request_pending) { + /* Any requests other then RPM_REQ_IDLE take precedence. */ + if (dev->power.request == RPM_REQ_NONE) + dev->power.request = RPM_REQ_IDLE; + else if (dev->power.request != RPM_REQ_IDLE) + retval = -EAGAIN; + return retval; + } + + dev->power.request = RPM_REQ_IDLE; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return retval; +} + +/** + * pm_request_idle - Submit an idle notification request for given device. + * @dev: Device to handle. + */ +int pm_request_idle(struct device *dev) +{ + unsigned long flags; + int retval; + + spin_lock_irqsave(&dev->power.lock, flags); + retval = __pm_request_idle(dev); + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_request_idle); + +/** + * __pm_request_suspend - Submit a suspend request for given device. + * @dev: Device to suspend. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_suspend(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + return -EINVAL; + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0) + retval = -EAGAIN; + else if (dev->power.runtime_status == RPM_SUSPENDING) + retval = -EINPROGRESS; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval < 0) + return retval; + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* + * Pending resume requests take precedence over us, but we can + * overtake any other pending request. + */ + if (dev->power.request == RPM_REQ_RESUME) + retval = -EAGAIN; + else if (dev->power.request != RPM_REQ_SUSPEND) + dev->power.request = retval ? + RPM_REQ_NONE : RPM_REQ_SUSPEND; + return retval; + } else if (retval) { + return retval; + } + + dev->power.request = RPM_REQ_SUSPEND; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return 0; +} + +/** + * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). + * @data: Device pointer passed by pm_schedule_suspend(). + * + * Check if the time is right and execute __pm_request_suspend() in that case. + */ +static void pm_suspend_timer_fn(unsigned long data) +{ + struct device *dev = (struct device *)data; + unsigned long flags; + unsigned long expires; + + spin_lock_irqsave(&dev->power.lock, flags); + + expires = dev->power.timer_expires; + /* If 'expire' is after 'jiffies' we've been called too early. */ + if (expires > 0 && !time_after(expires, jiffies)) { + dev->power.timer_expires = 0; + __pm_request_suspend(dev); + } + + spin_unlock_irqrestore(&dev->power.lock, flags); +} + +/** + * pm_schedule_suspend - Set up a timer to submit a suspend request in future. + * @dev: Device to suspend. + * @delay: Time to wait before submitting a suspend request, in milliseconds. 
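For illustration, a driver that expects its device to stay idle for a while can drop its usage reference without an immediate idle notification and ask pm_schedule_suspend() (whose body follows) to queue a delayed suspend request instead. A hedged sketch, with an arbitrary 500 ms delay:

        pm_runtime_put_noidle(dev);     /* drop our reference, no idle notification */
        pm_schedule_suspend(dev, 500);  /* submit a suspend request in about 500 ms */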
+ */ +int pm_schedule_suspend(struct device *dev, unsigned int delay) +{ + unsigned long flags; + int retval = 0; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + if (!delay) { + retval = __pm_request_suspend(dev); + goto out; + } + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* + * Pending resume requests take precedence over us, but any + * other pending requests have to be canceled. + */ + if (dev->power.request == RPM_REQ_RESUME) { + retval = -EAGAIN; + goto out; + } + dev->power.request = RPM_REQ_NONE; + } + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (dev->power.runtime_status == RPM_SUSPENDING) + retval = -EINPROGRESS; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + dev->power.timer_expires = jiffies + msecs_to_jiffies(delay); + mod_timer(&dev->power.suspend_timer, dev->power.timer_expires); + + out: + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_schedule_suspend); + +/** + * pm_request_resume - Submit a resume request for given device. + * @dev: Device to resume. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_resume(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + return -EINVAL; + + if (dev->power.runtime_status == RPM_ACTIVE) + retval = 1; + else if (dev->power.runtime_status == RPM_RESUMING) + retval = -EINPROGRESS; + else if (dev->power.disable_depth > 0) + retval = -EAGAIN; + if (retval < 0) + return retval; + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* If non-resume request is pending, we can overtake it. */ + dev->power.request = retval ? RPM_REQ_NONE : RPM_REQ_RESUME; + return retval; + } else if (retval) { + return retval; + } + + dev->power.request = RPM_REQ_RESUME; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return retval; +} + +/** + * pm_request_resume - Submit a resume request for given device. + * @dev: Device to resume. + */ +int pm_request_resume(struct device *dev) +{ + unsigned long flags; + int retval; + + spin_lock_irqsave(&dev->power.lock, flags); + retval = __pm_request_resume(dev); + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_request_resume); + +/** + * __pm_runtime_get - Reference count a device and wake it up, if necessary. + * @dev: Device to handle. + * @sync: If set and the device is suspended, resume it synchronously. + * + * Increment the usage count of the device and if it was zero previously, + * resume it or submit a resume request for it, depending on the value of @sync. + */ +int __pm_runtime_get(struct device *dev, bool sync) +{ + int retval = 1; + + if (atomic_add_return(1, &dev->power.usage_count) == 1) + retval = sync ? pm_runtime_resume(dev) : pm_request_resume(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(__pm_runtime_get); + +/** + * __pm_runtime_put - Decrement the device's usage counter and notify its bus. + * @dev: Device to handle. + * @sync: If the device's bus type is to be notified, do that synchronously. 
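Putting the get/put helpers documented here together, a typical driver I/O path brackets hardware accesses with a reference, resuming the device synchronously when needed. The sketch below is illustrative only; foo_transfer() and foo_do_io() are hypothetical names:

static int foo_transfer(struct device *dev, void *buf, size_t len)
{
        int error;

        error = pm_runtime_get_sync(dev);       /* resume now if suspended */
        if (error < 0)
                goto out;

        error = foo_do_io(dev, buf, len);       /* hypothetical hardware access */
 out:
        pm_runtime_put(dev);    /* drop the reference and queue an idle notification */
        return error;
}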
+ * + * Decrement the usage count of the device and if it reaches zero, carry out a + * synchronous idle notification or submit an idle notification request for it, + * depending on the value of @sync. + */ +int __pm_runtime_put(struct device *dev, bool sync) +{ + int retval = 0; + + if (atomic_dec_and_test(&dev->power.usage_count)) + retval = sync ? pm_runtime_idle(dev) : pm_request_idle(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(__pm_runtime_put); + +/** + * __pm_runtime_set_status - Set run-time PM status of a device. + * @dev: Device to handle. + * @status: New run-time PM status of the device. + * + * If run-time PM of the device is disabled or its power.runtime_error field is + * different from zero, the status may be changed either to RPM_ACTIVE, or to + * RPM_SUSPENDED, as long as that reflects the actual state of the device. + * However, if the device has a parent and the parent is not active, and the + * parent's power.ignore_children flag is unset, the device's status cannot be + * set to RPM_ACTIVE, so -EBUSY is returned in that case. + * + * If successful, __pm_runtime_set_status() clears the power.runtime_error field + * and the device parent's counter of unsuspended children is modified to + * reflect the new status. If the new status is RPM_SUSPENDED, an idle + * notification request for the parent is submitted. + */ +int __pm_runtime_set_status(struct device *dev, unsigned int status) +{ + struct device *parent = dev->parent; + unsigned long flags; + bool notify_parent = false; + int error = 0; + + if (status != RPM_ACTIVE && status != RPM_SUSPENDED) + return -EINVAL; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (!dev->power.runtime_error && !dev->power.disable_depth) { + error = -EAGAIN; + goto out; + } + + if (dev->power.runtime_status == status) + goto out_set; + + if (status == RPM_SUSPENDED) { + /* It always is possible to set the status to 'suspended'. */ + if (parent) { + atomic_add_unless(&parent->power.child_count, -1, 0); + notify_parent = !parent->power.ignore_children; + } + goto out_set; + } + + if (parent) { + spin_lock_irq(&parent->power.lock); + + /* + * It is invalid to put an active child under a parent that is + * not active, has run-time PM enabled and the + * 'power.ignore_children' flag unset. + */ + if (!parent->power.disable_depth + && !parent->power.ignore_children + && parent->power.runtime_status != RPM_ACTIVE) { + error = -EBUSY; + } else { + if (dev->power.runtime_status == RPM_SUSPENDED) + atomic_inc(&parent->power.child_count); + } + + spin_unlock_irq(&parent->power.lock); + + if (error) + goto out; + } + + out_set: + dev->power.runtime_status = status; + dev->power.runtime_error = 0; + out: + spin_unlock_irqrestore(&dev->power.lock, flags); + + if (notify_parent) + pm_request_idle(parent); + + return error; +} +EXPORT_SYMBOL_GPL(__pm_runtime_set_status); + +/** + * __pm_runtime_barrier - Cancel pending requests and wait for completions. + * @dev: Device to handle. + * + * Flush all pending requests for the device from pm_wq and wait for all + * run-time PM operations involving the device in progress to complete. + * + * Should be called under dev->power.lock with interrupts disabled. 
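Since pm_runtime_init() (further down) starts every device with run-time PM disabled and a status of RPM_SUSPENDED, a bus type or driver whose device is actually powered up at probe time would typically fix up the status while run-time PM is still disabled and only then enable it, with the mirror image at removal. A hedged sketch of that pattern using the helpers from this patch:

        /* probe path, device known to be at full power: */
        pm_runtime_set_active(dev);     /* __pm_runtime_set_status(dev, RPM_ACTIVE) */
        pm_runtime_enable(dev);

        /* remove path: */
        pm_runtime_disable(dev);        /* cancels pending requests, waits for callbacks */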
+ */ +static void __pm_runtime_barrier(struct device *dev) +{ + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + dev->power.request = RPM_REQ_NONE; + spin_unlock_irq(&dev->power.lock); + + cancel_work_sync(&dev->power.work); + + spin_lock_irq(&dev->power.lock); + dev->power.request_pending = false; + } + + if (dev->power.runtime_status == RPM_SUSPENDING + || dev->power.runtime_status == RPM_RESUMING + || dev->power.idle_notification) { + DEFINE_WAIT(wait); + + /* Suspend, wake-up or idle notification in progress. */ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_SUSPENDING + && dev->power.runtime_status != RPM_RESUMING + && !dev->power.idle_notification) + break; + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + } +} + +/** + * pm_runtime_barrier - Flush pending requests and wait for completions. + * @dev: Device to handle. + * + * Prevent the device from being suspended by incrementing its usage counter and + * if there's a pending resume request for the device, wake the device up. + * Next, make sure that all pending requests for the device have been flushed + * from pm_wq and wait for all run-time PM operations involving the device in + * progress to complete. + * + * Return value: + * 1, if there was a resume request pending and the device had to be woken up, + * 0, otherwise + */ +int pm_runtime_barrier(struct device *dev) +{ + int retval = 0; + + pm_runtime_get_noresume(dev); + spin_lock_irq(&dev->power.lock); + + if (dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + __pm_runtime_resume(dev, false); + retval = 1; + } + + __pm_runtime_barrier(dev); + + spin_unlock_irq(&dev->power.lock); + pm_runtime_put_noidle(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_barrier); + +/** + * __pm_runtime_disable - Disable run-time PM of a device. + * @dev: Device to handle. + * @check_resume: If set, check if there's a resume request for the device. + * + * Increment power.disable_depth for the device and if was zero previously, + * cancel all pending run-time PM requests for the device and wait for all + * operations in progress to complete. The device can be either active or + * suspended after its run-time PM has been disabled. + * + * If @check_resume is set and there's a resume request pending when + * __pm_runtime_disable() is called and power.disable_depth is zero, the + * function will wake up the device before disabling its run-time PM. + */ +void __pm_runtime_disable(struct device *dev, bool check_resume) +{ + spin_lock_irq(&dev->power.lock); + + if (dev->power.disable_depth > 0) { + dev->power.disable_depth++; + goto out; + } + + /* + * Wake up the device if there's a resume request pending, because that + * means there probably is some I/O to process and disabling run-time PM + * shouldn't prevent the device from processing the I/O. + */ + if (check_resume && dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + /* + * Prevent suspends and idle notifications from being carried + * out after we have woken up the device. 
+ */ + pm_runtime_get_noresume(dev); + + __pm_runtime_resume(dev, false); + + pm_runtime_put_noidle(dev); + } + + if (!dev->power.disable_depth++) + __pm_runtime_barrier(dev); + + out: + spin_unlock_irq(&dev->power.lock); +} +EXPORT_SYMBOL_GPL(__pm_runtime_disable); + +/** + * pm_runtime_enable - Enable run-time PM of a device. + * @dev: Device to handle. + */ +void pm_runtime_enable(struct device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (dev->power.disable_depth > 0) + dev->power.disable_depth--; + else + dev_warn(dev, "Unbalanced %s!\n", __func__); + + spin_unlock_irqrestore(&dev->power.lock, flags); +} +EXPORT_SYMBOL_GPL(pm_runtime_enable); + +/** + * pm_runtime_init - Initialize run-time PM fields in given device object. + * @dev: Device object to initialize. + */ +void pm_runtime_init(struct device *dev) +{ + spin_lock_init(&dev->power.lock); + + dev->power.runtime_status = RPM_SUSPENDED; + dev->power.idle_notification = false; + + dev->power.disable_depth = 1; + atomic_set(&dev->power.usage_count, 0); + + dev->power.runtime_error = 0; + + atomic_set(&dev->power.child_count, 0); + pm_suspend_ignore_children(dev, false); + + dev->power.request_pending = false; + dev->power.request = RPM_REQ_NONE; + dev->power.deferred_resume = false; + INIT_WORK(&dev->power.work, pm_runtime_work); + + dev->power.timer_expires = 0; + setup_timer(&dev->power.suspend_timer, pm_suspend_timer_fn, + (unsigned long)dev); + + init_waitqueue_head(&dev->power.wait_queue); +} + +/** + * pm_runtime_remove - Prepare for removing a device from device hierarchy. + * @dev: Device object being removed from device hierarchy. + */ +void pm_runtime_remove(struct device *dev) +{ + __pm_runtime_disable(dev, false); + + /* Change the status back to 'suspended' to match the initial status. */ + if (dev->power.runtime_status == RPM_ACTIVE) + pm_runtime_set_suspended(dev); +} diff --git a/include/linux/pm.h b/include/linux/pm.h index b3f74764a58..2b6e20df0e5 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -22,6 +22,10 @@ #define _LINUX_PM_H #include +#include +#include +#include +#include /* * Callbacks for platform drivers to implement. @@ -165,6 +169,28 @@ typedef struct pm_message { * It is allowed to unregister devices while the above callbacks are being * executed. However, it is not allowed to unregister a device from within any * of its own callbacks. + * + * There also are the following callbacks related to run-time power management + * of devices: + * + * @runtime_suspend: Prepare the device for a condition in which it won't be + * able to communicate with the CPU(s) and RAM due to power management. + * This need not mean that the device should be put into a low power state. + * For example, if the device is behind a link which is about to be turned + * off, the device may remain at full power. If the device does go to low + * power and if device_may_wakeup(dev) is true, remote wake-up (i.e., a + * hardware mechanism allowing the device to request a change of its power + * state, such as PCI PME) should be enabled for it. + * + * @runtime_resume: Put the device into the fully active state in response to a + * wake-up event generated by hardware or at the request of software. If + * necessary, put the device into the full power state and restore its + * registers, so that it is fully operational. + * + * @runtime_idle: Device appears to be inactive and it might be put into a low + * power state if all of the necessary conditions are satisfied. 
Check + * these conditions and handle the device as appropriate, possibly queueing + * a suspend request for it. The return value is ignored by the PM core. */ struct dev_pm_ops { @@ -182,6 +208,9 @@ struct dev_pm_ops { int (*thaw_noirq)(struct device *dev); int (*poweroff_noirq)(struct device *dev); int (*restore_noirq)(struct device *dev); + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + int (*runtime_idle)(struct device *dev); }; /** @@ -315,14 +344,80 @@ enum dpm_state { DPM_OFF_IRQ, }; +/** + * Device run-time power management status. + * + * These status labels are used internally by the PM core to indicate the + * current status of a device with respect to the PM core operations. They do + * not reflect the actual power state of the device or its status as seen by the + * driver. + * + * RPM_ACTIVE Device is fully operational. Indicates that the device + * bus type's ->runtime_resume() callback has completed + * successfully. + * + * RPM_SUSPENDED Device bus type's ->runtime_suspend() callback has + * completed successfully. The device is regarded as + * suspended. + * + * RPM_RESUMING Device bus type's ->runtime_resume() callback is being + * executed. + * + * RPM_SUSPENDING Device bus type's ->runtime_suspend() callback is being + * executed. + */ + +enum rpm_status { + RPM_ACTIVE = 0, + RPM_RESUMING, + RPM_SUSPENDED, + RPM_SUSPENDING, +}; + +/** + * Device run-time power management request types. + * + * RPM_REQ_NONE Do nothing. + * + * RPM_REQ_IDLE Run the device bus type's ->runtime_idle() callback + * + * RPM_REQ_SUSPEND Run the device bus type's ->runtime_suspend() callback + * + * RPM_REQ_RESUME Run the device bus type's ->runtime_resume() callback + */ + +enum rpm_request { + RPM_REQ_NONE = 0, + RPM_REQ_IDLE, + RPM_REQ_SUSPEND, + RPM_REQ_RESUME, +}; + struct dev_pm_info { pm_message_t power_state; - unsigned can_wakeup:1; - unsigned should_wakeup:1; + unsigned int can_wakeup:1; + unsigned int should_wakeup:1; enum dpm_state status; /* Owned by the PM core */ -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_PM_SLEEP struct list_head entry; #endif +#ifdef CONFIG_PM_RUNTIME + struct timer_list suspend_timer; + unsigned long timer_expires; + struct work_struct work; + wait_queue_head_t wait_queue; + spinlock_t lock; + atomic_t usage_count; + atomic_t child_count; + unsigned int disable_depth:3; + unsigned int ignore_children:1; + unsigned int idle_notification:1; + unsigned int request_pending:1; + unsigned int deferred_resume:1; + enum rpm_request request; + enum rpm_status runtime_status; + int runtime_error; +#endif }; /* diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h new file mode 100644 index 00000000000..44087044910 --- /dev/null +++ b/include/linux/pm_runtime.h @@ -0,0 +1,114 @@ +/* + * pm_runtime.h - Device run-time power management helper functions. + * + * Copyright (C) 2009 Rafael J. Wysocki + * + * This file is released under the GPLv2. 
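Relating the status values above to the core routines earlier in the patch, the normal transitions are:

        RPM_ACTIVE    -> RPM_SUSPENDING -> RPM_SUSPENDED   (failed ->runtime_suspend(): back to RPM_ACTIVE)
        RPM_SUSPENDED -> RPM_RESUMING   -> RPM_ACTIVE      (failed ->runtime_resume(): back to RPM_SUSPENDED)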
+ */ + +#ifndef _LINUX_PM_RUNTIME_H +#define _LINUX_PM_RUNTIME_H + +#include +#include + +#ifdef CONFIG_PM_RUNTIME + +extern struct workqueue_struct *pm_wq; + +extern int pm_runtime_idle(struct device *dev); +extern int pm_runtime_suspend(struct device *dev); +extern int pm_runtime_resume(struct device *dev); +extern int pm_request_idle(struct device *dev); +extern int pm_schedule_suspend(struct device *dev, unsigned int delay); +extern int pm_request_resume(struct device *dev); +extern int __pm_runtime_get(struct device *dev, bool sync); +extern int __pm_runtime_put(struct device *dev, bool sync); +extern int __pm_runtime_set_status(struct device *dev, unsigned int status); +extern int pm_runtime_barrier(struct device *dev); +extern void pm_runtime_enable(struct device *dev); +extern void __pm_runtime_disable(struct device *dev, bool check_resume); + +static inline bool pm_children_suspended(struct device *dev) +{ + return dev->power.ignore_children + || !atomic_read(&dev->power.child_count); +} + +static inline void pm_suspend_ignore_children(struct device *dev, bool enable) +{ + dev->power.ignore_children = enable; +} + +static inline void pm_runtime_get_noresume(struct device *dev) +{ + atomic_inc(&dev->power.usage_count); +} + +static inline void pm_runtime_put_noidle(struct device *dev) +{ + atomic_add_unless(&dev->power.usage_count, -1, 0); +} + +#else /* !CONFIG_PM_RUNTIME */ + +static inline int pm_runtime_idle(struct device *dev) { return -ENOSYS; } +static inline int pm_runtime_suspend(struct device *dev) { return -ENOSYS; } +static inline int pm_runtime_resume(struct device *dev) { return 0; } +static inline int pm_request_idle(struct device *dev) { return -ENOSYS; } +static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) +{ + return -ENOSYS; +} +static inline int pm_request_resume(struct device *dev) { return 0; } +static inline int __pm_runtime_get(struct device *dev, bool sync) { return 1; } +static inline int __pm_runtime_put(struct device *dev, bool sync) { return 0; } +static inline int __pm_runtime_set_status(struct device *dev, + unsigned int status) { return 0; } +static inline int pm_runtime_barrier(struct device *dev) { return 0; } +static inline void pm_runtime_enable(struct device *dev) {} +static inline void __pm_runtime_disable(struct device *dev, bool c) {} + +static inline bool pm_children_suspended(struct device *dev) { return false; } +static inline void pm_suspend_ignore_children(struct device *dev, bool en) {} +static inline void pm_runtime_get_noresume(struct device *dev) {} +static inline void pm_runtime_put_noidle(struct device *dev) {} + +#endif /* !CONFIG_PM_RUNTIME */ + +static inline int pm_runtime_get(struct device *dev) +{ + return __pm_runtime_get(dev, false); +} + +static inline int pm_runtime_get_sync(struct device *dev) +{ + return __pm_runtime_get(dev, true); +} + +static inline int pm_runtime_put(struct device *dev) +{ + return __pm_runtime_put(dev, false); +} + +static inline int pm_runtime_put_sync(struct device *dev) +{ + return __pm_runtime_put(dev, true); +} + +static inline int pm_runtime_set_active(struct device *dev) +{ + return __pm_runtime_set_status(dev, RPM_ACTIVE); +} + +static inline void pm_runtime_set_suspended(struct device *dev) +{ + __pm_runtime_set_status(dev, RPM_SUSPENDED); +} + +static inline void pm_runtime_disable(struct device *dev) +{ + __pm_runtime_disable(dev, true); +} + +#endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 72067cbdb37..91e09d3b2eb 100644 --- 
a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -208,3 +208,17 @@ config APM_EMULATION random kernel OOPSes or reboots that don't seem to be related to anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on PM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. diff --git a/kernel/power/main.c b/kernel/power/main.c index f710e36930c..347d2cc88cd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "power.h" @@ -217,8 +218,24 @@ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; + +static int __init pm_start_workqueue(void) +{ + pm_wq = create_freezeable_workqueue("pm"); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif + static int __init pm_init(void) { + int error = pm_start_workqueue(); + if (error) + return error; power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 9f77da9f40045253e91f55c12d4481254b513d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:45 -0700 Subject: rcu: Move private definitions from include/linux/rcutree.h to kernel/rcutree.h Some information hiding that makes it easier to merge preemptability into rcutree without descending into #include hell. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <1250974613373-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutree.h | 211 ------------------------------------------ kernel/rcutree.c | 2 + kernel/rcutree.h | 238 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree_trace.c | 1 + 4 files changed, 241 insertions(+), 211 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d4dfd248963..e37d5e2a835 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,217 +30,6 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -#include -#include -#include -#include -#include - -/* - * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this has not been tested, so there is probably some - * bug somewhere. 
- */ -#define MAX_RCU_LVLS 3 -#define RCU_FANOUT (CONFIG_RCU_FANOUT) -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT -# define NUM_RCU_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -#elif NR_CPUS <= RCU_FANOUT_SQ -# define NUM_RCU_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -#elif NR_CPUS <= RCU_FANOUT_CUBE -# define NUM_RCU_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) -# define NUM_RCU_LVL_3 NR_CPUS -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT */ - -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - -/* - * Dynticks per-CPU state. - */ -struct rcu_dynticks { - int dynticks_nesting; /* Track nesting level, sort of. */ - int dynticks; /* Even value for dynticks-idle, else odd. */ - int dynticks_nmi; /* Even value for either dynticks-idle or */ - /* not in nmi handler, else odd. So this */ - /* remains even for nmi from irq handler. */ -}; - -/* - * Definition for node within the RCU grace-period-detection hierarchy. - */ -struct rcu_node { - spinlock_t lock; - unsigned long qsmask; /* CPUs or groups that need to switch in */ - /* order for current grace period to proceed.*/ - unsigned long qsmaskinit; - /* Per-GP initialization for qsmask. */ - unsigned long grpmask; /* Mask to apply to parent qsmask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ - u8 grpnum; /* CPU/group number for next level up. */ - u8 level; /* root is at level 0. */ - struct rcu_node *parent; -} ____cacheline_internodealigned_in_smp; - -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - -/* Per-CPU data for read-copy update. */ -struct rcu_data { - /* 1) quiescent-state and grace-period handling : */ - long completed; /* Track rsp->completed gp number */ - /* in order to detect GP end. */ - long gpnum; /* Highest gp number that this CPU */ - /* is aware of having started. */ - long passed_quiesc_completed; - /* Value of completed at time of qs. */ - bool passed_quiesc; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ - bool beenonline; /* CPU online at least once. */ - struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ - unsigned long grpmask; /* Mask to apply to leaf qsmask. */ - - /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to nxtlist, which is NULL. 
- * - * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ - long blimit; /* Upper limit on a processed batch */ - -#ifdef CONFIG_NO_HZ - /* 3) dynticks interface. */ - struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ - int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ -#endif /* #ifdef CONFIG_NO_HZ */ - - /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ -#ifdef CONFIG_NO_HZ - unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ -#endif /* #ifdef CONFIG_NO_HZ */ - unsigned long offline_fqs; /* Kicked due to being offline. */ - unsigned long resched_ipi; /* Sent a resched IPI. */ - - /* 5) __rcu_pending() statistics. */ - long n_rcu_pending; /* rcu_pending() calls since boot. */ - long n_rp_qs_pending; - long n_rp_cb_ready; - long n_rp_cpu_needs_gp; - long n_rp_gp_completed; - long n_rp_gp_started; - long n_rp_need_fqs; - long n_rp_need_nothing; - - int cpu; -}; - -/* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ -#ifdef CONFIG_NO_HZ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS -#endif /* #else #ifdef CONFIG_NO_HZ */ - -#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ - -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/* - * RCU global state, including node hierarchy. This hierarchy is - * represented in "heap" form in a dense array. The root (first level) - * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second - * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), - * and the third level in ->node[m+1] and following (->node[m+1] referenced - * by ->level[2]). The number of levels is determined by the number of - * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" - * consisting of a single rcu_node. - */ -struct rcu_state { - struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ - - /* The following fields are guarded by the root rcu_node's lock. 
*/ - - u8 signaled ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - long gpnum; /* Current gp number. */ - long completed; /* # of last completed gp. */ - spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ - spinlock_t fqslock; /* Only one task forcing */ - /* quiescent states. */ - unsigned long jiffies_force_qs; /* Time at which to invoke */ - /* force_quiescent_state(). */ - unsigned long n_force_qs; /* Number of calls to */ - /* force_quiescent_state(). */ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - unsigned long gp_start; /* Time at which GP started, */ - /* but in jiffies. */ - unsigned long jiffies_stall; /* Time at which to check */ - /* for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#ifdef CONFIG_NO_HZ - long dynticks_completed; /* Value of completed @ snap. */ -#endif /* #ifdef CONFIG_NO_HZ */ -}; - extern void rcu_qsctr_inc(int cpu); extern void rcu_bh_qsctr_inc(int cpu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 75762cddbe0..a162f859dd3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,6 +46,8 @@ #include #include +#include "rcutree.h" + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5e872bbf07f..7cc830a1c44 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -1,3 +1,239 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Ingo Molnar + * Paul E. McKenney + */ + +#include +#include +#include +#include +#include + +/* + * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this has not been tested, so there is probably some + * bug somewhere. 
+ */ +#define MAX_RCU_LVLS 3 +#define RCU_FANOUT (CONFIG_RCU_FANOUT) +#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) +#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT +# define NUM_RCU_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_SQ +# define NUM_RCU_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_CUBE +# define NUM_RCU_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_3 NR_CPUS +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + int dynticks_nesting; /* Track nesting level, sort of. */ + int dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nmi; /* Even value for either dynticks-idle or */ + /* not in nmi handler, else odd. So this */ + /* remains even for nmi from irq handler. */ +}; + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + spinlock_t lock; + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + unsigned long qsmaskinit; + /* Per-GP initialization for qsmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; +} ____cacheline_internodealigned_in_smp; + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. */ + long passed_quiesc_completed; + /* Value of completed at time of qs. */ + bool passed_quiesc; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to nxtlist, which is NULL. 
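As a concrete sizing example (assuming CONFIG_RCU_FANOUT=64 and NR_CPUS=128, values chosen purely for illustration): 128 is larger than RCU_FANOUT but not larger than RCU_FANOUT_SQ (4096), so NUM_RCU_LVLS=2, NUM_RCU_LVL_0=1, NUM_RCU_LVL_1=(128+63)/64=2 and NUM_RCU_LVL_2=128. RCU_SUM is then 1+2+128=131 and NUM_RCU_NODES=131-128=3: one root rcu_node with two leaf rcu_node children, each leaf covering up to 64 CPUs.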
+ * + * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + long qlen; /* # of queued callbacks */ + long blimit; /* Upper limit on a processed batch */ + +#ifdef CONFIG_NO_HZ + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ + int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ +#endif /* #ifdef CONFIG_NO_HZ */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long resched_ipi; /* Sent a resched IPI. */ + + /* 5) __rcu_pending() statistics. */ + long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rp_qs_pending; + long n_rp_cb_ready; + long n_rp_cpu_needs_gp; + long n_rp_gp_completed; + long n_rp_gp_started; + long n_rp_need_fqs; + long n_rp_need_nothing; + + int cpu; +}; + +/* Values for signaled field in struct rcu_state. */ +#define RCU_GP_INIT 0 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#ifdef CONFIG_NO_HZ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + + /* The following fields are guarded by the root rcu_node's lock. 
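To make the tail-pointer scheme described just above concrete: the enqueue path always appends a new callback at the RCU_NEXT_TAIL partition, roughly like this (a sketch of the pattern, not a verbatim quote of __call_rcu()):

        *rdp->nxttail[RCU_NEXT_TAIL] = head;
        rdp->nxttail[RCU_NEXT_TAIL] = &head->next;

Advancing callbacks from one partition to the next is then largely a matter of copying nxttail[] pointers; the rcu_head entries themselves do not need to be relinked.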
*/ + + u8 signaled ____cacheline_internodealigned_in_smp; + /* Force QS state. */ + long gpnum; /* Current gp number. */ + long completed; /* # of last completed gp. */ + spinlock_t onofflock; /* exclude on/offline and */ + /* starting new GP. */ + spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_NO_HZ + long dynticks_completed; /* Value of completed @ snap. */ +#endif /* #ifdef CONFIG_NO_HZ */ +}; + +#ifdef RCU_TREE_NONCORE /* * RCU implementation internal declarations: @@ -8,3 +244,5 @@ DECLARE_PER_CPU(struct rcu_data, rcu_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +#endif /* #ifdef RCU_TREE_NONCORE */ + diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index fe1dcdbf1ca..0cb52b88775 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -43,6 +43,7 @@ #include #include +#define RCU_TREE_NONCORE #include "rcutree.h" static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -- cgit v1.2.3-70-g09d2 From d6714c22b43fbcbead7e7b706ff270e15f04a791 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:46 -0700 Subject: rcu: Renamings to increase RCU clarity Make RCU-sched, RCU-bh, and RCU-preempt be underlying implementations, with "RCU" defined in terms of one of the three. Update the outdated rcu_qsctr_inc() names, as these functions no longer increment anything. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746132696-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/trace.txt | 7 ++-- include/linux/rcupdate.h | 21 +++++++++--- include/linux/rcupreempt.h | 4 +-- include/linux/rcutree.h | 8 +++-- kernel/rcupreempt.c | 8 ++--- kernel/rcutree.c | 80 ++++++++++++++++++++++++++++----------------- kernel/rcutree.h | 4 +-- kernel/rcutree_trace.c | 20 ++++++------ kernel/sched.c | 2 +- kernel/softirq.c | 4 +-- 10 files changed, 95 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 02cced183b2..187bbf10c92 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy). The output of "cat rcu/rcudata" looks as follows: -rcu: -rcu: +rcu_sched: 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 @@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format. 
The output of "cat rcu/rcugp" looks as follows: -rcu: completed=33062 gpnum=33063 +rcu_sched: completed=33062 gpnum=33063 rcu_bh: completed=464 gpnum=464 Again, this output is for both "rcu" and "rcu_bh". The fields are @@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct The output of "cat rcu/rcu_pending" looks as follows: -rcu: +rcu_sched: 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3c89d6a2591..e920f0fd59d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -157,17 +157,28 @@ extern int rcu_scheduler_active; * - call_rcu_sched() and rcu_barrier_sched() * on the write-side to insure proper synchronization. */ -#define rcu_read_lock_sched() preempt_disable() -#define rcu_read_lock_sched_notrace() preempt_disable_notrace() +static inline void rcu_read_lock_sched(void) +{ + preempt_disable(); +} +static inline void rcu_read_lock_sched_notrace(void) +{ + preempt_disable_notrace(); +} /* * rcu_read_unlock_sched - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched for more information. */ -#define rcu_read_unlock_sched() preempt_enable() -#define rcu_read_unlock_sched_notrace() preempt_enable_notrace() - +static inline void rcu_read_unlock_sched(void) +{ + preempt_enable(); +} +static inline void rcu_read_unlock_sched_notrace(void) +{ + preempt_enable_notrace(); +} /** diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index f164ac9b780..2963f080e48 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -40,8 +40,8 @@ #include #include -extern void rcu_qsctr_inc(int cpu); -static inline void rcu_bh_qsctr_inc(int cpu) { } +extern void rcu_sched_qs(int cpu); +static inline void rcu_bh_qs(int cpu) { } /* * Someone might want to pass call_rcu_bh as a function pointer. 
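For reference, an RCU-sched read-side section bracketed by these primitives looks like the sketch below; it is purely illustrative, with gp, p and do_something() as hypothetical names:

        rcu_read_lock_sched();
        p = rcu_dereference(gp);
        if (p)
                do_something(p);
        rcu_read_unlock_sched();

Turning the former macros into static inline functions does not change this usage; the primitives behave exactly as before.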
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index e37d5e2a835..a0852d0d915 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,8 +30,8 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -extern void rcu_qsctr_inc(int cpu); -extern void rcu_bh_qsctr_inc(int cpu); +extern void rcu_sched_qs(int cpu); +extern void rcu_bh_qs(int cpu); extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); @@ -73,7 +73,8 @@ static inline void __rcu_read_unlock_bh(void) #define __synchronize_sched() synchronize_rcu() -#define call_rcu_sched(head, func) call_rcu(head, func) +extern void call_rcu_sched(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)); static inline void synchronize_rcu_expedited(void) { @@ -91,6 +92,7 @@ extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); +extern long rcu_batches_completed_sched(void); static inline void rcu_init_sched(void) { diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 510898a7bd6..7d777c9f394 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -159,7 +159,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched .dynticks = 1, }; -void rcu_qsctr_inc(int cpu) +void rcu_sched_qs(int cpu) { struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); @@ -967,12 +967,12 @@ void rcu_check_callbacks(int cpu, int user) * If this CPU took its interrupt from user mode or from the * idle loop, and this is not a nested interrupt, then * this CPU has to have exited all prior preept-disable - * sections of code. So increment the counter to note this. + * sections of code. So invoke rcu_sched_qs() to note this. * * The memory barrier is needed to handle the case where * writes from a preempt-disable section of code get reordered * into schedule() by this CPU's write buffer. So the memory - * barrier makes sure that the rcu_qsctr_inc() is seen by other + * barrier makes sure that the rcu_sched_qs() is seen by other * CPUs to happen after any such write. */ @@ -980,7 +980,7 @@ void rcu_check_callbacks(int cpu, int user) (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { smp_mb(); /* Guard against aggressive schedule(). */ - rcu_qsctr_inc(cpu); + rcu_sched_qs(cpu); } rcu_check_mb(cpu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a162f859dd3..4d71d4e8b5a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -74,26 +74,25 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); .n_force_qs_ngp = 0, \ } -struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); -DEFINE_PER_CPU(struct rcu_data, rcu_data); +struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); +DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); /* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know + * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. + * one since the start of the grace period, this just sets a flag. 
*/ -void rcu_qsctr_inc(int cpu) +void rcu_sched_qs(int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; } -void rcu_bh_qsctr_inc(int cpu) +void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); rdp->passed_quiesc = 1; @@ -113,12 +112,22 @@ static int qlowmark = 100; /* Once only this many pending, use blimit. */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +/* + * Return the number of RCU-sched batches processed thus far for debug & stats. + */ +long rcu_batches_completed_sched(void) +{ + return rcu_sched_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); + /* * Return the number of RCU batches processed thus far for debug & stats. + * @@@ placeholder, maps to rcu_batches_completed_sched(). */ long rcu_batches_completed(void) { - return rcu_state.completed; + return rcu_batches_completed_sched(); } EXPORT_SYMBOL_GPL(rcu_batches_completed); @@ -310,7 +319,7 @@ void rcu_irq_exit(void) WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__get_cpu_var(rcu_data).nxtlist || + if (__get_cpu_var(rcu_sched_data).nxtlist || __get_cpu_var(rcu_bh_data).nxtlist) set_need_resched(); } @@ -847,7 +856,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) /* * Move callbacks from the outgoing CPU to the running CPU. * Note that the outgoing CPU is now quiscent, so it is now - * (uncharacteristically) safe to access it rcu_data structure. + * (uncharacteristically) safe to access its rcu_data structure. * Note also that we must carefully retain the order of the * outgoing CPU's callbacks in order for rcu_barrier() to work * correctly. Finally, note that we start all the callbacks @@ -878,7 +887,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) */ static void rcu_offline_cpu(int cpu) { - __rcu_offline_cpu(cpu, &rcu_state); + __rcu_offline_cpu(cpu, &rcu_sched_state); __rcu_offline_cpu(cpu, &rcu_bh_state); } @@ -973,17 +982,16 @@ void rcu_check_callbacks(int cpu, int user) * Get here if this CPU took its interrupt from user * mode or from the idle loop, and if this is not a * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. + * a quiescent state, so note it. * * No memory barrier is required here because both - * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference - * only CPU-local variables that other CPUs neither - * access nor modify, at least not while the corresponding - * CPU is online. + * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local + * variables that other CPUs neither access nor modify, + * at least not while the corresponding CPU is online. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); + rcu_sched_qs(cpu); + rcu_bh_qs(cpu); } else if (!in_softirq()) { @@ -991,10 +999,10 @@ void rcu_check_callbacks(int cpu, int user) * Get here if this CPU did not take its interrupt from * softirq, in other words, if it is not interrupting * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. + * critical section, so note it. */ - rcu_bh_qsctr_inc(cpu); + rcu_bh_qs(cpu); } raise_softirq(RCU_SOFTIRQ); } @@ -1174,7 +1182,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ smp_mb(); /* See above block comment. 
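The call_rcu_sched() interface this patch introduces (its definition appears a little further down) is used like call_rcu(): queue a callback that runs once all pre-existing preempt-disabled sections have completed. A hedged example, where struct foo, foo_reclaim() and old_fp are hypothetical and the usual <linux/slab.h> include is assumed:

struct foo {
        int data;
        struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu));
}

        /* after unlinking old_fp from the enclosing data structure: */
        call_rcu_sched(&old_fp->rcu, foo_reclaim);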
*/ - __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_sched_state, + &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); /* @@ -1231,14 +1240,25 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), } /* - * Queue an RCU callback for invocation after a grace period. + * Queue an RCU-sched callback for invocation after a grace period. + */ +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state); +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + +/* + * @@@ Queue an RCU callback for invocation after a grace period. + * @@@ Placeholder pending rcutree_plugin.h. */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_state); + call_rcu_sched(head, func); } EXPORT_SYMBOL_GPL(call_rcu); + /* * Queue an RCU for invocation after a quicker grace period. */ @@ -1311,7 +1331,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ int rcu_pending(int cpu) { - return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || + return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); } @@ -1324,7 +1344,7 @@ int rcu_pending(int cpu) int rcu_needs_cpu(int cpu) { /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_data, cpu).nxtlist || + return per_cpu(rcu_sched_data, cpu).nxtlist || per_cpu(rcu_bh_data, cpu).nxtlist; } @@ -1418,7 +1438,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) static void __cpuinit rcu_online_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_state); + rcu_init_percpu_data(cpu, &rcu_sched_state); rcu_init_percpu_data(cpu, &rcu_bh_state); } @@ -1545,10 +1565,10 @@ void __init __rcu_init(void) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_init_one(&rcu_state); - RCU_DATA_PTR_INIT(&rcu_state, rcu_data); + rcu_init_one(&rcu_sched_state); + RCU_DATA_PTR_INIT(&rcu_sched_state, rcu_sched_data); for_each_possible_cpu(i) - rcu_boot_init_percpu_data(i, &rcu_state); + rcu_boot_init_percpu_data(i, &rcu_sched_state); rcu_init_one(&rcu_bh_state); RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); for_each_possible_cpu(i) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7cc830a1c44..0024e5ddcc6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -238,8 +238,8 @@ struct rcu_state { /* * RCU implementation internal declarations: */ -extern struct rcu_state rcu_state; -DECLARE_PER_CPU(struct rcu_data, rcu_data); +extern struct rcu_state rcu_sched_state; +DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0cb52b88775..236c0504fee 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -77,8 +77,8 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata(struct seq_file *m, void *unused) { - seq_puts(m, "rcu:\n"); - PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); + seq_puts(m, "rcu_sched:\n"); + PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); seq_puts(m, "rcu_bh:\n"); PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); return 0; @@ -125,8 +125,8 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"dt\",\"dt 
nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); - seq_puts(m, "\"rcu:\"\n"); - PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); + seq_puts(m, "\"rcu_sched:\"\n"); + PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); seq_puts(m, "\"rcu_bh:\"\n"); PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); return 0; @@ -172,8 +172,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { - seq_puts(m, "rcu:\n"); - print_one_rcu_state(m, &rcu_state); + seq_puts(m, "rcu_sched:\n"); + print_one_rcu_state(m, &rcu_sched_state); seq_puts(m, "rcu_bh:\n"); print_one_rcu_state(m, &rcu_bh_state); return 0; @@ -194,8 +194,8 @@ static struct file_operations rcuhier_fops = { static int show_rcugp(struct seq_file *m, void *unused) { - seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", - rcu_state.completed, rcu_state.gpnum); + seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", + rcu_sched_state.completed, rcu_sched_state.gpnum); seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", rcu_bh_state.completed, rcu_bh_state.gpnum); return 0; @@ -244,8 +244,8 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) static int show_rcu_pending(struct seq_file *m, void *unused) { - seq_puts(m, "rcu:\n"); - print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_sched:\n"); + print_rcu_pendings(m, &rcu_sched_state); seq_puts(m, "rcu_bh:\n"); print_rcu_pendings(m, &rcu_bh_state); return 0; diff --git a/kernel/sched.c b/kernel/sched.c index cda8b81f880..c9beca67a53 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5325,7 +5325,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); + rcu_sched_qs(cpu); prev = rq->curr; switch_count = &prev->nivcsw; diff --git a/kernel/softirq.c b/kernel/softirq.c index eb5e131a048..7db25067cd2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -227,7 +227,7 @@ restart: preempt_count() = prev_count; } - rcu_bh_qsctr_inc(cpu); + rcu_bh_qs(cpu); } h++; pending >>= 1; @@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); - rcu_qsctr_inc((long)__bind_cpu); + rcu_sched_qs((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3-70-g09d2 From 5699ed8fcb0c32ca699e2a27ad716eb70b367dbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:48 -0700 Subject: rcu: Fix online/offline indication for rcudata.csv trace file The heading said "Online?", but the column had "Y" for offline CPUs and "N" for online CPUs. Swap the "Y" and "N" to match the heading. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746132841-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 236c0504fee..ea4a4747037 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -103,7 +103,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) return; seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", rdp->cpu, - cpu_is_offline(rdp->cpu) ? 
"\"Y\"" : "\"N\"", + cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, rdp->passed_quiesc, rdp->passed_quiesc_completed, rdp->qs_pending); -- cgit v1.2.3-70-g09d2 From 65cf8f866fc0fb40fa9daaded7e938a886d6f7c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:49 -0700 Subject: rcu: Merge per-RCU-flavor initialization into pre-existing macro Rename the RCU_DATA_PTR_INIT() macro to RCU_INIT_FLAVOR() and make it do the rcu_init_one() and rcu_boot_init_percpu_data() calls. Merge the loop that was in the original macro with the loops that were in __rcu_init(). Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746133916-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4d71d4e8b5a..7c515082ae8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1543,8 +1543,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) * Helper macro for __rcu_init(). To be used nowhere else! * Assigns leaf node pointers into each CPU's rcu_data structure. */ -#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ +#define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ for_each_possible_cpu(i) { \ @@ -1552,6 +1553,7 @@ do { \ j++; \ per_cpu(rcu_data, i).mynode = &rnp[j]; \ (rsp)->rda[i] = &per_cpu(rcu_data, i); \ + rcu_boot_init_percpu_data(i, rsp); \ } \ } while (0) @@ -1565,14 +1567,8 @@ void __init __rcu_init(void) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_init_one(&rcu_sched_state); - RCU_DATA_PTR_INIT(&rcu_sched_state, rcu_sched_data); - for_each_possible_cpu(i) - rcu_boot_init_percpu_data(i, &rcu_sched_state); - rcu_init_one(&rcu_bh_state); - RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); - for_each_possible_cpu(i) - rcu_boot_init_percpu_data(i, &rcu_bh_state); + RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); + RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } -- cgit v1.2.3-70-g09d2 From 22f00b69f6a7e1e18e821979a23e8307c2de9888 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:50 -0700 Subject: rcu: Use debugfs_remove_recursive() simplify code. Suggested by Josh Triplett. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: "Paul E. 
McKenney" LKML-Reference: <12509746132173-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree_trace.c | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ea4a4747037..31af3a0fb6d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -265,62 +265,47 @@ static struct file_operations rcu_pending_fops = { }; static struct dentry *rcudir; -static struct dentry *datadir; -static struct dentry *datadir_csv; -static struct dentry *gpdir; -static struct dentry *hierdir; -static struct dentry *rcu_pendingdir; static int __init rcuclassic_trace_init(void) { + struct dentry *retval; + rcudir = debugfs_create_dir("rcu", NULL); if (!rcudir) - goto out; + goto free_out; - datadir = debugfs_create_file("rcudata", 0444, rcudir, + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); - if (!datadir) + if (!retval) goto free_out; - datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, + retval = debugfs_create_file("rcudata.csv", 0444, rcudir, NULL, &rcudata_csv_fops); - if (!datadir_csv) + if (!retval) goto free_out; - gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!gpdir) + retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!retval) goto free_out; - hierdir = debugfs_create_file("rcuhier", 0444, rcudir, + retval = debugfs_create_file("rcuhier", 0444, rcudir, NULL, &rcuhier_fops); - if (!hierdir) + if (!retval) goto free_out; - rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + retval = debugfs_create_file("rcu_pending", 0444, rcudir, NULL, &rcu_pending_fops); - if (!rcu_pendingdir) + if (!retval) goto free_out; return 0; free_out: - if (datadir) - debugfs_remove(datadir); - if (datadir_csv) - debugfs_remove(datadir_csv); - if (gpdir) - debugfs_remove(gpdir); - debugfs_remove(rcudir); -out: + debugfs_remove_recursive(rcudir); return 1; } static void __exit rcuclassic_trace_cleanup(void) { - debugfs_remove(datadir); - debugfs_remove(datadir_csv); - debugfs_remove(gpdir); - debugfs_remove(hierdir); - debugfs_remove(rcu_pendingdir); - debugfs_remove(rcudir); + debugfs_remove_recursive(rcudir); } -- cgit v1.2.3-70-g09d2 From a157229cabd6dd8cfa82525fc9bf730c94cc9ac2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:51 -0700 Subject: rcu: Simplify rcu_pending()/rcu_check_callbacks() API All calls from outside RCU are of the form: if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user); This is silly, instead we put a call to rcu_pending() in rcu_check_callbacks(), and then make the outside calls be to rcu_check_callbacks(). This cuts down on the code a bit and also gives the compiler a better chance of optimizing. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <125097461311-git-send-email-> Signed-off-by: Ingo Molnar --- arch/ia64/xen/time.c | 3 +-- include/linux/rcupreempt.h | 1 - include/linux/rcutree.h | 1 - kernel/rcupreempt.c | 10 ++++++++-- kernel/rcutree.c | 5 ++++- kernel/timer.c | 3 +-- 6 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c index fb833269017..dbeadb9c8e2 100644 --- a/arch/ia64/xen/time.c +++ b/arch/ia64/xen/time.c @@ -133,8 +133,7 @@ consider_steal_time(unsigned long new_itm) account_idle_ticks(blocked); run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_mode(get_irq_regs())); + rcu_check_callbacks(cpu, user_mode(get_irq_regs())); scheduler_tick(); run_posix_cpu_timers(p); diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 6c9dd9cf8b8..aff4772fb49 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -66,7 +66,6 @@ extern void call_rcu_sched(struct rcu_head *head, extern void __rcu_read_lock(void); extern void __rcu_read_unlock(void); -extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); #define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 8a0222ce3b1..c739d90f5e6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,6 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); -extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); static inline void __rcu_read_lock(void) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 7d777c9f394..0053ce56e32 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -159,6 +159,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched .dynticks = 1, }; +static int rcu_pending(int cpu); + void rcu_sched_qs(int cpu) { struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); @@ -961,7 +963,10 @@ static void rcu_check_mb(int cpu) void rcu_check_callbacks(int cpu, int user) { unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_data *rdp; + + if (!rcu_pending(cpu)) + return; /* if nothing for RCU to do. */ /* * If this CPU took its interrupt from user mode or from the @@ -976,6 +981,7 @@ void rcu_check_callbacks(int cpu, int user) * CPUs to happen after any such write. */ + rdp = RCU_DATA_CPU(cpu); if (user || (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1382,7 +1388,7 @@ int rcu_needs_cpu(int cpu) rdp->waitschedlist != NULL); } -int rcu_pending(int cpu) +static int rcu_pending(int cpu) { struct rcu_data *rdp = RCU_DATA_CPU(cpu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7c515082ae8..4ce3adcfa94 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -111,6 +111,7 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static int rcu_pending(int cpu); /* * Return the number of RCU-sched batches processed thus far for debug & stats. 
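To make the new calling convention concrete, a minimal standalone sketch (userspace C, illustrative only -- the stub bodies and the tick_sketch()/main() wrappers below are placeholders, not the kernel implementation) of a tick handler before and after this change:

#include <stdio.h>

/* Illustrative stand-ins for the two functions touched by this patch. */
static int rcu_pending(int cpu)
{
        return cpu & 1;                 /* placeholder "work pending" test */
}

static void rcu_check_callbacks(int cpu, int user)
{
        if (!rcu_pending(cpu))
                return;                 /* the guard now lives in the callee */
        printf("cpu %d: run RCU core work (user=%d)\n", cpu, user);
}

/* Caller, e.g. the timer tick: no external rcu_pending() test any more. */
static void tick_sketch(int cpu, int user_tick)
{
        rcu_check_callbacks(cpu, user_tick);
}

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < 4; cpu++)
                tick_sketch(cpu, 1);
        return 0;
}
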
@@ -974,6 +975,8 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { + if (!rcu_pending(cpu)) + return; /* if nothing for RCU to do. */ if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1329,7 +1332,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ -int rcu_pending(int cpu) +static int rcu_pending(int cpu) { return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); diff --git a/kernel/timer.c b/kernel/timer.c index a7f07d5a624..a3d25f41501 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1156,8 +1156,7 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_tick); + rcu_check_callbacks(cpu, user_tick); printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); -- cgit v1.2.3-70-g09d2 From f41d911f8c49a5d65c86504c19e8204bb605c4fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:52 -0700 Subject: rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 15 ++ include/linux/rcupdate.h | 2 +- include/linux/rcupreempt.h | 4 + include/linux/rcutree.h | 16 ++ include/linux/sched.h | 37 ++++ init/Kconfig | 22 ++- kernel/Makefile | 1 + kernel/exit.c | 1 + kernel/fork.c | 5 +- kernel/rcutree.c | 135 +++++++++----- kernel/rcutree.h | 9 + kernel/rcutree_plugin.h | 447 +++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree_trace.c | 20 ++ lib/Kconfig.debug | 2 +- 14 files changed, 661 insertions(+), 55 deletions(-) create mode 100644 kernel/rcutree_plugin.h (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7fc01b13be4..971a968831b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -94,6 +94,20 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +#ifdef CONFIG_PREEMPT_RCU +#define INIT_TASK_RCU_PREEMPT(tsk) \ + .rcu_read_lock_nesting = 0, \ + .rcu_flipctr_idx = 0, +#elif defined(CONFIG_TREE_PREEMPT_RCU) +#define INIT_TASK_RCU_PREEMPT(tsk) \ + .rcu_read_lock_nesting = 0, \ + .rcu_read_unlock_special = 0, \ + .rcu_blocked_cpu = -1, \ + .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), +#else +#define INIT_TASK_RCU_PREEMPT(tsk) +#endif + extern struct cred init_cred; #ifdef CONFIG_PERF_COUNTERS @@ -173,6 +187,7 @@ extern struct cred init_cred; INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9d85ee19492..26892f5e7bd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -66,7 +66,7 @@ extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); extern int rcu_scheduler_active; -#if defined(CONFIG_TREE_RCU) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include #elif defined(CONFIG_PREEMPT_RCU) #include diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index aff4772fb49..a42ab88e921 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -98,6 +98,10 @@ static inline long rcu_batches_completed_bh(void) return rcu_batches_completed(); } +static inline void exit_rcu(void) +{ +} + #ifdef CONFIG_RCU_TRACE struct rcupreempt_trace; extern long *rcupreempt_flipctr(int cpu); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c739d90f5e6..a8930771782 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,14 +35,30 @@ extern void rcu_bh_qs(int cpu); extern int rcu_needs_cpu(int cpu); +#ifdef CONFIG_TREE_PREEMPT_RCU + +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +extern void exit_rcu(void); + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + static inline void __rcu_read_lock(void) { preempt_disable(); } + static inline void __rcu_read_unlock(void) { preempt_enable(); } + +static inline void exit_rcu(void) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ + static inline void __rcu_read_lock_bh(void) { local_bh_disable(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b..d7f98f637a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1210,6 +1210,13 @@ struct task_struct { int rcu_flipctr_idx; 
#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TREE_PREEMPT_RCU + int rcu_read_lock_nesting; + char rcu_read_unlock_special; + int rcu_blocked_cpu; + struct list_head rcu_node_entry; +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif @@ -1723,6 +1730,36 @@ extern cputime_t task_gtime(struct task_struct *p); #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +#ifdef CONFIG_TREE_PREEMPT_RCU + +#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ +#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ +#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */ + +static inline void rcu_copy_process(struct task_struct *p) +{ + p->rcu_read_lock_nesting = 0; + p->rcu_read_unlock_special = 0; + p->rcu_blocked_cpu = -1; + INIT_LIST_HEAD(&p->rcu_node_entry); +} + +#elif defined(CONFIG_PREEMPT_RCU) + +static inline void rcu_copy_process(struct task_struct *p) +{ + p->rcu_read_lock_nesting = 0; + p->rcu_flipctr_idx = 0; +} + +#else + +static inline void rcu_copy_process(struct task_struct *p) +{ +} + +#endif + #ifdef CONFIG_SMP extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); diff --git a/init/Kconfig b/init/Kconfig index 25373cf3267..f88da2d1c1f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -335,11 +335,20 @@ config PREEMPT_RCU now-naive assumptions about each RCU read-side critical section remaining on a given CPU through its execution. +config TREE_PREEMPT_RCU + bool "Preemptable tree-based hierarchical RCU" + depends on PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. + endchoice config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU || PREEMPT_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. @@ -351,7 +360,7 @@ config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on TREE_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU default 64 if 64BIT default 32 if !64BIT help @@ -366,7 +375,7 @@ config RCU_FANOUT config RCU_FANOUT_EXACT bool "Disable tree-based hierarchical RCU auto-balancing" - depends on TREE_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU default n help This option forces use of the exact RCU_FANOUT value specified, @@ -379,11 +388,12 @@ config RCU_FANOUT_EXACT Say N if unsure. config TREE_RCU_TRACE - def_bool RCU_TRACE && TREE_RCU + def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) select DEBUG_FS help - This option provides tracing for the TREE_RCU implementation, - permitting Makefile to trivially select kernel/rcutree_trace.c. + This option provides tracing for the TREE_RCU and + TREE_PREEMPT_RCU implementations, permitting Makefile to + trivially select kernel/rcutree_trace.c. 
config PREEMPT_RCU_TRACE def_bool RCU_TRACE && PREEMPT_RCU diff --git a/kernel/Makefile b/kernel/Makefile index 2419c9d4391..1a38b4789dd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o diff --git a/kernel/exit.c b/kernel/exit.c index 869dc221733..263f95ed720 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1010,6 +1010,7 @@ NORET_TYPE void do_exit(long code) __free_pipe_info(tsk->splice_pipe); preempt_disable(); + exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; schedule(); diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556..642e8b5edf0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1022,10 +1022,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); -#ifdef CONFIG_PREEMPT_RCU - p->rcu_read_lock_nesting = 0; - p->rcu_flipctr_idx = 0; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ + rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4ce3adcfa94..cc025571407 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -80,6 +80,21 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +extern long rcu_batches_completed_sched(void); +static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags); +static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); +static void __rcu_process_callbacks(struct rcu_state *rsp, + struct rcu_data *rdp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp); +static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); +static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, + int preemptable); + +#include "rcutree_plugin.h" + /* * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least @@ -87,16 +102,27 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); */ void rcu_sched_qs(int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); + unsigned long flags; + struct rcu_data *rdp; + + local_irq_save(flags); + rdp = &per_cpu(rcu_sched_data, cpu); rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; + rcu_preempt_qs(cpu); + local_irq_restore(flags); } void rcu_bh_qs(int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + unsigned long flags; + struct rcu_data *rdp; + + local_irq_save(flags); + rdp = &per_cpu(rcu_bh_data, cpu); rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; + local_irq_restore(flags); } #ifdef CONFIG_NO_HZ @@ -122,16 +148,6 @@ long rcu_batches_completed_sched(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); -/* - * Return the number of RCU batches processed thus far for debug & stats. - * @@@ placeholder, maps to rcu_batches_completed_sched(). 
- */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Return the number of RCU BH batches processed thus far for debug & stats. */ @@ -193,6 +209,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } + /* If preemptable RCU, no point in sending reschedule IPI. */ + if (rdp->preemptable) + return 0; + /* The CPU is online, so send it a reschedule IPI. */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); @@ -473,6 +493,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_print_task_stall(rnp); if (rnp_cur->qsmask == 0) continue; for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) @@ -685,6 +706,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); } +/* + * Clean up after the prior grace period and let rcu_start_gp() start up + * the next grace period if one is needed. Note that the caller must + * hold rnp->lock, as required by rcu_start_gp(), which will release it. + */ +static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) + __releases(rnp->lock) +{ + rsp->completed = rsp->gpnum; + rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ +} + /* * Similar to cpu_quiet(), for which it is a helper function. Allows * a group of CPUs to be quieted at one go, though all the CPUs in the @@ -706,7 +740,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, return; } rnp->qsmask &= ~mask; - if (rnp->qsmask != 0) { + if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { /* Other bits still set at this level, so done. */ spin_unlock_irqrestore(&rnp->lock, flags); @@ -726,14 +760,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Clean up and let rcu_start_gp() - * start up the next grace period if one is needed. Note that - * we still hold rnp->lock, as required by rcu_start_gp(), which - * will release it. + * state for this grace period. Invoke cpu_quiet_msk_finish() + * to clean up and start the next grace period if one is needed. */ - rsp->completed = rsp->gpnum; - rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); - rcu_start_gp(rsp, flags); /* releases rnp->lock. */ + cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ } /* @@ -840,11 +870,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ rnp = rnp->parent; } while (rnp != NULL); lastcomp = rsp->completed; @@ -1007,6 +1037,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(cpu); } + rcu_preempt_check_callbacks(cpu); raise_softirq(RCU_SOFTIRQ); } @@ -1188,6 +1219,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_process_callbacks(); /* * Memory references from any later RCU read-side critical sections @@ -1251,17 +1283,6 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); -/* - * @@@ Queue an RCU callback for invocation after a grace period. - * @@@ Placeholder pending rcutree_plugin.h. - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - - /* * Queue an RCU for invocation after a quicker grace period. */ @@ -1335,7 +1356,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_pending(int cpu) { return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); + __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || + rcu_preempt_pending(cpu); } /* @@ -1348,7 +1370,8 @@ int rcu_needs_cpu(int cpu) { /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist; + per_cpu(rcu_bh_data, cpu).nxtlist || + rcu_preempt_needs_cpu(cpu); } /* @@ -1383,7 +1406,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void __cpuinit -rcu_init_percpu_data(int cpu, struct rcu_state *rsp) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; long lastcomp; @@ -1399,6 +1422,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ + rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1441,12 +1465,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) static void __cpuinit rcu_online_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state); - rcu_init_percpu_data(cpu, &rcu_bh_state); + rcu_init_percpu_data(cpu, &rcu_sched_state, 0); + rcu_init_percpu_data(cpu, &rcu_bh_state, 0); + rcu_preempt_init_percpu_data(cpu); } /* - * Handle CPU online/offline notifcation events. + * Handle CPU online/offline notification events. */ int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) @@ -1521,6 +1546,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { spin_lock_init(&rnp->lock); + rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -1538,13 +1564,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) j / rsp->levelspread[i - 1]; } rnp->level = i; + INIT_LIST_HEAD(&rnp->blocked_tasks[0]); + INIT_LIST_HEAD(&rnp->blocked_tasks[1]); } } } /* - * Helper macro for __rcu_init(). To be used nowhere else! - * Assigns leaf node pointers into each CPU's rcu_data structure. 
+ * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used + * nowhere else! Assigns leaf node pointers into each CPU's rcu_data + * structure. */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ @@ -1560,18 +1589,38 @@ do { \ } \ } while (0) +#ifdef CONFIG_TREE_PREEMPT_RCU + +void __init __rcu_init_preempt(void) +{ + int i; /* All used by RCU_INIT_FLAVOR(). */ + int j; + struct rcu_node *rnp; + + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +void __init __rcu_init_preempt(void) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ + void __init __rcu_init(void) { - int i; /* All used by RCU_DATA_PTR_INIT(). */ + int i; /* All used by RCU_INIT_FLAVOR(). */ int j; struct rcu_node *rnp; - printk(KERN_INFO "Hierarchical RCU implementation.\n"); + rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0024e5ddcc6..ca560364d8c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -80,6 +80,7 @@ struct rcu_dynticks { */ struct rcu_node { spinlock_t lock; + long gpnum; /* Current grace period for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ unsigned long qsmaskinit; @@ -90,6 +91,8 @@ struct rcu_node { u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; + struct list_head blocked_tasks[2]; + /* Tasks blocked in RCU read-side critsect. */ } ____cacheline_internodealigned_in_smp; /* Index values for nxttail array in struct rcu_data. */ @@ -111,6 +114,7 @@ struct rcu_data { bool passed_quiesc; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool preemptable; /* Preemptable RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -244,5 +248,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +#ifdef CONFIG_TREE_PREEMPT_RCU +extern struct rcu_state rcu_preempt_state; +DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + #endif /* #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h new file mode 100644 index 00000000000..cd2ab67400c --- /dev/null +++ b/kernel/rcutree_plugin.h @@ -0,0 +1,447 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptable semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * + * Author: Ingo Molnar + * Paul E. McKenney + */ + + +#ifdef CONFIG_TREE_PREEMPT_RCU + +struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); +DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); + +/* + * Tell them what RCU they are running. + */ +static inline void rcu_bootup_announce(void) +{ + printk(KERN_INFO + "Experimental preemptable hierarchical RCU implementation.\n"); +} + +/* + * Return the number of RCU-preempt batches processed thus far + * for debug and statistics. + */ +long rcu_batches_completed_preempt(void) +{ + return rcu_preempt_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_preempt(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Record a preemptable-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * not in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + */ +static void rcu_preempt_qs_record(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +/* + * We have entered the scheduler or are between softirqs in ksoftirqd. + * If we are in an RCU read-side critical section, we need to reflect + * that in the state of the rcu_node structure corresponding to this CPU. + * Caller must disable hardirqs. + */ +static void rcu_preempt_qs(int cpu) +{ + struct task_struct *t = current; + int phase; + struct rcu_data *rdp; + struct rcu_node *rnp; + + if (t->rcu_read_lock_nesting && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + rdp = rcu_preempt_state.rda[cpu]; + rnp = rdp->mynode; + spin_lock(&rnp->lock); + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_blocked_cpu = cpu; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); + list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); + smp_mb(); /* Ensure later ctxt swtch seen after above. */ + spin_unlock(&rnp->lock); + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that we continue to block the current grace period. 
+ */ + rcu_preempt_qs_record(cpu); + t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | + RCU_READ_UNLOCK_GOT_QS); +} + +/* + * Tree-preemptable RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + ACCESS_ONCE(current->rcu_read_lock_nesting)++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + int special; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; + } + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* Remove this task from the list it blocked on. */ + rnp = rcu_preempt_state.rda[t->rcu_blocked_cpu]->mynode; + spin_lock(&rnp->lock); + empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + list_del_init(&t->rcu_node_entry); + t->rcu_blocked_cpu = -1; + + /* + * If this was the last task on the current list, and if + * we aren't waiting on any CPUs, report the quiescent state. + * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() + * drop rnp->lock and restore irq. + */ + if (!empty && rnp->qsmask == 0 && + list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { + t->rcu_read_unlock_special &= + ~(RCU_READ_UNLOCK_NEED_QS | + RCU_READ_UNLOCK_GOT_QS); + if (rnp->parent == NULL) { + /* Only one rcu_node in the tree. */ + cpu_quiet_msk_finish(&rcu_preempt_state, flags); + return; + } + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + spin_unlock_irqrestore(&rnp->lock, flags); + rnp = rnp->parent; + spin_lock_irqsave(&rnp->lock, flags); + cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); + return; + } + spin_unlock(&rnp->lock); + } + local_irq_restore(flags); +} + +/* + * Tree-preemptable RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ + if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. 
+ */ +static void rcu_print_task_stall(struct rcu_node *rnp) +{ + unsigned long flags; + struct list_head *lp; + int phase = rnp->gpnum & 0x1; + struct task_struct *t; + + if (!list_empty(&rnp->blocked_tasks[phase])) { + spin_lock_irqsave(&rnp->lock, flags); + phase = rnp->gpnum & 0x1; /* re-read under lock. */ + lp = &rnp->blocked_tasks[phase]; + list_for_each_entry(t, lp, rcu_node_entry) + printk(" P%d", t->pid); + spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * Check for preempted RCU readers for the specified rcu_node structure. + * If the caller needs a reliable answer, it must hold the rcu_node's + * >lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); +} + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the corresponding CPU's rcu_node structure, + * which is checked elsewhere. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(int cpu) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) { + t->rcu_read_unlock_special &= + ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); + rcu_preempt_qs_record(cpu); + return; + } + if (per_cpu(rcu_preempt_data, cpu).qs_pending) { + if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { + rcu_preempt_qs_record(cpu); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS; + } else if (!(t->rcu_read_unlock_special & + RCU_READ_UNLOCK_NEED_QS)) { + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; + } + } +} + +/* + * Process callbacks for preemptable RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_state, + &__get_cpu_var(rcu_preempt_data)); +} + +/* + * Queue a preemptable-RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Check to see if there is any immediate preemptable-RCU-related work + * to be done. + */ +static int rcu_preempt_pending(int cpu) +{ + return __rcu_pending(&rcu_preempt_state, + &per_cpu(rcu_preempt_data, cpu)); +} + +/* + * Does preemptable RCU need the CPU to stay out of dynticks mode? + */ +static int rcu_preempt_needs_cpu(int cpu) +{ + return !!per_cpu(rcu_preempt_data, cpu).nxtlist; +} + +/* + * Initialize preemptable RCU's per-CPU data. + */ +static void __cpuinit rcu_preempt_init_percpu_data(int cpu) +{ + rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); +} + +/* + * Check for a task exiting while in a preemptable-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/* + * Tell them what RCU they are running. + */ +static inline void rcu_bootup_announce(void) +{ + printk(KERN_INFO "Hierarchical RCU implementation.\n"); +} + +/* + * Return the number of RCU batches processed thus far for debug & stats. 
+ */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_sched(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Because preemptable RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_qs(int cpu) +{ +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +/* + * Because preemptable RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_task_stall(struct rcu_node *rnp) +{ +} + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return 0; +} + +/* + * Because preemptable RCU does not exist, it never has any callbacks + * to check. + */ +void rcu_preempt_check_callbacks(int cpu) +{ +} + +/* + * Because preemptable RCU does not exist, it never has any callbacks + * to process. + */ +void rcu_preempt_process_callbacks(void) +{ +} + +/* + * In classic RCU, call_rcu() is just call_rcu_sched(). + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + call_rcu_sched(head, func); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Because preemptable RCU does not exist, it never has any work to do. + */ +static int rcu_preempt_pending(int cpu) +{ + return 0; +} + +/* + * Because preemptable RCU does not exist, it never needs any CPU. + */ +static int rcu_preempt_needs_cpu(int cpu) +{ + return 0; +} + +/* + * Because preemptable RCU does not exist, there is no per-CPU + * data to initialize. + */ +static void __cpuinit rcu_preempt_init_percpu_data(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 31af3a0fb6d..0ea1bff6972 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -77,6 +77,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata(struct seq_file *m, void *unused) { +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ seq_puts(m, "rcu_sched:\n"); PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); seq_puts(m, "rcu_bh:\n"); @@ -125,6 +129,10 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "\"rcu_preempt:\"\n"); + PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ seq_puts(m, "\"rcu_sched:\"\n"); PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); seq_puts(m, "\"rcu_bh:\"\n"); @@ -172,6 +180,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + print_one_rcu_state(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ seq_puts(m, "rcu_sched:\n"); print_one_rcu_state(m, &rcu_sched_state); seq_puts(m, "rcu_bh:\n"); @@ -194,6 +206,10 @@ static struct file_operations rcuhier_fops = { static int show_rcugp(struct seq_file *m, void *unused) { +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", + rcu_preempt_state.completed, rcu_preempt_state.gpnum); +#endif /* 
#ifdef CONFIG_TREE_PREEMPT_RCU */ seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", rcu_sched_state.completed, rcu_sched_state.gpnum); seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", @@ -244,6 +260,10 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) static int show_rcu_pending(struct seq_file *m, void *unused) { +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + print_rcu_pendings(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ seq_puts(m, "rcu_sched:\n"); print_rcu_pendings(m, &rcu_sched_state); seq_puts(m, "rcu_bh:\n"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 12327b2bb78..f87fb0c8f92 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -725,7 +725,7 @@ config RCU_TORTURE_TEST_RUNNABLE config RCU_CPU_STALL_DETECTOR bool "Check for stalled CPUs delaying RCU grace periods" - depends on CLASSIC_RCU || TREE_RCU + depends on CLASSIC_RCU || TREE_RCU || TREE_PREEMPT_RCU default n help This option causes RCU to printk information on which -- cgit v1.2.3-70-g09d2 From 6b3ef48adf847f7adf11c870e3ffacac150f1564 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:53 -0700 Subject: rcu: Remove CONFIG_PREEMPT_RCU Now that CONFIG_TREE_PREEMPT_RCU is in place, there is no further need for CONFIG_PREEMPT_RCU. Remove it, along with whatever subtle bugs it may (or may not) contain. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <125097461396-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/rcu.txt | 10 +- Documentation/RCU/whatisRCU.txt | 8 +- include/linux/init_task.h | 6 +- include/linux/rcupdate.h | 4 +- include/linux/rcupreempt.h | 140 ---- include/linux/rcupreempt_trace.h | 97 --- include/linux/sched.h | 13 - init/Kconfig | 20 +- kernel/Makefile | 2 - kernel/rcupreempt.c | 1518 -------------------------------------- kernel/rcupreempt_trace.c | 335 --------- lib/Kconfig.debug | 2 +- 12 files changed, 13 insertions(+), 2142 deletions(-) delete mode 100644 include/linux/rcupreempt.h delete mode 100644 include/linux/rcupreempt_trace.h delete mode 100644 kernel/rcupreempt.c delete mode 100644 kernel/rcupreempt_trace.c (limited to 'kernel') diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 7aa2002ade7..2a23523ce47 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt @@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed executed in user mode, or executed in the idle loop, we can safely free up that item. - Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the + Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the same effect, but require that the readers manipulate CPU-local counters. These counters allow limited types of blocking within RCU read-side critical sections. SRCU also uses @@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that? o I hear that RCU needs work in order to support realtime kernels? This work is largely completed. Realtime-friendly RCU can be - enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. - However, work is in progress for enabling priority boosting of - preempted RCU read-side critical sections. This is needed if you - have CPU-bound realtime threads. 
+ enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration + parameter. However, work is in progress for enabling priority + boosting of preempted RCU read-side critical sections. This is + needed if you have CPU-bound realtime threads. o Where can I find more information on RCU? diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 97ded2432c5..e41a7fecf0d 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -136,10 +136,10 @@ rcu_read_lock() Used by a reader to inform the reclaimer that the reader is entering an RCU read-side critical section. It is illegal to block while in an RCU read-side critical section, though - kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side - critical sections. Any RCU-protected data structure accessed - during an RCU read-side critical section is guaranteed to remain - unreclaimed for the full duration of that critical section. + kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU + read-side critical sections. Any RCU-protected data structure + accessed during an RCU read-side critical section is guaranteed to + remain unreclaimed for the full duration of that critical section. Reference counts may be used in conjunction with RCU to maintain longer-term references to data structures. diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 971a968831b..79d4baee31b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -94,11 +94,7 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif -#ifdef CONFIG_PREEMPT_RCU -#define INIT_TASK_RCU_PREEMPT(tsk) \ - .rcu_read_lock_nesting = 0, \ - .rcu_flipctr_idx = 0, -#elif defined(CONFIG_TREE_PREEMPT_RCU) +#ifdef CONFIG_TREE_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 26892f5e7bd..ec90fc34fea 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -68,11 +68,9 @@ extern int rcu_scheduler_active; #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include -#elif defined(CONFIG_PREEMPT_RCU) -#include #else #error "Unknown RCU implementation specified to kernel configuration" -#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */ +#endif #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h deleted file mode 100644 index a42ab88e921..00000000000 --- a/include/linux/rcupreempt.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (RT implementation) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Paul McKenney - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ - -#ifndef __LINUX_RCUPREEMPT_H -#define __LINUX_RCUPREEMPT_H - -#include -#include -#include -#include -#include -#include - -extern void rcu_sched_qs(int cpu); -static inline void rcu_bh_qs(int cpu) { } - -/* - * Someone might want to pass call_rcu_bh as a function pointer. - * So this needs to just be a rename and not a macro function. - * (no parentheses) - */ -#define call_rcu_bh call_rcu - -/** - * call_rcu_sched - Queue RCU callback for invocation after sched grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full - * synchronize_sched()-style grace period elapses, in other words after - * all currently executing preempt-disabled sections of code (including - * hardirq handlers, NMI handlers, and local_irq_save() blocks) have - * completed. - */ -extern void call_rcu_sched(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - -extern void __rcu_read_lock(void); -extern void __rcu_read_unlock(void); -extern int rcu_needs_cpu(int cpu); - -#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } -#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); } - -extern void __synchronize_sched(void); - -static inline void synchronize_rcu_expedited(void) -{ - synchronize_rcu(); /* Placeholder for new rcupreempt implementation. */ -} - -static inline void synchronize_rcu_bh_expedited(void) -{ - synchronize_rcu_bh(); /* Placeholder for new rcupreempt impl. */ -} - -extern void __rcu_init(void); -extern void rcu_init_sched(void); -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); -extern long rcu_batches_completed(void); - -/* - * Return the number of RCU batches processed thus far. Useful for debug - * and statistic. The _bh variant is identifcal to straight RCU - */ -static inline long rcu_batches_completed_bh(void) -{ - return rcu_batches_completed(); -} - -static inline void exit_rcu(void) -{ -} - -#ifdef CONFIG_RCU_TRACE -struct rcupreempt_trace; -extern long *rcupreempt_flipctr(int cpu); -extern long rcupreempt_data_completed(void); -extern int rcupreempt_flip_flag(int cpu); -extern int rcupreempt_mb_flag(int cpu); -extern char *rcupreempt_try_flip_state_name(void); -extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); -#endif - -struct softirq_action; - -#ifdef CONFIG_NO_HZ -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); -#else -# define rcu_enter_nohz() do { } while (0) -# define rcu_exit_nohz() do { } while (0) -#endif - -/* - * A context switch is a grace period for rcupreempt synchronize_rcu() - * only during early boot, before the scheduler has been initialized. - * So, how the heck do we get a context switch? Well, if the caller - * invokes synchronize_rcu(), they are willing to accept a context - * switch, so we simply pretend that one happened. 
- * - * After boot, there might be a blocked or preempted task in an RCU - * read-side critical section, so we cannot then take the fastpath. - */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1 && !rcu_scheduler_active; -} - -#endif /* __LINUX_RCUPREEMPT_H */ diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h deleted file mode 100644 index b99ae073192..00000000000 --- a/include/linux/rcupreempt_trace.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (RT implementation) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Paul McKenney - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of the Preemptible Read-Copy Update mechanism see - - * http://lwn.net/Articles/253651/ - */ - -#ifndef __LINUX_RCUPREEMPT_TRACE_H -#define __LINUX_RCUPREEMPT_TRACE_H - -#include -#include - -#include - -/* - * PREEMPT_RCU data structures. 
- */ - -struct rcupreempt_trace { - long next_length; - long next_add; - long wait_length; - long wait_add; - long done_length; - long done_add; - long done_remove; - atomic_t done_invoked; - long rcu_check_callbacks; - atomic_t rcu_try_flip_1; - atomic_t rcu_try_flip_e1; - long rcu_try_flip_i1; - long rcu_try_flip_ie1; - long rcu_try_flip_g1; - long rcu_try_flip_a1; - long rcu_try_flip_ae1; - long rcu_try_flip_a2; - long rcu_try_flip_z1; - long rcu_try_flip_ze1; - long rcu_try_flip_z2; - long rcu_try_flip_m1; - long rcu_try_flip_me1; - long rcu_try_flip_m2; -}; - -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(fn, arg) fn(arg); -#else -#define RCU_TRACE(fn, arg) -#endif - -extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace); -extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace); - -#endif /* __LINUX_RCUPREEMPT_TRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d7f98f637a2..bfca26d63b1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,11 +1205,6 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; -#ifdef CONFIG_PREEMPT_RCU - int rcu_read_lock_nesting; - int rcu_flipctr_idx; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - #ifdef CONFIG_TREE_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; @@ -1744,14 +1739,6 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } -#elif defined(CONFIG_PREEMPT_RCU) - -static inline void rcu_copy_process(struct task_struct *p) -{ - p->rcu_read_lock_nesting = 0; - p->rcu_flipctr_idx = 0; -} - #else static inline void rcu_copy_process(struct task_struct *p) diff --git a/init/Kconfig b/init/Kconfig index f88da2d1c1f..8e8b76d8a27 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -324,17 +324,6 @@ config TREE_RCU thousands of CPUs. It also scales down nicely to smaller systems. -config PREEMPT_RCU - bool "Preemptible RCU" - depends on PREEMPT - help - This option reduces the latency of the kernel by making certain - RCU sections preemptible. Normally RCU code is non-preemptible, if - this option is selected then read-only RCU sections become - preemptible. 
This helps latency, but may expose bugs due to - now-naive assumptions about each RCU read-side critical section - remaining on a given CPU through its execution. - config TREE_PREEMPT_RCU bool "Preemptable tree-based hierarchical RCU" depends on PREEMPT @@ -348,7 +337,7 @@ endchoice config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || PREEMPT_RCU || TREE_PREEMPT_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. @@ -395,13 +384,6 @@ config TREE_RCU_TRACE TREE_PREEMPT_RCU implementations, permitting Makefile to trivially select kernel/rcutree_trace.c. -config PREEMPT_RCU_TRACE - def_bool RCU_TRACE && PREEMPT_RCU - select DEBUG_FS - help - This option provides tracing for the PREEMPT_RCU implementation, - permitting Makefile to trivially select kernel/rcupreempt_trace.c. - endmenu # "RCU Subsystem" config IKCONFIG diff --git a/kernel/Makefile b/kernel/Makefile index 1a38b4789dd..b833bd5cc12 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,9 +82,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o -obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o -obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c deleted file mode 100644 index 0053ce56e32..00000000000 --- a/kernel/rcupreempt.c +++ /dev/null @@ -1,1518 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, realtime implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2006 - * - * Authors: Paul E. McKenney - * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar - * for pushing me away from locks and towards counters, and - * to Suparna Bhattacharya for pushing me completely away - * from atomic instructions on the read side. - * - * - Added handling of Dynamic Ticks - * Copyright 2007 - Paul E. Mckenney - * - Steven Rostedt - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * Design Document: http://lwn.net/Articles/253651/ - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * PREEMPT_RCU data structures. - */ - -/* - * GP_STAGES specifies the number of times the state machine has - * to go through the all the rcu_try_flip_states (see below) - * in a single Grace Period. 
- * - * GP in GP_STAGES stands for Grace Period ;) - */ -#define GP_STAGES 2 -struct rcu_data { - spinlock_t lock; /* Protect rcu_data fields. */ - long completed; /* Number of last completed batch. */ - int waitlistcount; - struct rcu_head *nextlist; - struct rcu_head **nexttail; - struct rcu_head *waitlist[GP_STAGES]; - struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; /* from waitlist & waitschedlist */ - struct rcu_head **donetail; - long rcu_flipctr[2]; - struct rcu_head *nextschedlist; - struct rcu_head **nextschedtail; - struct rcu_head *waitschedlist; - struct rcu_head **waitschedtail; - int rcu_sched_sleeping; -#ifdef CONFIG_RCU_TRACE - struct rcupreempt_trace trace; -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -/* - * States for rcu_try_flip() and friends. - */ - -enum rcu_try_flip_states { - - /* - * Stay here if nothing is happening. Flip the counter if somthing - * starts happening. Denoted by "I" - */ - rcu_try_flip_idle_state, - - /* - * Wait here for all CPUs to notice that the counter has flipped. This - * prevents the old set of counters from ever being incremented once - * we leave this state, which in turn is necessary because we cannot - * test any individual counter for zero -- we can only check the sum. - * Denoted by "A". - */ - rcu_try_flip_waitack_state, - - /* - * Wait here for the sum of the old per-CPU counters to reach zero. - * Denoted by "Z". - */ - rcu_try_flip_waitzero_state, - - /* - * Wait here for each of the other CPUs to execute a memory barrier. - * This is necessary to ensure that these other CPUs really have - * completed executing their RCU read-side critical sections, despite - * their CPUs wildly reordering memory. Denoted by "M". - */ - rcu_try_flip_waitmb_state, -}; - -/* - * States for rcu_ctrlblk.rcu_sched_sleep. - */ - -enum rcu_sched_sleep_states { - rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ - rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ - rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ -}; - -struct rcu_ctrlblk { - spinlock_t fliplock; /* Protect state-machine transitions. */ - long completed; /* Number of last completed batch. */ - enum rcu_try_flip_states rcu_try_flip_state; /* The current state of - the rcu state machine */ - spinlock_t schedlock; /* Protect rcu_sched sleep state. */ - enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ - wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. 
*/ -}; - -struct rcu_dyntick_sched { - int dynticks; - int dynticks_snap; - int sched_qs; - int sched_qs_snap; - int sched_dynticks_snap; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { - .dynticks = 1, -}; - -static int rcu_pending(int cpu); - -void rcu_sched_qs(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs++; -} - -#ifdef CONFIG_NO_HZ - -void rcu_enter_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - __get_cpu_var(rcu_dyntick_sched).dynticks++; - WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); -} - -void rcu_exit_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - __get_cpu_var(rcu_dyntick_sched).dynticks++; - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ - WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), - &rs); -} - -#endif /* CONFIG_NO_HZ */ - - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); - -static struct rcu_ctrlblk rcu_ctrlblk = { - .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), - .completed = 0, - .rcu_try_flip_state = rcu_try_flip_idle_state, - .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), - .sched_sleep = rcu_sched_not_sleeping, - .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), -}; - -static struct task_struct *rcu_sched_grace_period_task; - -#ifdef CONFIG_RCU_TRACE -static char *rcu_try_flip_state_names[] = - { "idle", "waitack", "waitzero", "waitmb" }; -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly - = CPU_BITS_NONE; - -/* - * Enum and per-CPU flag to determine when each CPU has seen - * the most recent counter flip. - */ - -enum rcu_flip_flag_values { - rcu_flip_seen, /* Steady/initial state, last flip seen. */ - /* Only GP detector can update. */ - rcu_flipped /* Flip just completed, need confirmation. */ - /* Only corresponding CPU can update. */ -}; -static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) - = rcu_flip_seen; - -/* - * Enum and per-CPU flag to determine when each CPU has executed the - * needed memory barrier to fence in memory references from its last RCU - * read-side critical section in the just-completed grace period. - */ - -enum rcu_mb_flag_values { - rcu_mb_done, /* Steady/initial state, no mb()s required. */ - /* Only GP detector can update. */ - rcu_mb_needed /* Flip just completed, need an mb(). */ - /* Only corresponding CPU can update. */ -}; -static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) - = rcu_mb_done; - -/* - * RCU_DATA_ME: find the current CPU's rcu_data structure. - * RCU_DATA_CPU: find the specified CPU's rcu_data structure. - */ -#define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) -#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) - -/* - * Helper macro for tracing when the appropriate rcu_data is not - * cached in a local variable, but where the CPU number is so cached. - */ -#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); - -/* - * Helper macro for tracing when the appropriate rcu_data is not - * cached in a local variable. - */ -#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); - -/* - * Helper macro for tracing when the appropriate rcu_data is pointed - * to by a local variable. 
- */ -#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); - -#define RCU_SCHED_BATCH_TIME (HZ / 50) - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -void __rcu_read_lock(void) -{ - int idx; - struct task_struct *t = current; - int nesting; - - nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); - if (nesting != 0) { - - /* An earlier rcu_read_lock() covers us, just count it. */ - - t->rcu_read_lock_nesting = nesting + 1; - - } else { - unsigned long flags; - - /* - * We disable interrupts for the following reasons: - * - If we get scheduling clock interrupt here, and we - * end up acking the counter flip, it's like a promise - * that we will never increment the old counter again. - * Thus we will break that promise if that - * scheduling clock interrupt happens between the time - * we pick the .completed field and the time that we - * increment our counter. - * - * - We don't want to be preempted out here. - * - * NMIs can still occur, of course, and might themselves - * contain rcu_read_lock(). - */ - - local_irq_save(flags); - - /* - * Outermost nesting of rcu_read_lock(), so increment - * the current counter for the current CPU. Use volatile - * casts to prevent the compiler from reordering. - */ - - idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; - ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; - - /* - * Now that the per-CPU counter has been incremented, we - * are protected from races with rcu_read_lock() invoked - * from NMI handlers on this CPU. We can therefore safely - * increment the nesting counter, relieving further NMIs - * of the need to increment the per-CPU counter. - */ - - ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; - - /* - * Now that we have preventing any NMIs from storing - * to the ->rcu_flipctr_idx, we can safely use it to - * remember which counter to decrement in the matching - * rcu_read_unlock(). - */ - - ACCESS_ONCE(t->rcu_flipctr_idx) = idx; - local_irq_restore(flags); - } -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -void __rcu_read_unlock(void) -{ - int idx; - struct task_struct *t = current; - int nesting; - - nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); - if (nesting > 1) { - - /* - * We are still protected by the enclosing rcu_read_lock(), - * so simply decrement the counter. - */ - - t->rcu_read_lock_nesting = nesting - 1; - - } else { - unsigned long flags; - - /* - * Disable local interrupts to prevent the grace-period - * detection state machine from seeing us half-done. - * NMIs can still occur, of course, and might themselves - * contain rcu_read_lock() and rcu_read_unlock(). - */ - - local_irq_save(flags); - - /* - * Outermost nesting of rcu_read_unlock(), so we must - * decrement the current counter for the current CPU. - * This must be done carefully, because NMIs can - * occur at any point in this code, and any rcu_read_lock() - * and rcu_read_unlock() pairs in the NMI handlers - * must interact non-destructively with this code. - * Lots of volatile casts, and -very- careful ordering. - * - * Changes to this code, including this one, must be - * inspected, validated, and tested extremely carefully!!! - */ - - /* - * First, pick up the index. - */ - - idx = ACCESS_ONCE(t->rcu_flipctr_idx); - - /* - * Now that we have fetched the counter index, it is - * safe to decrement the per-task RCU nesting counter. 
- * After this, any interrupts or NMIs will increment and - * decrement the per-CPU counters. - */ - ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; - - /* - * It is now safe to decrement this task's nesting count. - * NMIs that occur after this statement will route their - * rcu_read_lock() calls through this "else" clause, and - * will thus start incrementing the per-CPU counter on - * their own. They will also clobber ->rcu_flipctr_idx, - * but that is OK, since we have already fetched it. - */ - - ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; - local_irq_restore(flags); - } -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -/* - * If a global counter flip has occurred since the last time that we - * advanced callbacks, advance them. Hardware interrupts must be - * disabled when calling this function. - */ -static void __rcu_advance_callbacks(struct rcu_data *rdp) -{ - int cpu; - int i; - int wlc = 0; - - if (rdp->completed != rcu_ctrlblk.completed) { - if (rdp->waitlist[GP_STAGES - 1] != NULL) { - *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; - rdp->donetail = rdp->waittail[GP_STAGES - 1]; - RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); - } - for (i = GP_STAGES - 2; i >= 0; i--) { - if (rdp->waitlist[i] != NULL) { - rdp->waitlist[i + 1] = rdp->waitlist[i]; - rdp->waittail[i + 1] = rdp->waittail[i]; - wlc++; - } else { - rdp->waitlist[i + 1] = NULL; - rdp->waittail[i + 1] = - &rdp->waitlist[i + 1]; - } - } - if (rdp->nextlist != NULL) { - rdp->waitlist[0] = rdp->nextlist; - rdp->waittail[0] = rdp->nexttail; - wlc++; - rdp->nextlist = NULL; - rdp->nexttail = &rdp->nextlist; - RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); - } else { - rdp->waitlist[0] = NULL; - rdp->waittail[0] = &rdp->waitlist[0]; - } - rdp->waitlistcount = wlc; - rdp->completed = rcu_ctrlblk.completed; - } - - /* - * Check to see if this CPU needs to report that it has seen - * the most recent counter flip, thereby declaring that all - * subsequent rcu_read_lock() invocations will respect this flip. - */ - - cpu = raw_smp_processor_id(); - if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { - smp_mb(); /* Subsequent counter accesses must see new value */ - per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; - smp_mb(); /* Subsequent RCU read-side critical sections */ - /* seen -after- acknowledgement. */ - } -} - -#ifdef CONFIG_NO_HZ -static DEFINE_PER_CPU(int, rcu_update_flag); - -/** - * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. - * - * If the CPU was idle with dynamic ticks active, this updates the - * rcu_dyntick_sched.dynticks to let the RCU handling know that the - * CPU is active. - */ -void rcu_irq_enter(void) -{ - int cpu = smp_processor_id(); - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - if (per_cpu(rcu_update_flag, cpu)) - per_cpu(rcu_update_flag, cpu)++; - - /* - * Only update if we are coming from a stopped ticks mode - * (rcu_dyntick_sched.dynticks is even). - */ - if (!in_interrupt() && - (rdssp->dynticks & 0x1) == 0) { - /* - * The following might seem like we could have a race - * with NMI/SMIs. But this really isn't a problem. - * Here we do a read/modify/write, and the race happens - * when an NMI/SMI comes in after the read and before - * the write. But NMI/SMIs will increment this counter - * twice before returning, so the zero bit will not - * be corrupted by the NMI/SMI which is the most important - * part. - * - * The only thing is that we would bring back the counter - * to a postion that it was in during the NMI/SMI. 
- * But the zero bit would be set, so the rest of the - * counter would again be ignored. - * - * On return from the IRQ, the counter may have the zero - * bit be 0 and the counter the same as the return from - * the NMI/SMI. If the state machine was so unlucky to - * see that, it still doesn't matter, since all - * RCU read-side critical sections on this CPU would - * have already completed. - */ - rdssp->dynticks++; - /* - * The following memory barrier ensures that any - * rcu_read_lock() primitives in the irq handler - * are seen by other CPUs to follow the above - * increment to rcu_dyntick_sched.dynticks. This is - * required in order for other CPUs to correctly - * determine when it is safe to advance the RCU - * grace-period state machine. - */ - smp_mb(); /* see above block comment. */ - /* - * Since we can't determine the dynamic tick mode from - * the rcu_dyntick_sched.dynticks after this routine, - * we use a second flag to acknowledge that we came - * from an idle state with ticks stopped. - */ - per_cpu(rcu_update_flag, cpu)++; - /* - * If we take an NMI/SMI now, they will also increment - * the rcu_update_flag, and will not update the - * rcu_dyntick_sched.dynticks on exit. That is for - * this IRQ to do. - */ - } -} - -/** - * rcu_irq_exit - Called from exiting Hard irq context. - * - * If the CPU was idle with dynamic ticks active, update the - * rcu_dyntick_sched.dynticks to let the RCU handling be - * aware that the CPU is going back to idle with no ticks. - */ -void rcu_irq_exit(void) -{ - int cpu = smp_processor_id(); - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - /* - * rcu_update_flag is set if we interrupted the CPU - * when it was idle with ticks stopped. - * Once this occurs, we keep track of interrupt nesting - * because a NMI/SMI could also come in, and we still - * only want the IRQ that started the increment of the - * rcu_dyntick_sched.dynticks to be the one that modifies - * it on exit. - */ - if (per_cpu(rcu_update_flag, cpu)) { - if (--per_cpu(rcu_update_flag, cpu)) - return; - - /* This must match the interrupt nesting */ - WARN_ON(in_interrupt()); - - /* - * If an NMI/SMI happens now we are still - * protected by the rcu_dyntick_sched.dynticks being odd. - */ - - /* - * The following memory barrier ensures that any - * rcu_read_unlock() primitives in the irq handler - * are seen by other CPUs to preceed the following - * increment to rcu_dyntick_sched.dynticks. This - * is required in order for other CPUs to determine - * when it is safe to advance the RCU grace-period - * state machine. - */ - smp_mb(); /* see above block comment. */ - rdssp->dynticks++; - WARN_ON(rdssp->dynticks & 0x1); - } -} - -void rcu_nmi_enter(void) -{ - rcu_irq_enter(); -} - -void rcu_nmi_exit(void) -{ - rcu_irq_exit(); -} - -static void dyntick_save_progress_counter(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->dynticks_snap = rdssp->dynticks; -} - -static inline int -rcu_try_flip_waitack_needed(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot be in the middle of an rcu_read_lock(), so - * the next rcu_read_lock() it executes must use the new value - * of the counter. 
So we can safely pretend that this CPU - * already acknowledged the counter. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq handlers, then, as above, we can safely pretend - * that this CPU already acknowledged the counter. - */ - - if ((curr - snap) > 2 || (curr & 0x1) == 0) - return 0; - - /* We need this CPU to explicitly acknowledge the counter flip. */ - - return 1; -} - -static inline int -rcu_try_flip_waitmb_needed(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot have executed an RCU read-side critical section - * during that time, so there is no need for it to execute a - * memory barrier. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU either entered or exited an outermost interrupt, - * SMI, NMI, or whatever handler, then we know that it executed - * a memory barrier when doing so. So we don't need another one. - */ - if (curr != snap) - return 0; - - /* We need the CPU to execute a memory barrier. */ - - return 1; -} - -static void dyntick_save_progress_counter_sched(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_dynticks_snap = rdssp->dynticks; -} - -static int rcu_qsctr_inc_needed_dyntick(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->sched_dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot be in the middle of an rcu_read_lock(), so - * the next rcu_read_lock() it executes must use the new value - * of the counter. Therefore, this CPU has been in a quiescent - * state the entire time, and we don't need to wait for it. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq handlers, then, as above, this CPU has already - * passed through a quiescent state. - */ - - if ((curr - snap) > 2 || (snap & 0x1) == 0) - return 0; - - /* We need this CPU to go through a quiescent state. */ - - return 1; -} - -#else /* !CONFIG_NO_HZ */ - -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) - -# define dyntick_save_progress_counter_sched(cpu) do { } while (0) -# define rcu_qsctr_inc_needed_dyntick(cpu) (1) - -#endif /* CONFIG_NO_HZ */ - -static void save_qsctr_sched(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs_snap = rdssp->sched_qs; -} - -static inline int rcu_qsctr_inc_needed(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - /* - * If there has been a quiescent state, no more need to wait - * on this CPU. - */ - - if (rdssp->sched_qs != rdssp->sched_qs_snap) { - smp_mb(); /* force ordering with cpu entering schedule(). 
*/ - return 0; - } - - /* We need this CPU to go through a quiescent state. */ - - return 1; -} - -/* - * Get here when RCU is idle. Decide whether we need to - * move out of idle state, and return non-zero if so. - * "Straightforward" approach for the moment, might later - * use callback-list lengths, grace-period duration, or - * some such to determine when to exit idle state. - * Might also need a pre-idle test that does not acquire - * the lock, but let's get the simple case working first... - */ - -static int -rcu_try_flip_idle(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); - if (!rcu_pending(smp_processor_id())) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); - return 0; - } - - /* - * Do the flip. - */ - - RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); - rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ - - /* - * Need a memory barrier so that other CPUs see the new - * counter value before they see the subsequent change of all - * the rcu_flip_flag instances to rcu_flipped. - */ - - smp_mb(); /* see above block comment. */ - - /* Now ask each CPU for acknowledgement of the flip. */ - - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { - per_cpu(rcu_flip_flag, cpu) = rcu_flipped; - dyntick_save_progress_counter(cpu); - } - - return 1; -} - -/* - * Wait for CPUs to acknowledge the flip. - */ - -static int -rcu_try_flip_waitack(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) - if (rcu_try_flip_waitack_needed(cpu) && - per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); - return 0; - } - - /* - * Make sure our checks above don't bleed into subsequent - * waiting for the sum of the counters to reach zero. - */ - - smp_mb(); /* see above block comment. */ - RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); - return 1; -} - -/* - * Wait for collective ``last'' counter to reach zero, - * then tell all CPUs to do an end-of-grace-period memory barrier. - */ - -static int -rcu_try_flip_waitzero(void) -{ - int cpu; - int lastidx = !(rcu_ctrlblk.completed & 0x1); - int sum = 0; - - /* Check to see if the sum of the "last" counters is zero. */ - - RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_possible_cpu(cpu) - sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; - if (sum != 0) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); - return 0; - } - - /* - * This ensures that the other CPUs see the call for - * memory barriers -after- the sum to zero has been - * detected here - */ - smp_mb(); /* ^^^^^^^^^^^^ */ - - /* Call for a memory barrier from each CPU. */ - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { - per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; - dyntick_save_progress_counter(cpu); - } - - RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); - return 1; -} - -/* - * Wait for all CPUs to do their end-of-grace-period memory barrier. - * Return 0 once all CPUs have done so. - */ - -static int -rcu_try_flip_waitmb(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) - if (rcu_try_flip_waitmb_needed(cpu) && - per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); - return 0; - } - - smp_mb(); /* Ensure that the above checks precede any following flip. */ - RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); - return 1; -} - -/* - * Attempt a single flip of the counters. Remember, a single flip does - * -not- constitute a grace period. 
Instead, the interval between - * at least GP_STAGES consecutive flips is a grace period. - * - * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation - * on a large SMP, they might want to use a hierarchical organization of - * the per-CPU-counter pairs. - */ -static void rcu_try_flip(void) -{ - unsigned long flags; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_1); - if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); - return; - } - - /* - * Take the next transition(s) through the RCU grace-period - * flip-counter state machine. - */ - - switch (rcu_ctrlblk.rcu_try_flip_state) { - case rcu_try_flip_idle_state: - if (rcu_try_flip_idle()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitack_state; - break; - case rcu_try_flip_waitack_state: - if (rcu_try_flip_waitack()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitzero_state; - break; - case rcu_try_flip_waitzero_state: - if (rcu_try_flip_waitzero()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitmb_state; - break; - case rcu_try_flip_waitmb_state: - if (rcu_try_flip_waitmb()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_idle_state; - } - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); -} - -/* - * Check to see if this CPU needs to do a memory barrier in order to - * ensure that any prior RCU read-side critical sections have committed - * their counter manipulations and critical-section memory references - * before declaring the grace period to be completed. - */ -static void rcu_check_mb(int cpu) -{ - if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { - smp_mb(); /* Ensure RCU read-side accesses are visible. */ - per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; - } -} - -void rcu_check_callbacks(int cpu, int user) -{ - unsigned long flags; - struct rcu_data *rdp; - - if (!rcu_pending(cpu)) - return; /* if nothing for RCU to do. */ - - /* - * If this CPU took its interrupt from user mode or from the - * idle loop, and this is not a nested interrupt, then - * this CPU has to have exited all prior preept-disable - * sections of code. So invoke rcu_sched_qs() to note this. - * - * The memory barrier is needed to handle the case where - * writes from a preempt-disable section of code get reordered - * into schedule() by this CPU's write buffer. So the memory - * barrier makes sure that the rcu_sched_qs() is seen by other - * CPUs to happen after any such write. - */ - - rdp = RCU_DATA_CPU(cpu); - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - smp_mb(); /* Guard against aggressive schedule(). 
*/ - rcu_sched_qs(cpu); - } - - rcu_check_mb(cpu); - if (rcu_ctrlblk.completed == rdp->completed) - rcu_try_flip(); - spin_lock_irqsave(&rdp->lock, flags); - RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); - __rcu_advance_callbacks(rdp); - if (rdp->donelist == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); - } else { - spin_unlock_irqrestore(&rdp->lock, flags); - raise_softirq(RCU_SOFTIRQ); - } -} - -/* - * Needed by dynticks, to make sure all RCU processing has finished - * when we go idle: - */ -void rcu_advance_callbacks(int cpu, int user) -{ - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - if (rcu_ctrlblk.completed == rdp->completed) { - rcu_try_flip(); - if (rcu_ctrlblk.completed == rdp->completed) - return; - } - spin_lock_irqsave(&rdp->lock, flags); - RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); - __rcu_advance_callbacks(rdp); - spin_unlock_irqrestore(&rdp->lock, flags); -} - -#ifdef CONFIG_HOTPLUG_CPU -#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ - *dsttail = srclist; \ - if (srclist != NULL) { \ - dsttail = srctail; \ - srclist = NULL; \ - srctail = &srclist;\ - } \ - } while (0) - -void rcu_offline_cpu(int cpu) -{ - int i; - struct rcu_head *list = NULL; - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - struct rcu_head *schedlist = NULL; - struct rcu_head **schedtail = &schedlist; - struct rcu_head **tail = &list; - - /* - * Remove all callbacks from the newly dead CPU, retaining order. - * Otherwise rcu_barrier() will fail - */ - - spin_lock_irqsave(&rdp->lock, flags); - rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); - for (i = GP_STAGES - 1; i >= 0; i--) - rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], - list, tail); - rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); - rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, - schedlist, schedtail); - rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, - schedlist, schedtail); - rdp->rcu_sched_sleeping = 0; - spin_unlock_irqrestore(&rdp->lock, flags); - rdp->waitlistcount = 0; - - /* Disengage the newly dead CPU from the grace-period computation. */ - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - rcu_check_mb(cpu); - if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { - smp_mb(); /* Subsequent counter accesses must see new value */ - per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; - smp_mb(); /* Subsequent RCU read-side critical sections */ - /* seen -after- acknowledgement. */ - } - - cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); - - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); - - /* - * Place the removed callbacks on the current CPU's queue. - * Make them all start a new grace period: simple approach, - * in theory could starve a given set of callbacks, but - * you would need to be doing some serious CPU hotplugging - * to make this happen. If this becomes a problem, adding - * a synchronize_rcu() to the hotplug path would be a simple - * fix. - */ - - local_irq_save(flags); /* disable preempt till we know what lock. 
*/ - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - *rdp->nexttail = list; - if (list) - rdp->nexttail = tail; - *rdp->nextschedtail = schedlist; - if (schedlist) - rdp->nextschedtail = schedtail; - spin_unlock_irqrestore(&rdp->lock, flags); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -void rcu_offline_cpu(int cpu) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -void __cpuinit rcu_online_cpu(int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); - - /* - * The rcu_sched grace-period processing might have bypassed - * this CPU, given that it was not in the rcu_cpu_online_map - * when the grace-period scan started. This means that the - * grace-period task might sleep. So make sure that if this - * should happen, the first callback posted to this CPU will - * wake up the grace-period task if need be. - */ - - rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); - rdp->rcu_sched_sleeping = 1; - spin_unlock_irqrestore(&rdp->lock, flags); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - unsigned long flags; - struct rcu_head *next, *list; - struct rcu_data *rdp; - - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - list = rdp->donelist; - if (list == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); - return; - } - rdp->donelist = NULL; - rdp->donetail = &rdp->donelist; - RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); - while (list) { - next = list->next; - list->func(list); - list = next; - RCU_TRACE_ME(rcupreempt_trace_invoke); - } -} - -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - __rcu_advance_callbacks(rdp); - *rdp->nexttail = head; - rdp->nexttail = &head->next; - RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - int wake_gp = 0; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - *rdp->nextschedtail = head; - rdp->nextschedtail = &head->next; - if (rdp->rcu_sched_sleeping) { - - /* Grace-period processing might be sleeping... */ - - rdp->rcu_sched_sleeping = 0; - wake_gp = 1; - } - spin_unlock_irqrestore(&rdp->lock, flags); - if (wake_gp) { - - /* Wake up grace-period processing, unless someone beat us. */ - - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) - wake_gp = 0; - rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - if (wake_gp) - wake_up_interruptible(&rcu_ctrlblk.sched_wq); - } -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Wait until all currently running preempt_disable() code segments - * (including hardware-irq-disable segments) complete. Note that - * in -rt this does -not- necessarily result in all currently executing - * interrupt -handlers- having completed. - */ -void __synchronize_sched(void) -{ - struct rcu_synchronize rcu; - - if (num_online_cpus() == 1) - return; /* blocking is gp if only one CPU! 
*/ - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(__synchronize_sched); - -/* - * kthread function that manages call_rcu_sched grace periods. - */ -static int rcu_sched_grace_period(void *arg) -{ - int couldsleep; /* might sleep after current pass. */ - int couldsleepnext = 0; /* might sleep after next pass. */ - int cpu; - unsigned long flags; - struct rcu_data *rdp; - int ret; - - /* - * Each pass through the following loop handles one - * rcu_sched grace period cycle. - */ - do { - /* Save each CPU's current state. */ - - for_each_online_cpu(cpu) { - dyntick_save_progress_counter_sched(cpu); - save_qsctr_sched(cpu); - } - - /* - * Sleep for about an RCU grace-period's worth to - * allow better batching and to consume less CPU. - */ - schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); - - /* - * If there was nothing to do last time, prepare to - * sleep at the end of the current grace period cycle. - */ - couldsleep = couldsleepnext; - couldsleepnext = 1; - if (couldsleep) { - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - } - - /* - * Wait on each CPU in turn to have either visited - * a quiescent state or been in dynticks-idle mode. - */ - for_each_online_cpu(cpu) { - while (rcu_qsctr_inc_needed(cpu) && - rcu_qsctr_inc_needed_dyntick(cpu)) { - /* resched_cpu(cpu); @@@ */ - schedule_timeout_interruptible(1); - } - } - - /* Advance callbacks for each CPU. */ - - for_each_online_cpu(cpu) { - - rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); - - /* - * We are running on this CPU irq-disabled, so no - * CPU can go offline until we re-enable irqs. - * The current CPU might have already gone - * offline (between the for_each_offline_cpu and - * the spin_lock_irqsave), but in that case all its - * callback lists will be empty, so no harm done. - * - * Advance the callbacks! We share normal RCU's - * donelist, since callbacks are invoked the - * same way in either case. - */ - if (rdp->waitschedlist != NULL) { - *rdp->donetail = rdp->waitschedlist; - rdp->donetail = rdp->waitschedtail; - - /* - * Next rcu_check_callbacks() will - * do the required raise_softirq(). - */ - } - if (rdp->nextschedlist != NULL) { - rdp->waitschedlist = rdp->nextschedlist; - rdp->waitschedtail = rdp->nextschedtail; - couldsleep = 0; - couldsleepnext = 0; - } else { - rdp->waitschedlist = NULL; - rdp->waitschedtail = &rdp->waitschedlist; - } - rdp->nextschedlist = NULL; - rdp->nextschedtail = &rdp->nextschedlist; - - /* Mark sleep intention. */ - - rdp->rcu_sched_sleeping = couldsleep; - - spin_unlock_irqrestore(&rdp->lock, flags); - } - - /* If we saw callbacks on the last scan, go deal with them. */ - - if (!couldsleep) - continue; - - /* Attempt to block... */ - - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { - - /* - * Someone posted a callback after we scanned. - * Go take care of it. - */ - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - couldsleepnext = 0; - continue; - } - - /* Block until the next person posts a callback. 
*/ - - rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; /* unused */ - __wait_event_interruptible(rcu_ctrlblk.sched_wq, - rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, - ret); - - couldsleepnext = 0; - - } while (!kthread_should_stop()); - - return (0); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. Assumes that notifiers would take care of handling any - * outstanding requests from the RCU core. - * - * This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - return (rdp->donelist != NULL || - !!rdp->waitlistcount || - rdp->nextlist != NULL || - rdp->nextschedlist != NULL || - rdp->waitschedlist != NULL); -} - -static int rcu_pending(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - /* The CPU has at least one callback queued somewhere. */ - - if (rdp->donelist != NULL || - !!rdp->waitlistcount || - rdp->nextlist != NULL || - rdp->nextschedlist != NULL || - rdp->waitschedlist != NULL) - return 1; - - /* The RCU core needs an acknowledgement from this CPU. */ - - if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || - (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) - return 1; - - /* This CPU has fallen behind the global grace-period number. */ - - if (rdp->completed != rcu_ctrlblk.completed) - return 1; - - /* Nothing needed from this CPU. */ - - return 0; -} - -int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -void __init __rcu_init(void) -{ - int cpu; - int i; - struct rcu_data *rdp; - - printk(KERN_NOTICE "Preemptible RCU implementation.\n"); - for_each_possible_cpu(cpu) { - rdp = RCU_DATA_CPU(cpu); - spin_lock_init(&rdp->lock); - rdp->completed = 0; - rdp->waitlistcount = 0; - rdp->nextlist = NULL; - rdp->nexttail = &rdp->nextlist; - for (i = 0; i < GP_STAGES; i++) { - rdp->waitlist[i] = NULL; - rdp->waittail[i] = &rdp->waitlist[i]; - } - rdp->donelist = NULL; - rdp->donetail = &rdp->donelist; - rdp->rcu_flipctr[0] = 0; - rdp->rcu_flipctr[1] = 0; - rdp->nextschedlist = NULL; - rdp->nextschedtail = &rdp->nextschedlist; - rdp->waitschedlist = NULL; - rdp->waitschedtail = &rdp->waitschedlist; - rdp->rcu_sched_sleeping = 0; - } - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -/* - * Late-boot-time RCU initialization that must wait until after scheduler - * has been initialized. 
- */ -void __init rcu_init_sched(void) -{ - rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, - NULL, - "rcu_sched_grace_period"); - WARN_ON(IS_ERR(rcu_sched_grace_period_task)); -} - -#ifdef CONFIG_RCU_TRACE -long *rcupreempt_flipctr(int cpu) -{ - return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; -} -EXPORT_SYMBOL_GPL(rcupreempt_flipctr); - -int rcupreempt_flip_flag(int cpu) -{ - return per_cpu(rcu_flip_flag, cpu); -} -EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); - -int rcupreempt_mb_flag(int cpu) -{ - return per_cpu(rcu_mb_flag, cpu); -} -EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); - -char *rcupreempt_try_flip_state_name(void) -{ - return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; -} -EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); - -struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - return &rdp->trace; -} -EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); - -#endif /* #ifdef RCU_TRACE */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c deleted file mode 100644 index 11640346a50..00000000000 --- a/kernel/rcupreempt_trace.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Read-Copy Update tracing for realtime implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright IBM Corporation, 2006 - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct mutex rcupreempt_trace_mutex; -static char *rcupreempt_trace_buf; -#define RCUPREEMPT_TRACE_BUF_SIZE 4096 - -void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) -{ - trace->done_length += trace->wait_length; - trace->done_add += trace->wait_length; - trace->wait_length = 0; -} -void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) -{ - trace->wait_length += trace->next_length; - trace->wait_add += trace->next_length; - trace->next_length = 0; -} -void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->rcu_try_flip_1); -} -void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->rcu_try_flip_e1); -} -void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_i1++; -} -void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ie1++; -} -void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_g1++; -} -void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_a1++; -} -void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ae1++; -} -void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_a2++; -} -void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_z1++; -} -void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ze1++; -} -void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_z2++; -} -void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_m1++; -} -void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_me1++; -} -void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_m2++; -} -void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) -{ - trace->rcu_check_callbacks++; -} -void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) -{ - trace->done_remove += trace->done_length; - trace->done_length = 0; -} -void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->done_invoked); -} -void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) -{ - trace->next_add++; - trace->next_length++; -} - -static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) -{ - struct rcupreempt_trace *cp; - int cpu; - - memset(sp, 0, sizeof(*sp)); - for_each_possible_cpu(cpu) { - cp = rcupreempt_trace_cpu(cpu); - sp->next_length += cp->next_length; - sp->next_add += cp->next_add; - sp->wait_length += cp->wait_length; - sp->wait_add += cp->wait_add; - sp->done_length += cp->done_length; - sp->done_add += cp->done_add; - sp->done_remove += cp->done_remove; - atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked); - sp->rcu_check_callbacks += cp->rcu_check_callbacks; - atomic_add(atomic_read(&cp->rcu_try_flip_1), - &sp->rcu_try_flip_1); - atomic_add(atomic_read(&cp->rcu_try_flip_e1), - &sp->rcu_try_flip_e1); - sp->rcu_try_flip_i1 += 
cp->rcu_try_flip_i1; - sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; - sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; - sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; - sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; - sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; - sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; - sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; - sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; - sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; - sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; - sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; - } -} - -static ssize_t rcustats_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct rcupreempt_trace trace; - ssize_t bcount; - int cnt = 0; - - rcupreempt_trace_sum(&trace); - mutex_lock(&rcupreempt_trace_mutex); - snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "ggp=%ld rcc=%ld\n", - rcu_batches_completed(), - trace.rcu_check_callbacks); - snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" - "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" - "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", - - trace.next_add, trace.next_length, - trace.wait_add, trace.wait_length, - trace.done_add, trace.done_length, - trace.done_remove, atomic_read(&trace.done_invoked), - atomic_read(&trace.rcu_try_flip_1), - atomic_read(&trace.rcu_try_flip_e1), - trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, - trace.rcu_try_flip_g1, - trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, - trace.rcu_try_flip_a2, - trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, - trace.rcu_try_flip_z2, - trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, - trace.rcu_try_flip_m2); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static ssize_t rcugp_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - long oldgp = rcu_batches_completed(); - ssize_t bcount; - - mutex_lock(&rcupreempt_trace_mutex); - synchronize_rcu(); - snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, - "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - int cnt = 0; - int cpu; - int f = rcu_batches_completed() & 0x1; - ssize_t bcount; - - mutex_lock(&rcupreempt_trace_mutex); - - cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, - "CPU last cur F M\n"); - for_each_possible_cpu(cpu) { - long *flipctr = rcupreempt_flipctr(cpu); - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "%3d%c %4ld %3ld %d %d\n", - cpu, - cpu_is_offline(cpu) ? '!' 
: ' ', - flipctr[!f], - flipctr[f], - rcupreempt_flip_flag(cpu), - rcupreempt_mb_flag(cpu)); - } - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "ggp = %ld, state = %s\n", - rcu_batches_completed(), - rcupreempt_try_flip_state_name()); - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "\n"); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static struct file_operations rcustats_fops = { - .owner = THIS_MODULE, - .read = rcustats_read, -}; - -static struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .read = rcugp_read, -}; - -static struct file_operations rcuctrs_fops = { - .owner = THIS_MODULE, - .read = rcuctrs_read, -}; - -static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; -static int rcupreempt_debugfs_init(void) -{ - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto out; - statdir = debugfs_create_file("rcustats", 0444, rcudir, - NULL, &rcustats_fops); - if (!statdir) - goto free_out; - - gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!gpdir) - goto free_out; - - ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, - NULL, &rcuctrs_fops); - if (!ctrsdir) - goto free_out; - return 0; -free_out: - if (statdir) - debugfs_remove(statdir); - if (gpdir) - debugfs_remove(gpdir); - debugfs_remove(rcudir); -out: - return 1; -} - -static int __init rcupreempt_trace_init(void) -{ - int ret; - - mutex_init(&rcupreempt_trace_mutex); - rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); - if (!rcupreempt_trace_buf) - return 1; - ret = rcupreempt_debugfs_init(); - if (ret) - kfree(rcupreempt_trace_buf); - return ret; -} - -static void __exit rcupreempt_trace_cleanup(void) -{ - debugfs_remove(statdir); - debugfs_remove(gpdir); - debugfs_remove(ctrsdir); - debugfs_remove(rcudir); - kfree(rcupreempt_trace_buf); -} - - -module_init(rcupreempt_trace_init); -module_exit(rcupreempt_trace_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f87fb0c8f92..82fbc49728d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -725,7 +725,7 @@ config RCU_TORTURE_TEST_RUNNABLE config RCU_CPU_STALL_DETECTOR bool "Check for stalled CPUs delaying RCU grace periods" - depends on CLASSIC_RCU || TREE_RCU || TREE_PREEMPT_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU default n help This option causes RCU to printk information on which -- cgit v1.2.3-70-g09d2 From d8e180dcd5bbbab9cd3ff2e779efcf70692ef541 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 20 Aug 2009 14:39:52 -0700 Subject: bsdacct: switch credentials for writing to the accounting file When process accounting is enabled, every exiting process writes a log to the account file. In addition, every once in a while one of the exiting processes checks whether there's enough free space for the log. SELinux policy may or may not allow the exiting process to stat the fs. So unsuspecting processes start generating AVC denials just because someone enabled process accounting. For these filesystem operations, the exiting process's credentials should be temporarily switched to that of the process which enabled accounting, because it's really that process which wanted to have the accounting information logged. 
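A minimal sketch of the credential-switching pattern described above, assuming only the kernel APIs the patch itself uses (override_creds(), revert_creds() and file->f_cred); the wrapper function and the free-space stub below are hypothetical names for illustration, not part of the patch:

    /* Hypothetical wrapper; stands in for do_acct_process(). */
    static void acct_write_record_sketch(struct file *file)
    {
            const struct cred *orig_cred;

            /* Do the fs checks and the write as whoever enabled accounting. */
            orig_cred = override_creds(file->f_cred);

            /* Hypothetical stand-in for check_free_space(). */
            if (!acct_has_free_space(file))
                    goto out;

            /* ... fill in the acct_t record and write it out ... */
    out:
            /* Restore the exiting task's own credentials. */
            revert_creds(orig_cred);
    }

With this pattern the free-space check and the write are judged (for example by SELinux) against the credentials of the process that enabled accounting, so unrelated exiting tasks no longer generate AVC denials they can do nothing about.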
Signed-off-by: Michal Schmidt Acked-by: David Howells Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: James Morris --- kernel/acct.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 9f3391090b3..9a4715a2f6b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, u64 run_time; struct timespec uptime; struct tty_struct *tty; + const struct cred *orig_cred; + + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); /* * First check to see if there is enough free_space to continue * the process accounting system. */ if (!check_free_space(acct, file)) - return; + goto out; /* * Fill the accounting struct with the needed info as recorded @@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); +out: + revert_creds(orig_cred); } /** -- cgit v1.2.3-70-g09d2 From 33f76148ced0e0618062e302d2a9614efdbd4a06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Aug 2009 09:42:01 -0700 Subject: rcu: Add CPU-offline processing for single-node configurations Add preemptable-RCU plugin to handle the CPU-offline processing. An additional plugin is forthcoming to handle multinode RCU trees, but this current plugin works for configurations up to 32 CPUs (64 CPUs for 64-bit kernels). Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12511321213336-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 2 ++ kernel/rcutree_plugin.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cc025571407..000b0760987 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,6 +84,7 @@ extern long rcu_batches_completed_sched(void); static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags); static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); static void __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void __call_rcu(struct rcu_head *head, @@ -920,6 +921,7 @@ static void rcu_offline_cpu(int cpu) { __rcu_offline_cpu(cpu, &rcu_sched_state); __rcu_offline_cpu(cpu, &rcu_bh_state); + rcu_preempt_offline_cpu(cpu); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index cd2ab67400c..201334cdc20 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -259,6 +259,18 @@ static int rcu_preempted_readers(struct rcu_node *rnp) return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); } +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Do CPU-offline processing for preemptable RCU. + */ +static void rcu_preempt_offline_cpu(int cpu) +{ + __rcu_offline_cpu(cpu, &rcu_preempt_state); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Check for a quiescent state from the current CPU. 
When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -395,6 +407,18 @@ static int rcu_preempted_readers(struct rcu_node *rnp) return 0; } +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptable RCU does not exist, it never needs CPU-offline + * processing. + */ +static void rcu_preempt_offline_cpu(int cpu) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Because preemptable RCU does not exist, it never has any callbacks * to check. -- cgit v1.2.3-70-g09d2 From 36d47481b3824b661b464077db95d447984df799 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 25 Aug 2009 15:08:30 +0900 Subject: timekeeping: Fix invalid getboottime() value Don't use timespec_add_safe() with wall_to_monotonic, because wall_to_monotonic has negative values which will cause overflow in timespec_add_safe(). That makes btime in /proc/stat invalid. Signed-off-by: Hiroshi Shimamoto Cc: Martin Schwidefsky Cc: John Stultz Cc: Daniel Walker LKML-Reference: <4A937FDE.4050506@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 03cbeb34d14..fb0f46fa1ec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -826,9 +826,11 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - struct timespec boottime; + struct timespec boottime = { + .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, + .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + }; - boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time); set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } -- cgit v1.2.3-70-g09d2 From fa289beca9de9119c7760bd984f3640da21bc94c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Aug 2009 15:17:20 +1000 Subject: perf_counter: Start counting time enabled when group leader gets enabled Currently, if a group is created where the group leader is initially disabled but a non-leader member is initially enabled, and then the leader is subsequently enabled some time later, the time_enabled for the non-leader member will reflect the whole time since it was created, not just the time since the leader was enabled. This is incorrect, because all of the members are effectively disabled while the leader is disabled, since none of the members can go on the PMU if the leader can't. Thus we have to update the ->tstamp_enabled for all the enabled group members when a group leader is enabled, so that the time_enabled computation only counts the time since the leader was enabled. Similarly, when disabling a group leader we have to update the time_enabled and time_running for all of the group members. Also, in update_counter_times, we have to treat a counter whose group leader is disabled as being disabled. 
Reported-by: Stephane Eranian Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: LKML-Reference: <19091.29664.342227.445006@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f274e195988..06bf6a4f260 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -469,7 +469,8 @@ static void update_counter_times(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; u64 run_end; - if (counter->state < PERF_COUNTER_STATE_INACTIVE) + if (counter->state < PERF_COUNTER_STATE_INACTIVE || + counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) return; counter->total_time_enabled = ctx->time - counter->tstamp_enabled; @@ -518,7 +519,7 @@ static void __perf_counter_disable(void *info) */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { update_context_time(ctx); - update_counter_times(counter); + update_group_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -573,7 +574,7 @@ static void perf_counter_disable(struct perf_counter *counter) * in, so we can change the state safely. */ if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; } @@ -850,6 +851,27 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Put a counter into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_counter_mark_enabled(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + struct perf_counter *sub; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + list_for_each_entry(sub, &counter->sibling_list, list_entry) + if (sub->state >= PERF_COUNTER_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + /* * Cross CPU call to enable a performance counter */ @@ -877,8 +899,7 @@ static void __perf_counter_enable(void *info) if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + __perf_counter_mark_enabled(counter, ctx); /* * If the counter is in a group and isn't the group leader, @@ -971,11 +992,9 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. 
*/ - if (counter->state == PERF_COUNTER_STATE_OFF) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - } + if (counter->state == PERF_COUNTER_STATE_OFF) + __perf_counter_mark_enabled(counter, ctx); + out: spin_unlock_irq(&ctx->lock); } @@ -1479,9 +1498,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task) counter->attr.enable_on_exec = 0; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; + __perf_counter_mark_enabled(counter, ctx); enabled = 1; } -- cgit v1.2.3-70-g09d2 From a4be7c2778d1fd8e0a8a5607bec91fa221ab2c91 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Aug 2009 11:18:27 +0200 Subject: perf_counter: Allow sharing of output channels Provide the ability to configure a counter to send its output to another (already existing) counter's output stream. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: stephane eranian Cc: Paul Mackerras LKML-Reference: <20090819092023.980284148@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +++ kernel/perf_counter.c | 83 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b53f7006cc4..e022b847c90 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -216,6 +216,7 @@ struct perf_counter_attr { #define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) #define PERF_COUNTER_IOC_RESET _IO ('$', 3) #define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) enum perf_counter_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, @@ -415,6 +416,9 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -536,6 +540,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; + struct perf_counter *output; const struct pmu *pmu; enum perf_counter_active_state state; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 06bf6a4f260..53abcbefa0b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,6 +1692,11 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_task_counters); } + if (counter->output) { + fput(counter->output->filp); + counter->output = NULL; + } + if (counter->destroy) counter->destroy(counter); @@ -1977,6 +1982,8 @@ unlock: return ret; } +int perf_counter_set_output(struct perf_counter *counter, int output_fd); + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -2000,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_PERIOD: return perf_counter_period(counter, (u64 __user *)arg); + case PERF_COUNTER_IOC_SET_OUTPUT: + return perf_counter_set_output(counter, arg); + default: return -ENOTTY; } @@ -2270,6 +2280,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->mmap_mutex); + if (counter->output) { + ret = -EINVAL; + goto unlock; + } + if 
(atomic_inc_not_zero(&counter->mmap_count)) { if (nr_pages != counter->data->nr_pages) ret = -EINVAL; @@ -2655,6 +2670,7 @@ static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, int nmi, int sample) { + struct perf_counter *output_counter; struct perf_mmap_data *data; unsigned int offset, head; int have_lost; @@ -2664,13 +2680,17 @@ static int perf_output_begin(struct perf_output_handle *handle, u64 lost; } lost_event; + rcu_read_lock(); /* * For inherited counters we send all the output towards the parent. */ if (counter->parent) counter = counter->parent; - rcu_read_lock(); + output_counter = rcu_dereference(counter->output); + if (output_counter) + counter = output_counter; + data = rcu_dereference(counter->data); if (!data) goto out; @@ -4218,6 +4238,57 @@ err_size: goto out; } +int perf_counter_set_output(struct perf_counter *counter, int output_fd) +{ + struct perf_counter *output_counter = NULL; + struct file *output_file = NULL; + struct perf_counter *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_counter = output_file->private_data; + + /* Don't chain output fds */ + if (output_counter->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (counter->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&counter->mmap_mutex); + old_output = counter->output; + rcu_assign_pointer(counter->output, output_counter); + mutex_unlock(&counter->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this counter. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -4240,7 +4311,7 @@ SYSCALL_DEFINE5(perf_counter_open, int ret; /* for future expandability... 
*/ - if (flags) + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) return -EINVAL; ret = perf_copy_attr(attr_uptr, &attr); @@ -4268,7 +4339,7 @@ SYSCALL_DEFINE5(perf_counter_open, * Look up the group leader (we will attach this counter to it): */ group_leader = NULL; - if (group_fd != -1) { + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { ret = -EINVAL; group_file = fget_light(group_fd, &fput_needed); if (!group_file) @@ -4310,6 +4381,12 @@ SYSCALL_DEFINE5(perf_counter_open, if (!counter_file) goto err_free_put_context; + if (flags & PERF_FLAG_FD_OUTPUT) { + ret = perf_counter_set_output(counter, group_fd); + if (ret) + goto err_free_put_context; + } + counter->filp = counter_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); -- cgit v1.2.3-70-g09d2 From d88cb582325830698de5071fa8b8c9e933dbbcad Mon Sep 17 00:00:00 2001 From: Anirban Sinha Date: Tue, 25 Aug 2009 07:00:02 -0700 Subject: tracing: Eliminate code duplication in kernel/tracepoint.c Signed-off-by: Anirban Sinha Reviewed-by: Li Zefan Cc: "Oleg Nesterov" LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 35dd27adb82..06f165a4408 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -555,9 +555,6 @@ int tracepoint_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); - break; case MODULE_STATE_GOING: tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); -- cgit v1.2.3-70-g09d2 From c935a331c8f569c7903ed26a3994a70cbea1802e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Aug 2009 08:40:25 -0700 Subject: rcu: Add #ifdef to suppress __rcu_offline_cpu() warning in !HOTPLUG_CPU builds Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Paul Mundt LKML-Reference: <20090825154025.GD6616@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 000b0760987..fee6316a867 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,7 +84,9 @@ extern long rcu_batches_completed_sched(void); static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags); static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); +#ifdef CONFIG_HOTPLUG_CPU static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void __call_rcu(struct rcu_head *head, -- cgit v1.2.3-70-g09d2 From 667000011927b4fcc359beac4a2447889db6d349 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:11 -0700 Subject: tracing: Rename FTRACE_SYSCALLS for tracepoints s/HAVE_FTRACE_SYSCALLS/HAVE_SYSCALL_TRACEPOINTS/g s/TIF_SYSCALL_FTRACE/TIF_SYSCALL_TRACEPOINT/g The syscall enter/exit tracing is no longer specific to just ftrace, so they now have names that reflect their tie to tracepoints instead. 
Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-2-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/s390/Kconfig | 2 +- arch/s390/defconfig | 2 +- arch/s390/include/asm/thread_info.h | 4 ++-- arch/s390/kernel/entry.S | 2 +- arch/s390/kernel/entry64.S | 2 +- arch/s390/kernel/ptrace.c | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- arch/x86/include/asm/thread_info.h | 13 +++++++------ arch/x86/kernel/ptrace.c | 4 ++-- kernel/trace/Kconfig | 4 ++-- kernel/tracepoint.c | 4 ++-- 13 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2ae5d72f47e..7238ef4c7a6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -84,7 +84,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER select HAVE_DEFAULT_NO_SPIN_MUTEXES diff --git a/arch/s390/defconfig b/arch/s390/defconfig index fcba206529f..4e91a2573cc 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_TRACING_SUPPORT=y CONFIG_FTRACE=y # CONFIG_FUNCTION_TRACER is not set diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index ba1cab9fc1f..07eb61b2fb3 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -92,7 +92,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ -#define TIF_SYSCALL_FTRACE 11 /* ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1<>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER STACK_SIZE = 1 << STACK_SHIFT diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index f6618e9e15e..3ceb53c9c49 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) #define BASED(name) name-system_call(%r13) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index c5e87d891ca..9d3dcfa79ea 100644 --- a/arch/s390/kernel/ptrace.c +++ 
b/arch/s390/kernel/ptrace.c @@ -664,7 +664,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) ret = -1; } - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_enter(regs, regs->gprs[2]); if (unlikely(current->audit_context)) @@ -682,7 +682,7 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_exit(regs, regs->gprs[2]); if (test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..d59cbf758f3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -37,7 +37,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index edb992ebef9..d28fad19654 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cee1dd2e69b..6c86acd847a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40b75f..6f7786aea4f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,7 +95,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -118,17 +118,17 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) +#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ - _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_FTRACE) + _TIF_SYSCALL_TRACEPOINT) 
/* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -137,7 +137,8 @@ struct thread_info { _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) +#define _TIF_ALLWORK_MASK \ + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 34dd6f15185..a909afef44f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1500,7 +1500,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { @@ -1526,7 +1526,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380fd76..06be85a7ef8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -41,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool -config HAVE_FTRACE_SYSCALLS +config HAVE_SYSCALL_TRACEPOINTS bool config TRACER_MAX_TRACE @@ -211,7 +211,7 @@ config ENABLE_DEFAULT_TRACERS config FTRACE_SYSCALLS bool "Trace syscalls" - depends on HAVE_FTRACE_SYSCALLS + depends on HAVE_SYSCALL_TRACEPOINTS select GENERIC_TRACER select KALLSYMS help diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 06f165a4408..be86b9a01a0 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -590,7 +590,7 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } @@ -608,7 +608,7 @@ void syscall_unregfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit v1.2.3-70-g09d2 From 3d27d8cb34fc156beb86de2338ca4029873a5cc6 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:12 -0700 Subject: tracing: Make syscall tracepoints conditional The syscall enter/exit tracepoints are only supported on archs that HAVE_SYSCALL_TRACEPOINTS, so the declarations should be #ifdef'ed. Also, the definition of syscall_regfunc and syscall_unregfunc should depend on this same config, rather than the ftrace-specific one. 
Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan LKML-Reference: <1251150194-1713-3-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- include/trace/syscall.h | 4 ++++ kernel/tracepoint.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 9661dd406b9..5dcb7e3a544 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -8,6 +8,8 @@ #include +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + extern void syscall_regfunc(void); extern void syscall_unregfunc(void); @@ -25,6 +27,8 @@ DECLARE_TRACE_WITH_CALLBACK(syscall_exit, syscall_unregfunc ); +#endif + /* * A syscall entry in the ftrace syscalls array. * diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index be86b9a01a0..9e0a36f0e2a 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -576,7 +576,7 @@ __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -#ifdef CONFIG_FTRACE_SYSCALLS +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS static DEFINE_MUTEX(regfunc_mutex); static int sys_tracepoint_refcount; -- cgit v1.2.3-70-g09d2 From 97419875865859fd2403e66266c02ce028e2f5ab Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:13 -0700 Subject: tracing: Move tracepoint callbacks from declaration to definition It's not strictly correct for the tracepoint reg/unreg callbacks to occur when a client is hooking up, because the actual tracepoint may not be present yet. This happens to be fine for syscall, since that's in the core kernel, but it would cause problems for tracepoints defined in a module that hasn't been loaded yet. It also means the reg/unreg has to be EXPORTed for any modules to use the tracepoint (as in SystemTap). This patch removes DECLARE_TRACE_WITH_CALLBACK, and instead introduces DEFINE_TRACE_FN which stores the callbacks in struct tracepoint. The callbacks are used now when the active state of the tracepoint changes in set_tracepoint & disable_tracepoint. This also introduces TRACE_EVENT_FN, so ftrace events can also provide registration callbacks if needed. 
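With this, a tracepoint that needs registration work passes its callbacks once, at definition time, e.g. for the syscall tracepoints (as in the diff below):

    DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc);
    DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc);

The callbacks then run from set_tracepoint()/disable_tracepoint() when the tracepoint's active state actually changes, not when an individual probe is registered.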
Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-4-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/s390/kernel/ptrace.c | 4 ++-- arch/x86/kernel/ptrace.c | 4 ++-- include/linux/tracepoint.h | 46 +++++++++++++++++--------------------------- include/trace/define_trace.h | 5 +++++ include/trace/ftrace.h | 9 +++++++++ include/trace/syscall.h | 12 ++++-------- kernel/tracepoint.c | 14 +++++++++----- 7 files changed, 49 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 9d3dcfa79ea..c05b44b80c2 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -51,8 +51,8 @@ #include "compat_ptrace.h" #endif -DEFINE_TRACE(syscall_enter); -DEFINE_TRACE(syscall_exit); +DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); +DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); enum s390_regset { REGSET_GENERAL, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a909afef44f..31e9b97ec4d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -37,8 +37,8 @@ #include -DEFINE_TRACE(syscall_enter); -DEFINE_TRACE(syscall_exit); +DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); +DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); #include "tls.h" diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 5984ed04c03..846a4ae501e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -23,6 +23,8 @@ struct tracepoint; struct tracepoint { const char *name; /* Tracepoint name */ int state; /* State. */ + void (*regfunc)(void); + void (*unregfunc)(void); void **funcs; } __attribute__((aligned(32))); /* * Aligned on 32 bytes because it is @@ -60,10 +62,8 @@ struct tracepoint { * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. - * An optional set of (un)registration functions can be passed to perform any - * additional (un)registration work. 
*/ -#define DECLARE_TRACE_WITH_CALLBACK(name, proto, args, reg, unreg) \ +#define DECLARE_TRACE(name, proto, args) \ extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ @@ -73,36 +73,23 @@ struct tracepoint { } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ - int ret; \ - void (*func)(void) = reg; \ - \ - ret = tracepoint_probe_register(#name, (void *)probe); \ - if (func && !ret) \ - func(); \ - return ret; \ + return tracepoint_probe_register(#name, (void *)probe); \ } \ static inline int unregister_trace_##name(void (*probe)(proto)) \ { \ - int ret; \ - void (*func)(void) = unreg; \ - \ - ret = tracepoint_probe_unregister(#name, (void *)probe);\ - if (func && !ret) \ - func(); \ - return ret; \ + return tracepoint_probe_unregister(#name, (void *)probe);\ } -#define DECLARE_TRACE(name, proto, args) \ - DECLARE_TRACE_WITH_CALLBACK(name, TP_PROTO(proto), TP_ARGS(args),\ - NULL, NULL); - -#define DEFINE_TRACE(name) \ +#define DEFINE_TRACE_FN(name, reg, unreg) \ static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"), aligned(32))) = \ - { __tpstrtab_##name, 0, NULL } + { __tpstrtab_##name, 0, reg, unreg, NULL } + +#define DEFINE_TRACE(name) \ + DEFINE_TRACE_FN(name, NULL, NULL); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name) @@ -113,7 +100,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end); #else /* !CONFIG_TRACEPOINTS */ -#define DECLARE_TRACE_WITH_CALLBACK(name, proto, args, reg, unreg) \ +#define DECLARE_TRACE(name, proto, args) \ static inline void _do_trace_##name(struct tracepoint *tp, proto) \ { } \ static inline void trace_##name(proto) \ @@ -127,10 +114,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, return -ENOSYS; \ } -#define DECLARE_TRACE(name, proto, args) \ - DECLARE_TRACE_WITH_CALLBACK(name, TP_PROTO(proto), TP_ARGS(args),\ - NULL, NULL); - +#define DEFINE_TRACE_FN(name, reg, unreg) #define DEFINE_TRACE(name) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) @@ -282,10 +266,16 @@ static inline void tracepoint_synchronize_unregister(void) * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in * /sys/kernel/debug/tracing/events/. + * + * A set of (un)registration functions can be passed to the variant + * TRACE_EVENT_FN to perform any (un)registration work. 
*/ #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FN(name, proto, args, struct, \ + assign, print, reg, unreg) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif #endif diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index f7a7ae1e8f9..2a969850736 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,6 +26,11 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) +#undef TRACE_EVENT_FN +#define TRACE_EVENT_FN(name, proto, args, tstruct, \ + assign, print, reg, unreg) \ + DEFINE_TRACE_FN(name, reg, unreg) + #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 127400255e4..3a0b44bdabf 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -42,6 +42,15 @@ }; \ static struct ftrace_event_call event_##name +/* Callbacks are meaningless to ftrace. */ +#undef TRACE_EVENT_FN +#define TRACE_EVENT_FN(name, proto, args, tstruct, \ + assign, print, reg, unreg) \ + TRACE_EVENT(name, TP_PROTO(proto), TP_ARGS(args), \ + TP_STRUCT__entry(tstruct), \ + TP_fast_assign(assign), \ + TP_printk(print)) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 5dcb7e3a544..4e194300185 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -13,18 +13,14 @@ extern void syscall_regfunc(void); extern void syscall_unregfunc(void); -DECLARE_TRACE_WITH_CALLBACK(syscall_enter, +DECLARE_TRACE(syscall_enter, TP_PROTO(struct pt_regs *regs, long id), - TP_ARGS(regs, id), - syscall_regfunc, - syscall_unregfunc + TP_ARGS(regs, id) ); -DECLARE_TRACE_WITH_CALLBACK(syscall_exit, +DECLARE_TRACE(syscall_exit, TP_PROTO(struct pt_regs *regs, long ret), - TP_ARGS(regs, ret), - syscall_regfunc, - syscall_unregfunc + TP_ARGS(regs, ret) ); #endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9e0a36f0e2a..1a6a453b7ef 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -243,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); + if (elem->regfunc && !elem->state && active) + elem->regfunc(); + else if (elem->unregfunc && elem->state && !active) + elem->unregfunc(); + /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new * probe callbacks array is consistent before setting a pointer to it. 
@@ -262,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { + if (elem->unregfunc && elem->state) + elem->unregfunc(); + elem->state = 0; rcu_assign_pointer(elem->funcs, NULL); } @@ -578,7 +586,7 @@ __initcall(init_tracepoints); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -static DEFINE_MUTEX(regfunc_mutex); +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; void syscall_regfunc(void) @@ -586,7 +594,6 @@ void syscall_regfunc(void) unsigned long flags; struct task_struct *g, *t; - mutex_lock(®func_mutex); if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -595,7 +602,6 @@ void syscall_regfunc(void) read_unlock_irqrestore(&tasklist_lock, flags); } sys_tracepoint_refcount++; - mutex_unlock(®func_mutex); } void syscall_unregfunc(void) @@ -603,7 +609,6 @@ void syscall_unregfunc(void) unsigned long flags; struct task_struct *g, *t; - mutex_lock(®func_mutex); sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); @@ -612,6 +617,5 @@ void syscall_unregfunc(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } - mutex_unlock(®func_mutex); } #endif -- cgit v1.2.3-70-g09d2 From 1c569f0264ea629c10bbab471dd0626ce4d3f19f Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:14 -0700 Subject: tracing: Create generic syscall TRACE_EVENTs This converts the syscall_enter/exit tracepoints into TRACE_EVENTs, so you can have generic ftrace events that capture all system calls with arguments and return values. These generic events are also renamed to sys_enter/exit, so they're more closely aligned to the specific sys_enter_foo events. Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-5-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/s390/kernel/ptrace.c | 8 ++--- arch/x86/kernel/ptrace.c | 12 +++---- include/trace/events/syscalls.h | 70 +++++++++++++++++++++++++++++++++++++++++ include/trace/syscall.h | 17 ---------- kernel/trace/trace_syscalls.c | 17 +++++----- 5 files changed, 88 insertions(+), 36 deletions(-) create mode 100644 include/trace/events/syscalls.h (limited to 'kernel') diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index c05b44b80c2..f3ddd7ac06c 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -51,8 +51,8 @@ #include "compat_ptrace.h" #endif -DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); -DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); +#define CREATE_TRACE_POINTS +#include enum s390_regset { REGSET_GENERAL, @@ -665,7 +665,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) } if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_enter(regs, regs->gprs[2]); + trace_sys_enter(regs, regs->gprs[2]); if (unlikely(current->audit_context)) audit_syscall_entry(is_compat_task() ? 
@@ -683,7 +683,7 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) regs->gprs[2]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_exit(regs, regs->gprs[2]); + trace_sys_exit(regs, regs->gprs[2]); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 31e9b97ec4d..8d7d5c9c1be 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -35,13 +35,11 @@ #include #include -#include - -DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); -DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); - #include "tls.h" +#define CREATE_TRACE_POINTS +#include + enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1501,7 +1499,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) ret = -1L; if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_enter(regs, regs->orig_ax); + trace_sys_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1527,7 +1525,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_exit(regs, regs->ax); + trace_sys_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h new file mode 100644 index 00000000000..397dff2dbd5 --- /dev/null +++ b/include/trace/events/syscalls.h @@ -0,0 +1,70 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM syscalls + +#if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENTS_SYSCALLS_H + +#include + +#include +#include + + +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + +extern void syscall_regfunc(void); +extern void syscall_unregfunc(void); + +TRACE_EVENT_FN(sys_enter, + + TP_PROTO(struct pt_regs *regs, long id), + + TP_ARGS(regs, id), + + TP_STRUCT__entry( + __field( long, id ) + __array( unsigned long, args, 6 ) + ), + + TP_fast_assign( + __entry->id = id; + syscall_get_arguments(current, regs, 0, 6, __entry->args); + ), + + TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", + __entry->id, + __entry->args[0], __entry->args[1], __entry->args[2], + __entry->args[3], __entry->args[4], __entry->args[5]), + + syscall_regfunc, syscall_unregfunc +); + +TRACE_EVENT_FN(sys_exit, + + TP_PROTO(struct pt_regs *regs, long ret), + + TP_ARGS(regs, ret), + + TP_STRUCT__entry( + __field( long, id ) + __field( long, ret ) + ), + + TP_fast_assign( + __entry->id = syscall_get_nr(current, regs); + __entry->ret = ret; + ), + + TP_printk("NR %ld = %ld", + __entry->id, __entry->ret), + + syscall_regfunc, syscall_unregfunc +); + +#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ + +#endif /* _TRACE_EVENTS_SYSCALLS_H */ + +/* This part must be outside protection */ +#include + diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 4e194300185..5dc283ba5ae 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -8,23 +8,6 @@ #include -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - -extern void syscall_regfunc(void); -extern void syscall_unregfunc(void); - -DECLARE_TRACE(syscall_enter, - TP_PROTO(struct pt_regs *regs, long id), - TP_ARGS(regs, id) -); - -DECLARE_TRACE(syscall_exit, - TP_PROTO(struct pt_regs *regs, long ret), - TP_ARGS(regs, ret) -); - -#endif - /* * A syscall entry in the ftrace syscalls array. 
* diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46c1b977a2c..2698fe401eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -286,7 +287,7 @@ int reg_event_syscall_enter(void *ptr) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) - ret = register_trace_syscall_enter(ftrace_syscall_enter); + ret = register_trace_sys_enter(ftrace_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -311,7 +312,7 @@ void unreg_event_syscall_enter(void *ptr) sys_refcount_enter--; clear_bit(num, enabled_enter_syscalls); if (!sys_refcount_enter) - unregister_trace_syscall_enter(ftrace_syscall_enter); + unregister_trace_sys_enter(ftrace_syscall_enter); mutex_unlock(&syscall_trace_lock); } @@ -327,7 +328,7 @@ int reg_event_syscall_exit(void *ptr) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) - ret = register_trace_syscall_exit(ftrace_syscall_exit); + ret = register_trace_sys_exit(ftrace_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); @@ -352,7 +353,7 @@ void unreg_event_syscall_exit(void *ptr) sys_refcount_exit--; clear_bit(num, enabled_exit_syscalls); if (!sys_refcount_exit) - unregister_trace_syscall_exit(ftrace_syscall_exit); + unregister_trace_sys_exit(ftrace_syscall_exit); mutex_unlock(&syscall_trace_lock); } @@ -418,7 +419,7 @@ int reg_prof_syscall_enter(char *name) mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) - ret = register_trace_syscall_enter(prof_syscall_enter); + ret = register_trace_sys_enter(prof_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -442,7 +443,7 @@ void unreg_prof_syscall_enter(char *name) sys_prof_refcount_enter--; clear_bit(num, enabled_prof_enter_syscalls); if (!sys_prof_refcount_enter) - unregister_trace_syscall_enter(prof_syscall_enter); + unregister_trace_sys_enter(prof_syscall_enter); mutex_unlock(&syscall_trace_lock); } @@ -479,7 +480,7 @@ int reg_prof_syscall_exit(char *name) mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) - ret = register_trace_syscall_exit(prof_syscall_exit); + ret = register_trace_sys_exit(prof_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -503,7 +504,7 @@ void unreg_prof_syscall_exit(char *name) sys_prof_refcount_exit--; clear_bit(num, enabled_prof_exit_syscalls); if (!sys_prof_refcount_exit) - unregister_trace_syscall_exit(prof_syscall_exit); + unregister_trace_sys_exit(prof_syscall_exit); mutex_unlock(&syscall_trace_lock); } -- cgit v1.2.3-70-g09d2 From aa38e9fc3ea804290efd3a39316d7f7e6c945800 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 10:33:02 +0800 Subject: tracing/filters: Add filter_type to struct ftrace_event_field The type of a field is stored as a string in @type, and here we add @filter_type which is an enum value. This prepares for later patches, so we can specifically assign different @filter_type for the same @type. For example normally a "char *" field is treated as a ptr, but we may want it to be treated as a string when doing filting. 
Signed-off-by: Li Zefan LKML-Reference: <4A7B925E.9030605@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_events_filter.c | 23 ++++++++++++++--------- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 300ef788c97..64dda5709cb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -755,6 +755,7 @@ struct ftrace_event_field { struct list_head link; char *name; char *type; + int filter_type; int offset; int size; int is_signed; @@ -800,6 +801,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); +extern int filter_assign_type(const char *type); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 79d352027a6..5740e90f4ca 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -44,9 +44,11 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, if (!field->type) goto err; + field->filter_type = filter_assign_type(type); field->offset = offset; field->size = size; field->is_signed = is_signed; + list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 490337abed7..22e6d822bba 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -476,11 +476,12 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } enum { - FILTER_STATIC_STRING = 1, - FILTER_DYN_STRING + FILTER_OTHER = 0, + FILTER_STATIC_STRING, + FILTER_DYN_STRING, }; -static int is_string_field(const char *type) +int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; @@ -488,12 +489,18 @@ static int is_string_field(const char *type) if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; - return 0; + return FILTER_OTHER; +} + +static bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING; } static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) return 0; return 1; @@ -550,7 +557,6 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; - int string_type; int ret; pred->fn = filter_pred_none; @@ -578,9 +584,8 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - string_type = is_string_field(field->type); - if (string_type) { - if (string_type == FILTER_STATIC_STRING) + if (is_string_field(field)) { + if (field->filter_type == FILTER_STATIC_STRING) fn = filter_pred_string; else fn = filter_pred_strloc; -- cgit v1.2.3-70-g09d2 From 43b51ead3f752a3935116e5b1a94254b8573734f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 10:33:22 +0800 Subject: tracing/filters: Add __field_ext() to TRACE_EVENT Add __field_ext(), so a field can be assigned to a specific filter_type, which matches a corresponding filter function. 
For example, a later patch will allow this: __field_ext(const char *, str, FILTER_PTR_STR); Signed-off-by: Li Zefan LKML-Reference: <4A7B9272.6050709@cn.fujitsu.com> [ Fixed a -1 to FILTER_OTHER Forward ported to latest kernel. ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 9 ++++++++- include/trace/ftrace.h | 31 ++++++++++++++++++++++++------- kernel/trace/trace_events.c | 11 ++++++++--- kernel/trace/trace_events_filter.c | 6 ------ kernel/trace/trace_export.c | 8 +++++--- kernel/trace/trace_syscalls.c | 6 ++++-- 6 files changed, 49 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index df5b085c415..0440bea8f6b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -140,9 +140,16 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event); +enum { + FILTER_OTHER = 0, + FILTER_STATIC_STRING, + FILTER_DYN_STRING, +}; + extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, - int offset, int size, int is_signed); + int offset, int size, int is_signed, + int filter_type); extern int trace_define_common_fields(struct ftrace_event_call *call); #define is_signed_type(type) (((type)(-1)) < 0) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 127400255e4..1b1f742a604 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -21,6 +21,9 @@ #undef __field #define __field(type, item) type item; +#undef __field_ext +#define __field_ext(type, item, filter_type) type item; + #undef __array #define __array(type, item, len) type item[len]; @@ -62,7 +65,10 @@ */ #undef __field -#define __field(type, item); +#define __field(type, item) + +#undef __field_ext +#define __field_ext(type, item, filter_type) #undef __array #define __array(type, item, len) @@ -110,6 +116,9 @@ if (!ret) \ return 0; +#undef __field_ext +#define __field_ext(type, item, filter_type) __field(type, item) + #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ @@ -265,28 +274,33 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef __field -#define __field(type, item) \ +#undef __field_ext +#define __field_ext(type, item, filter_type) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; +#undef __field +#define __field(type, item) __field_ext(type, item, FILTER_OTHER) + #undef __array #define __array(type, item, len) \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0); \ + sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; #undef __dynamic_array #define __dynamic_array(type, item, len) \ ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \ - offsetof(typeof(field), __data_loc_##item), \ - sizeof(field.__data_loc_##item), 0); + offsetof(typeof(field), __data_loc_##item), \ + sizeof(field.__data_loc_##item), 0, \ + FILTER_OTHER); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -320,6 +334,9 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #undef __field #define __field(type, item) +#undef __field_ext 
+#define __field_ext(type, item, filter_type) + #undef __array #define __array(type, item, len) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5740e90f4ca..d33bcdeffe6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,7 +28,8 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed) + const char *name, int offset, int size, int is_signed, + int filter_type) { struct ftrace_event_field *field; @@ -44,7 +45,11 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, if (!field->type) goto err; - field->filter_type = filter_assign_type(type); + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + field->offset = offset; field->size = size; field->is_signed = is_signed; @@ -68,7 +73,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); ret = trace_define_field(call, #type, "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type)); \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 22e6d822bba..8a8e576733f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -475,12 +475,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } -enum { - FILTER_OTHER = 0, - FILTER_STATIC_STRING, - FILTER_DYN_STRING, -}; - int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 70875303ae4..029a91f4228 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -158,7 +158,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -166,7 +167,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0); \ + sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; @@ -174,7 +175,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed); \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ if (ret) \ return ret; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46c1b977a2c..97a2454760b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -194,7 +194,8 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, - sizeof(unsigned long), 0); + sizeof(unsigned long), 0, + FILTER_OTHER); offset += sizeof(unsigned long); } @@ -210,7 +211,8 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0); + ret = 
trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + FILTER_OTHER); return ret; } -- cgit v1.2.3-70-g09d2 From 87a342f5db69d53ea70493bb1ec69c9047677038 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 10:33:43 +0800 Subject: tracing/filters: Support filtering for char * strings Usually, char * entries are dangerous in traces because the string can be released whereas a pointer to it can still wait to be read from the ring buffer. But sometimes we can assume it's safe, like in case of RO data (eg: __file__ or __line__, used in bkl trace event). If these RO data are in a module and so is the call to the trace event, then it's safe, because the ring buffer will be flushed once this module get unloaded. To allow char * to be treated as a string: TRACE_EVENT(..., TP_STRUCT__entry( __field_ext(const char *, name, FILTER_PTR_STRING) ... ) ... ); The filtering will not dereference "char *" unless the developer explicitly sets FILTER_PTR_STR in __field_ext. Signed-off-by: Li Zefan LKML-Reference: <4A7B9287.90205@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_events_filter.c | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0440bea8f6b..ace2da9e0a0 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -144,6 +144,7 @@ enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, + FILTER_PTR_STRING, }; extern int trace_define_field(struct ftrace_event_call *call, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a8e576733f..9f03082c81d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -163,6 +163,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event, + int val1, int val2) +{ + char **addr = (char **)(event + pred->offset); + int cmp, match; + + cmp = strncmp(*addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + /* * Filter predicate for dynamic sized arrays of characters. 
* These are implemented through a list of strings at the end @@ -489,7 +503,8 @@ int filter_assign_type(const char *type) static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING; + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; } static int is_legal_op(struct ftrace_event_field *field, int op) @@ -579,11 +594,16 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { + pred->str_len = field->size; + if (field->filter_type == FILTER_STATIC_STRING) fn = filter_pred_string; - else + else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - pred->str_len = field->size; + else { + fn = filter_pred_pchar; + pred->str_len = strlen(pred->str_val); + } } else { if (field->is_signed) ret = strict_strtoll(pred->str_val, 0, &val); -- cgit v1.2.3-70-g09d2 From 5079f3261ffd7fe4a537679af695f2328943a245 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 25 Aug 2009 16:12:56 +0800 Subject: ftrace: Move setting of clock-source out of options There are many clock sources for the tracing system but we can only enable/disable one at a time with the trace/options file. We can move the setting of clock-source out of options and add a separate file for it: # cat trace_clock [local] global # echo global > trace_clock # cat trace_clock local [global] Signed-off-by: Zhao Lei LKML-Reference: <4A939D08.6050604@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 7 ++-- 2 files changed, 79 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ac204360a3..63dbc7ff213 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -323,12 +323,21 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", - "global-clock", "sleep-time", "graph-time", NULL }; +static struct { + u64 (*func)(void); + const char *name; +} trace_clocks[] = { + { trace_clock_local, "local" }, + { trace_clock_global, "global" }, +}; + +int trace_clock_id; + /* * ftrace_max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are @@ -2159,22 +2168,6 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; - - if (mask == TRACE_ITER_GLOBAL_CLK) { - u64 (*func)(void); - - if (enabled) - func = trace_clock_global; - else - func = trace_clock_local; - - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, func); - - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, func); - mutex_unlock(&trace_types_lock); - } } static ssize_t @@ -3142,6 +3135,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int bufiter = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, + "%s%s%s%s", i ? " " : "", + i == trace_clock_id ? "[" : "", trace_clocks[i].name, + i == trace_clock_id ? 
"]" : ""); + bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char buf[64]; + const char *clockstr; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + trace_clock_id = i; + + mutex_lock(&trace_types_lock); + + ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + + mutex_unlock(&trace_types_lock); + + *fpos += cnt; + + return cnt; +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3179,6 +3228,12 @@ static const struct file_operations tracing_mark_fops = { .write = tracing_mark_write, }; +static const struct file_operations trace_clock_fops = { + .open = tracing_open_generic, + .read = tracing_clock_read, + .write = tracing_clock_write, +}; + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -3918,6 +3973,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("trace_clock", 0644, d_tracer, NULL, + &trace_clock_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64dda5709cb..654fd657bd0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -568,6 +568,8 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; +extern int trace_clock_id; + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); @@ -656,9 +658,8 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK_MSGONLY = 0x10000, TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_GLOBAL_CLK = 0x80000, - TRACE_ITER_SLEEP_TIME = 0x100000, - TRACE_ITER_GRAPH_TIME = 0x200000, + TRACE_ITER_SLEEP_TIME = 0x80000, + TRACE_ITER_GRAPH_TIME = 0x100000, }; /* -- cgit v1.2.3-70-g09d2 From 90cba64a5f672a239f43ec5cb9a11b806887331e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 25 Aug 2009 14:35:41 -0700 Subject: timer.c: Fix S/390 comments Fix typos and add omitted words. Signed-off-by: Randy Dunlap Cc: akpm Cc: linux390@de.ibm.com Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <20090825143541.43fc2ed8.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 33fc9d175f4..8e92be654da 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1023,8 +1023,8 @@ static inline void __run_timers(struct tvec_base *base) #ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a cpus is idle. - * This functions needs to be called disabled. 
+ * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. */ static unsigned long __next_timer_interrupt(struct tvec_base *base) { -- cgit v1.2.3-70-g09d2 From cd0980fc8add25e8ab12fcf1051c0f20cbc7c0c0 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 25 Aug 2009 14:50:27 +0200 Subject: tracing: Check invalid syscall nr while tracing syscalls Most arch syscall_get_nr() implementations returns -1 if the syscall number is not valid. Accessing the bit field without a check might result in a kernel oops (at least I saw it on s390 for ftrace selftest). Before this change, this problem did not occur, because the invalid syscall number (-1) caused syscall_nr_to_meta() to return NULL. There are at least two scenarios where syscall_get_nr() can return -1: 1. For example, ptrace stores an invalid syscall number, and thus, tracing code resets it. (see do_syscall_trace_enter in arch/s390/kernel/ptrace.c) 2. The syscall_regfunc() (kernel/tracepoint.c) sets the TIF_SYSCALL_FTRACE (now: TIF_SYSCALL_TRACEPOINT) flag for all threads which include kernel threads. However, the ftrace selftest triggers a kernel oops when testing syscall trace points: - The kernel thread is started as ususal (do_fork()), - tracing code sets TIF_SYSCALL_FTRACE, - the ret_from_fork() function is triggered and starts ftrace_syscall_exit() with an invalid syscall number. To avoid these scenarios, I suggest to check the syscall_nr. For instance, the ftrace selftest fails for s390 (with config option CONFIG_FTRACE_SYSCALLS set) and produces the following kernel oops. Unable to handle kernel pointer dereference at virtual kernel address 2000000000 Oops: 0038 [#1] PREEMPT SMP Modules linked in: CPU: 0 Not tainted 2.6.31-rc6-next-20090819-dirty #18 Process kthreadd (pid: 818, task: 000000003ea207e8, ksp: 000000003e813eb8) Krnl PSW : 0704100180000000 00000000000ea54c (ftrace_syscall_exit+0x58/0xdc) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 EA:3 Krnl GPRS: 0000000000000000 00000000000e0000 ffffffffffffffff 20000000008c2650 0000000000000007 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 ffffffffffffffff 000000003e813d78 000000003e813f58 0000000000505ba8 000000003e813e18 000000003e813d78 Krnl Code: 00000000000ea540: e330d0000008 ag %r3,0(%r13) 00000000000ea546: a7480007 lhi %r4,7 00000000000ea54a: 1442 nr %r4,%r2 >00000000000ea54c: e31030000090 llgc %r1,0(%r3) 00000000000ea552: 5410d008 n %r1,8(%r13) 00000000000ea556: 8a104000 sra %r1,0(%r4) 00000000000ea55a: 5410d00c n %r1,12(%r13) 00000000000ea55e: 1211 ltr %r1,%r1 Call Trace: ([<0000000000000000>] 0x0) [<000000000001fa22>] do_syscall_trace_exit+0x132/0x18c [<000000000002d0c4>] sysc_return+0x0/0x8 [<000000000001c738>] kernel_thread_starter+0x0/0xc Last Breaking-Event-Address: [<00000000000ea51e>] ftrace_syscall_exit+0x2a/0xdc Signed-off-by: Hendrik Brueckner Acked-by: Heiko Carstens Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Martin Schwidefsky Cc: Paul Mundt LKML-Reference: <20090825125027.GE4639@cetus.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 85291c4de40..cb7f600cb02 100644 --- a/kernel/trace/trace_syscalls.c +++ 
b/kernel/trace/trace_syscalls.c @@ -227,6 +227,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_enter_syscalls)) return; @@ -257,6 +259,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_exit_syscalls)) return; -- cgit v1.2.3-70-g09d2 From cc3b13c11c567c69a6356be98d0c03ff11541d5c Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 25 Aug 2009 18:02:37 +0200 Subject: tracing: Don't trace kernel thread syscalls Kernel threads don't call syscalls using the sysenter/sysexit path. Instead they directly call the sys_* or do_* functions that implement the syscalls inside the kernel. The current syscall tracepoints only bind the sysenter/sysexit path, then it has no effect to trace the kernel thread calls to syscalls in that path. Setting the TIF_SYSCALL_TRACEPOINT flag is then useless for these. Actually there is only one case when a kernel thread can reach the usual syscall exit tracing path: when we create a kernel thread, the child comes to ret_from_fork and is the fork() return is then traced. But this information alone is useless, then we don't want to set the TIF flags for these threads. Kernel threads have task_struct->mm set to NULL. (Thanks to Heiko for that hint ;-) The idea is then to check the mm field in syscall_regfunc() and set the flag accordingly. Signed-off-by: Hendrik Brueckner Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Heiko Carstens Cc: Hendrik Brueckner LKML-Reference: <20090825160237.GG4639@cetus.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- kernel/tracepoint.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1a6a453b7ef..9489a0a9b1b 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -597,7 +597,9 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + /* Skip kernel threads. */ + if (t->mm) + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit v1.2.3-70-g09d2 From 57421dbbdc932d65f0e6a41ebb027a2bfe3d0669 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 24 Aug 2009 17:40:22 -0400 Subject: tracing: Convert event tracing code to use NR_syscalls Convert the syscalls event tracing code to use NR_syscalls, instead of FTRACE_SYSCALL_MAX. NR_syscalls is standard accross most arches, and reduces code confusion/complexity. Signed-off-by: Jason Baron Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Josh Stone Cc: Thomas Gleixner Cc: H. 
Peter Anwin Cc: Hendrik Brueckner Cc: Heiko Carstens LKML-Reference: <9b4f1a84ecae57cc6599412772efa36f0d2b815b.1251146513.git.jbaron@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 8 ++++---- kernel/trace/trace_syscalls.c | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3cff1214e17..9dbb527e165 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -494,7 +494,7 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) struct syscall_metadata *syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; @@ -507,7 +507,7 @@ int syscall_name_to_nr(char *name) if (!syscalls_metadata) return -1; - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { if (syscalls_metadata[i]) { if (!strcmp(syscalls_metadata[i]->name, name)) return i; @@ -533,13 +533,13 @@ static int __init arch_init_ftrace_syscalls(void) unsigned long **psys_syscall_table = &sys_call_table; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - FTRACE_SYSCALL_MAX, GFP_KERNEL); + NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; } - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb7f600cb02..4f5fae6fad9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -11,8 +11,8 @@ static DEFINE_MUTEX(syscall_trace_lock); static int sys_refcount_enter; static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); -static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) @@ -289,7 +289,7 @@ int reg_event_syscall_enter(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -312,7 +312,7 @@ void unreg_event_syscall_enter(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -330,7 +330,7 @@ int reg_event_syscall_exit(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -353,7 +353,7 @@ void unreg_event_syscall_exit(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; @@ -373,8 +373,8 @@ struct trace_event event_syscall_exit = { #ifdef CONFIG_EVENT_PROFILE -static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); +static 
DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); static int sys_prof_refcount_enter; static int sys_prof_refcount_exit; @@ -420,7 +420,7 @@ int reg_prof_syscall_enter(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -442,7 +442,7 @@ void unreg_prof_syscall_enter(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -481,7 +481,7 @@ int reg_prof_syscall_exit(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -503,7 +503,7 @@ void unreg_prof_syscall_exit(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); -- cgit v1.2.3-70-g09d2 From 31ffe249e5426d2648d68568fa00a7b66666a5db Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 26 Aug 2009 16:32:37 -0700 Subject: net: Temporarily backout SKB sources tracer. Steven Rostedt has suggested that Neil work with the tracing folks, trying to use TRACE_EVENT as the mechanism for implementation. And if that doesn't workout we can investigate other solutions such as that one which was tried here. This reverts the following 2 commits: 5a165657bef7c47e5ff4cd138f7758ef6278e87b ("net: skb ftracer - Add config option to enable new ftracer (v3)") 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb ("net: skb ftracer - Add actual ftrace code to kernel (v3)") Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 11 --- kernel/trace/Makefile | 1 - kernel/trace/trace.h | 19 ----- kernel/trace/trace_skb_sources.c | 154 --------------------------------------- 4 files changed, 185 deletions(-) delete mode 100644 kernel/trace/trace_skb_sources.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9a2f19bf051..019f380fd76 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -234,17 +234,6 @@ config BOOT_TRACER You must pass in initcall_debug and ftrace=initcall to the kernel command line to enable this on bootup. -config SKB_SOURCES_TRACER - bool "Trace skb source information" - depends on NET - select GENERIC_TRACER - help - This tracer helps developers/sysadmins correlate skb allocation and - consumption. The idea being that some processes will primarily consume data - that was allocated on certain numa nodes. By being able to visualize which - nodes the data was allocated on, a sysadmin or developer can optimize the - scheduling of those processes to cut back on cross node chatter. 
- config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ee5e5b1b928..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif -obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8a6281b2330..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -41,7 +40,6 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, - TRACE_SKB_SOURCE, __TRACE_LAST_TYPE, }; @@ -173,21 +171,6 @@ struct trace_power { struct power_trace state_data; }; -struct skb_record { - pid_t pid; /* pid of the copying process */ - int anid; /* node where skb was allocated */ - int cnid; /* node to which skb was copied in userspace */ - char ifname[IFNAMSIZ]; /* Name of the receiving interface */ - int rx_queue; /* The rx queue the skb was received on */ - int ccpu; /* Cpu the application got this frame from */ - int len; /* length of the data copied */ -}; - -struct trace_skb_event { - struct trace_entry ent; - struct skb_record event_data; -}; - enum kmemtrace_type_id { KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ @@ -340,8 +323,6 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ - IF_ASSIGN(var, ent, struct trace_skb_event, \ - TRACE_SKB_SOURCE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c deleted file mode 100644 index 40eb071afdb..00000000000 --- a/kernel/trace/trace_skb_sources.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * ring buffer based tracer for analyzing per-socket skb sources - * - * Neil Horman - * Copyright (C) 2009 - * - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec); - -static struct trace_array *skb_trace; -static int __read_mostly trace_skb_source_enabled; - -static void probe_skb_dequeue(const struct sk_buff *skb, int len) -{ - struct ring_buffer_event *event; - struct trace_skb_event *entry; - struct trace_array *tr = skb_trace; - struct net_device *dev; - - if (!trace_skb_source_enabled) - return; - - if (in_interrupt()) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->event_data.pid = current->pid; - entry->event_data.anid = page_to_nid(virt_to_page(skb->data)); - entry->event_data.cnid = cpu_to_node(smp_processor_id()); - entry->event_data.len = len; - entry->event_data.rx_queue = skb->queue_mapping; - entry->event_data.ccpu = smp_processor_id(); - - dev = dev_get_by_index(sock_net(skb->sk), skb->iif); - if (dev) { - memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ); - dev_put(dev); - } else { - strcpy(entry->event_data.ifname, "Unknown"); - } - - trace_buffer_unlock_commit(tr, event, 0, 0); -} - -static int tracing_skb_source_register(void) -{ - int ret; - - ret = 
register_trace_skb_copy_datagram_iovec(probe_skb_dequeue); - if (ret) - pr_info("skb source trace: Couldn't activate dequeue tracepoint"); - - return ret; -} - -static void start_skb_source_trace(struct trace_array *tr) -{ - trace_skb_source_enabled = 1; -} - -static void stop_skb_source_trace(struct trace_array *tr) -{ - trace_skb_source_enabled = 0; -} - -static void skb_source_trace_reset(struct trace_array *tr) -{ - trace_skb_source_enabled = 0; - unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue); -} - - -static int skb_source_trace_init(struct trace_array *tr) -{ - int cpu; - skb_trace = tr; - - trace_skb_source_enabled = 1; - tracing_skb_source_register(); - - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); - return 0; -} - -static enum print_line_t skb_source_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_skb_event *event; - struct skb_record *record; - struct trace_seq *s = &iter->seq; - - trace_assign_type(event, entry); - record = &event->event_data; - if (entry->type != TRACE_SKB_SOURCE) - return TRACE_TYPE_UNHANDLED; - - ret = trace_seq_printf(s, " %d %d %d %s %d %d %d\n", - record->pid, - record->anid, - record->cnid, - record->ifname, - record->rx_queue, - record->ccpu, - record->len); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static void skb_source_print_header(struct seq_file *s) -{ - seq_puts(s, "# PID ANID CNID IFC RXQ CCPU LEN\n"); - seq_puts(s, "# | | | | | | |\n"); -} - -static struct tracer skb_source_tracer __read_mostly = -{ - .name = "skb_sources", - .init = skb_source_trace_init, - .start = start_skb_source_trace, - .stop = stop_skb_source_trace, - .reset = skb_source_trace_reset, - .print_line = skb_source_print_line, - .print_header = skb_source_print_header, -}; - -static int init_skb_source_trace(void) -{ - return register_tracer(&skb_source_tracer); -} -device_initcall(init_skb_source_trace); -- cgit v1.2.3-70-g09d2 From 4ab6c08336535f8c8e42cf45d7adeda882eff06e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Aug 2009 14:29:24 -0700 Subject: clone(): fix race between copy_process() and de_thread() Spotted by Hiroshi Shimamoto who also provided the test-case below. copy_process() uses signal->count as a reference counter, but it is not. This test case #include #include #include #include #include #include void *null_thread(void *p) { for (;;) sleep(1); return NULL; } void *exec_thread(void *p) { execl("/bin/true", "/bin/true", NULL); return null_thread(p); } int main(int argc, char **argv) { for (;;) { pid_t pid; int ret, status; pid = fork(); if (pid < 0) break; if (!pid) { pthread_t tid; pthread_create(&tid, NULL, exec_thread, NULL); for (;;) pthread_create(&tid, NULL, null_thread, NULL); } do { ret = waitpid(pid, &status, 0); } while (ret == -1 && errno == EINTR); } return 0; } quickly creates an unkillable task. If copy_process(CLONE_THREAD) races with de_thread() copy_signal()->atomic(signal->count) breaks the signal->notify_count logic, and the execing thread can hang forever in kernel space. Change copy_process() to increment count/live only when we know for sure we can't fail. In this case the forked thread will take care of its reference to signal correctly. If copy_process() fails, check CLONE_THREAD flag. If it it set - do nothing, the counters were not changed and current belongs to the same thread group. 
If it is not set, ->signal must be released in any case (and ->count must be == 1), the forked child is the only thread in the thread group. We need more cleanups here, in particular signal->count should not be used by de_thread/__exit_signal at all. This patch only fixes the bug. Reported-by: Hiroshi Shimamoto Tested-by: Hiroshi Shimamoto Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 144326b7af5..e6c04d462ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -815,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; - if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); + if (clone_flags & CLONE_THREAD) return 0; - } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -877,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig) kmem_cache_free(signal_cachep, sig); } -static void cleanup_signal(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - - atomic_dec(&sig->live); - - if (atomic_dec_and_test(&sig->count)) - __cleanup_signal(sig); -} - static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -1239,6 +1226,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } @@ -1282,7 +1271,8 @@ bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: - cleanup_signal(p); + if (!(clone_flags & CLONE_THREAD)) + __cleanup_signal(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: -- cgit v1.2.3-70-g09d2 From 4dbc9ca219b0f294332e734528f7b82211700170 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Aug 2009 09:38:49 +0200 Subject: genirq: Do not mask oneshot edge type interrupts Masking oneshot edge type interrupts is wrong as we might lose an interrupt which is issued when the threaded handler is handling the device. We can keep the irq unmasked safely as with edge type interrupts there is no danger of interrupt floods. If the threaded handler has not yet finished then IRQTF_RUNTHREAD is set which will keep the handler thread active. Debugged and verified in preempt-rt. 
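The path affected is the common case of a threaded handler on an edge-triggered line; a request along these lines (purely illustrative, the names are made up) ends up in handle_edge_irq() with IRQ_ONESHOT set:

	ret = request_threaded_irq(irq, NULL, my_thread_fn,
				   IRQF_TRIGGER_RISING | IRQF_ONESHOT,
				   "mydev", dev);

With the masking removed, the edge flow just ack()s and leaves the line unmasked while my_thread_fn() runs; a further edge sets IRQTF_RUNTHREAD and keeps the handler thread iterating instead of being lost behind a masked line.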
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5765aad9499..c1660194d11 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -548,13 +548,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (unlikely(desc->status & IRQ_ONESHOT)) { - desc->status |= IRQ_MASKED; - mask_ack_irq(desc, irq); - } else { - if (desc->chip->ack) - desc->chip->ack(irq); - } + if (desc->chip->ack) + desc->chip->ack(irq); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; -- cgit v1.2.3-70-g09d2 From 7d1d16e416e61aeef8655d542f8e4a4fc6e808e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 26 Aug 2009 22:02:54 +0930 Subject: module: fix BUG_ON() for powerpc (and other function descriptor archs) The rarely-used symbol_put_addr() needs to use dereference_function_descriptor on powerpc. Reported-by: Paul Mackerras Signed-off-by: Rusty Russell --- kernel/module.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index fd141140355..07c80e68a6c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -909,16 +909,18 @@ void __symbol_put(const char *symbol) } EXPORT_SYMBOL(__symbol_put); +/* Note this assumes addr is a function, which it currently always is. */ void symbol_put_addr(void *addr) { struct module *modaddr; + unsigned long a = (unsigned long)dereference_function_descriptor(addr); - if (core_kernel_text((unsigned long)addr)) + if (core_kernel_text(a)) return; /* module_text_address is safe here: we're supposed to have reference * to module from symbol_get, so it can't go away. */ - modaddr = __module_text_address((unsigned long)addr); + modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); } -- cgit v1.2.3-70-g09d2 From 1b364bf438cf337a3818aee77d68c0713f3e1fc4 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 26 Aug 2009 22:04:12 +0930 Subject: module: workaround duplicate section names The root cause is a duplicate section name (.text); is this legal? [ Amerigo Wang: "AFAIK, yes." 
] However, there's a problem with commit 6d76013381ed28979cd122eb4b249a88b5e384fa in that if you fail to allocate a mod->sect_attrs (in this case it's null because of the duplication), it still gets used without checking in add_notes_attrs() This should fix it [ This patch leaves other problems, particularly the sections directory, but recent parisc toolchains seem to produce these modules and this prevents a crash and is a minimal change -- RR ] Signed-off-by: James Bottomley Signed-off-by: Rusty Russell Tested-by: Helge Deller Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 07c80e68a6c..eccb561dd8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2355,7 +2355,8 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + if (mod->sect_attrs) + add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); /* Get rid of temporary copy */ vfree(hdr); -- cgit v1.2.3-70-g09d2 From c0729be99cb2b9d9749256254f1c40a801835896 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Aug 2009 22:23:52 -0400 Subject: tracing: remove legacy select of MARKERS by context switch tracing The context switch tracer was made before tracepoints were mature, and the original version used markers. This is no longer true and this patch removes the select. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 06be85a7ef8..163fbfc2f39 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -60,7 +60,6 @@ config EVENT_TRACING bool config CONTEXT_SWITCH_TRACER - select MARKERS bool # All tracer options should select GENERIC_TRACER. For those options that are -- cgit v1.2.3-70-g09d2 From 5d4a9dba2d7fbab69f00dedd430d1788834a055a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Aug 2009 16:52:21 -0400 Subject: tracing: only show tracing_max_latency when latency tracer configured The tracing_max_latency file should only be present when one of the latency tracers ({preempt|irqs}off, wakeup*) are enabled. This patch also removes tracing_thresh when latency tracers are not enabled, as well as compiles out code that is only used for latency tracers. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++------------------------ kernel/trace/trace.h | 2 ++ 2 files changed, 52 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 63dbc7ff213..0f0881676dc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,9 +43,6 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -338,45 +335,6 @@ static struct { int trace_clock_id; -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. 
- * - * This is defined as a raw_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct trace_array_cpu *data = tr->data[cpu]; - - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; - - data = max_tr.data[cpu]; - data->saved_latency = tracing_max_latency; - - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); - data->pid = tsk->pid; - data->uid = task_uid(tsk); - data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - data->policy = tsk->policy; - data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(tsk); -} - ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -420,6 +378,53 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ +static raw_spinlock_t ftrace_max_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_TRACER_MAX_TRACE +unsigned long __read_mostly tracing_max_latency; +unsigned long __read_mostly tracing_thresh; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = task_uid(tsk); + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); +} + /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -476,6 +481,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); } +#endif /* CONFIG_TRACER_MAX_TRACE */ /** * register_tracer - register a tracer with the ftrace system. 
@@ -3952,11 +3958,13 @@ static __init int tracer_init_debugfs(void) trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); +#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 654fd657bd0..e2c06b21dd8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -473,12 +473,14 @@ void unregister_tracer(struct tracer *type); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +#ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); +#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, -- cgit v1.2.3-70-g09d2 From 34d76c41554a05425613d16efebb3069c4c545f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Aug 2009 13:08:56 +0200 Subject: sched: Fix division by zero - really When re-computing the shares for each task group's cpu representation we need the ratio of weight on each cpu vs the total weight of the sched domain. Since load-balancing is loosely (read not) synchronized, the weight of individual cpus can change between doing the sum and calculating the ratio. The previous patch dealt with only one of the race scenarios, this patch side steps them all by saving a snapshot of all the individual cpu weights, thereby always working on a consistent set. Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: jes@sgi.com Cc: jens.axboe@oracle.com Cc: Balbir Singh Cc: Arjan van de Ven Cc: Yinghai Lu LKML-Reference: <1251371336.18584.77.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8f8a98eab9d..523e20a6269 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1515,30 +1515,29 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED +struct update_shares_data { + unsigned long rq_weight[NR_CPUS]; +}; + +static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. 
*/ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight, - unsigned long sd_eff_weight) +static void update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, + unsigned long sd_rq_weight, + struct update_shares_data *usd) { - unsigned long rq_weight; - unsigned long shares; + unsigned long shares, rq_weight; int boost = 0; - if (!tg->se[cpu]) - return; - - rq_weight = tg->cfs_rq[cpu]->rq_weight; + rq_weight = usd->rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; - if (sd_rq_weight == sd_eff_weight) - sd_eff_weight += NICE_0_LOAD; - sd_rq_weight = sd_eff_weight; } /* @@ -1555,6 +1554,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); + tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1568,25 +1568,31 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, eff_weight = 0; - unsigned long shares = 0; + unsigned long weight, rq_weight = 0, shares = 0; + struct update_shares_data *usd; struct sched_domain *sd = data; + unsigned long flags; int i; + if (!tg->se[0]) + return 0; + + local_irq_save(flags); + usd = &__get_cpu_var(update_shares_data); + for_each_cpu(i, sched_domain_span(sd)) { + weight = tg->cfs_rq[i]->load.weight; + usd->rq_weight[i] = weight; + /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. */ - weight = tg->cfs_rq[i]->load.weight; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; - if (!weight) weight = NICE_0_LOAD; - eff_weight += weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1597,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight); + update_group_shares_cpu(tg, i, shares, rq_weight, usd); + + local_irq_restore(flags); return 0; } -- cgit v1.2.3-70-g09d2 From 6bb56347f5162d1a7cb1dc461023360781ecd4c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Aug 2009 13:44:53 +0200 Subject: perf_counters: Increase paranoia level Per-cpu counters are an ASLR information leak as they show the execution other tasks do. Increase the paranoia level to 1, which disallows per-cpu counters. (they still allow counting/profiling of own tasks - and admin can profile everything.) 
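With the stricter default, an unprivileged perf_counter_open() asking for a per-cpu counter (cpu != -1) is refused unless the admin lowers the setting again via the existing sysctl, e.g. (procfs path assumed from the sysctl name):

	# cat /proc/sys/kernel/perf_counter_paranoid
	1
	# echo 0 > /proc/sys/kernel/perf_counter_paranoid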
Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f274e195988..7d4bb83b78c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -50,7 +50,7 @@ static atomic_t nr_task_counters __read_mostly; * 1 - disallow cpu counters to unpriv * 2 - disallow kernel profiling to unpriv */ -int sysctl_perf_counter_paranoid __read_mostly; +int sysctl_perf_counter_paranoid __read_mostly = 1; static inline bool perf_paranoid_cpu(void) { -- cgit v1.2.3-70-g09d2 From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Aug 2009 20:25:24 +0200 Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable Martin Schwidefsky analyzed it: To register a clocksource the clocksource_mutex is acquired and if necessary timekeeping_notify is called to install the clocksource as the timekeeper clock. timekeeping_notify uses stop_machine which needs to take cpu_add_remove_lock mutex. Starting a new cpu is done with the cpu_add_remove_lock mutex held. native_cpu_up checks the tsc of the new cpu and if the tsc is no good clocksource_change_rating is called. Which needs the clocksource_mutex and the deadlock is complete. The solution is to replace the TSC via the clocksource watchdog mechanism. Mark the TSC as unstable and schedule the watchdog work so it gets removed in the watchdog thread context. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Martin Schwidefsky Cc: John Stultz --- arch/x86/kernel/tsc.c | 8 +++++--- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 968425422c4..fc3672a303d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9ea40ff26f0..83d2fbd81b9 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); +extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0c86ad6e9f..a0af4ffcb6e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } -static void clocksource_unstable(struct 
clocksource *cs, int64_t delta) +static void __clocksource_unstable(struct clocksource *cs) { - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; schedule_work(&watchdog_work); } +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; -- cgit v1.2.3-70-g09d2 From ea6bff368548d79529421a9dc0710fc5330eb504 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Aug 2009 10:44:56 +0200 Subject: modules: Fix build error in the !CONFIG_KALLSYMS case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > James Bottomley (1): > module: workaround duplicate section names -tip testing found that this patch breaks the build on x86 if CONFIG_KALLSYMS is disabled: kernel/module.c: In function ‘load_module’: kernel/module.c:2367: error: ‘struct module’ has no member named ‘sect_attrs’ distcc[8269] ERROR: compile kernel/module.c on ph/32 failed make[1]: *** [kernel/module.o] Error 1 make: *** [kernel] Error 2 make: *** Waiting for unfinished jobs.... Commit 1b364bf misses the fact that section attributes are only built and dealt with if kallsyms is enabled. The patch below fixes this. ( note, technically speaking this should depend on CONFIG_SYSFS as well but this patch is correct too and keeps the #ifdef less intrusive - in the KALLSYMS && !SYSFS case the code is a NOP. ) Signed-off-by: Ingo Molnar [ Replaced patch with a slightly cleaner variation by James Bottomley ] Signed-off-by: Linus Torvalds --- kernel/module.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index eccb561dd8a..2d537186191 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1274,6 +1274,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, struct module_notes_attrs *notes_attrs; struct bin_attribute *nattr; + /* failed to create section attributes, so can't create notes */ + if (!mod->sect_attrs) + return; + /* Count notes sections and allocate structures. 
*/ notes = 0; for (i = 0; i < nsect; i++) @@ -2355,8 +2359,7 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - if (mod->sect_attrs) - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); /* Get rid of temporary copy */ vfree(hdr); -- cgit v1.2.3-70-g09d2 From 2bc481cf433879f0e6cdd4d899fc21ee05dcea23 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 28 Aug 2009 23:41:29 -0700 Subject: pktgen: spin using hrtimer This changes how the pktgen thread spins/waits between packets if delay is configured. It uses a high res timer to wait for time to arrive. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- kernel/hrtimer.c | 2 ++ net/core/pktgen.c | 49 ++++++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 49da79ab848..05071bf6a37 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -485,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { @@ -1477,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; } +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bede00b5403..3045dd13307 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -131,6 +131,7 @@ #include #include #include +#include #include #include #include @@ -2086,33 +2087,40 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->nflows = 0; } -static inline s64 delta_ns(ktime_t a, ktime_t b) -{ - return ktime_to_ns(ktime_sub(a, b)); -} static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) { - ktime_t start, now; - s64 dt; + ktime_t start; + s32 remaining; + struct hrtimer_sleeper t; - start = now = ktime_now(); + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, spin_until); + + remaining = ktime_to_us(hrtimer_expires_remaining(&t.timer)); + if (remaining <= 0) + return; - while ((dt = delta_ns(spin_until, now)) > 0) { - /* TODO: optimize sleeping behavior */ - if (dt > TICK_NSEC) - schedule_timeout_interruptible(1); - else if (dt > 100*NSEC_PER_USEC) { - if (!pkt_dev->running) - return; - if (need_resched()) + start = ktime_now(); + if (remaining < 100) + udelay(remaining); /* really small just spin */ + else { + /* see do_nanosleep */ + hrtimer_init_sleeper(&t, current); + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) schedule(); - } - now = ktime_now(); + hrtimer_cancel(&t.timer); + } while (t.task && pkt_dev->running && !signal_pending(current)); + __set_current_state(TASK_RUNNING); } - - pkt_dev->idle_acc += ktime_to_ns(ktime_sub(now, start)); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), start)); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) @@ -3360,8 +3368,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) int ret; if (pkt_dev->delay) { - if 
(ktime_lt(ktime_now(), pkt_dev->next_tx)) - spin(pkt_dev, pkt_dev->next_tx); + spin(pkt_dev, pkt_dev->next_tx); /* This is max DELAY, this has special meaning of * "never transmit" -- cgit v1.2.3-70-g09d2 From eced1dfcfcf6b0a35e925d73916a9d8e36ab5457 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Aug 2009 17:10:47 +0200 Subject: perf_counter: Fix /0 bug in swcounters We have a race in the swcounter stuff where we can start counting a counter that has never been enabled, this leads to a /0 situation. The below avoids the /0 but doesn't close the race, this would need a new counter state. The race is due to perf_swcounter_is_counting() which cannot discern between disabled due to scheduled out, and disabled for any other reason. Such a crash has been seen by Ingo: [ 967.092372] divide error: 0000 [#1] SMP [ 967.096499] last sysfs file: /sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_map [ 967.104846] CPU 5 [ 967.106965] Modules linked in: [ 967.110169] Pid: 3351, comm: hackbench Not tainted 2.6.31-rc8-tip-01158-gd940a54-dirty #1568 X8DTN [ 967.119456] RIP: 0010:[] [] perf_swcounter_ctx_event+0x127/0x1af [ 967.129137] RSP: 0018:ffff8801a95abd70 EFLAGS: 00010046 [ 967.134699] RAX: 0000000000000002 RBX: ffff8801bd645c00 RCX: 0000000000000002 [ 967.142162] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8801bd645d40 [ 967.149584] RBP: ffff8801a95abdb0 R08: 0000000000000001 R09: ffff8801a95abe00 [ 967.157042] R10: 0000000000000037 R11: ffff8801aa1245f8 R12: ffff8801a95abe00 [ 967.164481] R13: ffff8801a95abe00 R14: ffff8801aa1c0e78 R15: 0000000000000001 [ 967.171953] FS: 0000000000000000(0000) GS:ffffc90000a00000(0063) knlGS:00000000f7f486c0 [ 967.180406] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b [ 967.186374] CR2: 000000004822c0ac CR3: 00000001b19a2000 CR4: 00000000000006e0 [ 967.193770] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 967.201224] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 967.208692] Process hackbench (pid: 3351, threadinfo ffff8801a95aa000, task ffff8801a96b0000) [ 967.217607] Stack: [ 967.219711] 0000000000000000 0000000000000037 0000000200000001 ffffc90000a1107c [ 967.227296] <0> ffff8801a95abe00 0000000000000001 0000000000000001 0000000000000037 [ 967.235333] <0> ffff8801a95abdf0 ffffffff810c0c20 0000000200a14f30 ffff8801a95abe40 [ 967.243532] Call Trace: [ 967.246103] [] do_perf_swcounter_event+0xde/0xec [ 967.252635] [] perf_tpcounter_event+0x79/0x7b [ 967.258957] [] ftrace_profile_sched_switch+0xc0/0xcb [ 967.265791] [] schedule+0x429/0x4c4 [ 967.271156] [] int_careful+0xd/0x14 Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1251472247.17617.74.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7d4bb83b78c..d7cbc579fc8 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4066,6 +4066,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); -- cgit v1.2.3-70-g09d2 From a417887637e862b434b293404f2a31ad1f282a58 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 29 Aug 2009 18:47:59 +0800 Subject: lockdep: Remove recursion stattistics Since lockdep has introduced BFS to avoid recursion, statistics for recursion does 
not make any sense now. So remove them. Signed-off-by: Ming Lei Cc: a.p.zijlstra@chello.nl LKML-Reference: <1251542879-5211-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ---- kernel/lockdep_proc.c | 8 -------- 2 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0843584aed5..f74d2d7aa60 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -399,7 +399,6 @@ unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; unsigned int nr_process_chains; unsigned int max_lockdep_depth; -unsigned int max_recursion_depth; #ifdef CONFIG_DEBUG_LOCKDEP /* @@ -429,11 +428,8 @@ atomic_t redundant_softirqs_on; atomic_t redundant_softirqs_off; atomic_t nr_unused_locks; atomic_t nr_cyclic_checks; -atomic_t nr_cyclic_check_recursions; atomic_t nr_find_usage_forwards_checks; -atomic_t nr_find_usage_forwards_recursions; atomic_t nr_find_usage_backwards_checks; -atomic_t nr_find_usage_backwards_recursions; #endif /* diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 8dac8402e07..ec33c6ad58d 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -199,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(&chain_lookup_hits)); seq_printf(m, " cyclic checks: %11u\n", debug_atomic_read(&nr_cyclic_checks)); - seq_printf(m, " cyclic-check recursions: %11u\n", - debug_atomic_read(&nr_cyclic_check_recursions)); seq_printf(m, " find-mask forwards checks: %11u\n", debug_atomic_read(&nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask forwards recursions: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_recursions)); seq_printf(m, " find-mask backwards checks: %11u\n", debug_atomic_read(&nr_find_usage_backwards_checks)); - seq_printf(m, " find-mask backwards recursions:%11u\n", - debug_atomic_read(&nr_find_usage_backwards_recursions)); seq_printf(m, " hardirq on events: %11u\n", hi1); seq_printf(m, " hardirq off events: %11u\n", hi2); @@ -350,8 +344,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_unused); seq_printf(m, " max locking depth: %11u\n", max_lockdep_depth); - seq_printf(m, " max recursion depth: %11u\n", - max_recursion_depth); #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); -- cgit v1.2.3-70-g09d2 From dd5d19bafd90d33043a4a14b2e2d98612caa293c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Aug 2009 14:58:16 -0700 Subject: rcu: Create rcutree plugins to handle hotplug CPU for multi-level trees When offlining CPUs from a multi-level tree, there is the possibility of offlining the last CPU from a given node when there are preempted RCU read-side critical sections that started life on one of the CPUs on that node. In this case, the corresponding tasks will be enqueued via the task_struct's rcu_node_entry list_head onto one of the rcu_node's blocked_tasks[] lists. These tasks need to be moved somewhere else so that they will prevent the current grace period from ending. That somewhere is the root rcu_node. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <20090827215816.GA30472@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 +- include/linux/sched.h | 4 +-- kernel/rcutree.c | 2 ++ kernel/rcutree_plugin.h | 69 +++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 69 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 79d4baee31b..9e7f2e8fc66 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -98,7 +98,7 @@ extern struct group_info init_groups; #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ - .rcu_blocked_cpu = -1, \ + .rcu_blocked_node = NULL, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), #else #define INIT_TASK_RCU_PREEMPT(tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index bfca26d63b1..3fe03151a8e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1208,7 +1208,7 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; - int rcu_blocked_cpu; + void *rcu_blocked_node; struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ @@ -1735,7 +1735,7 @@ static inline void rcu_copy_process(struct task_struct *p) { p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special = 0; - p->rcu_blocked_cpu = -1; + p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index fee6316a867..d903e2f2b84 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -81,6 +81,7 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); extern long rcu_batches_completed_sched(void); +static struct rcu_node *rcu_get_root(struct rcu_state *rsp); static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags); static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); @@ -876,6 +877,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } + rcu_preempt_offline_tasks(rsp, rnp); mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 201334cdc20..04343bee646 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -92,7 +92,7 @@ static void rcu_preempt_qs(int cpu) rnp = rdp->mynode; spin_lock(&rnp->lock); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - t->rcu_blocked_cpu = cpu; + t->rcu_blocked_node = (void *)rnp; /* * If this CPU has already checked in, then this task @@ -170,12 +170,21 @@ static void rcu_read_unlock_special(struct task_struct *t) if (special & RCU_READ_UNLOCK_BLOCKED) { t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - /* Remove this task from the list it blocked on. */ - rnp = rcu_preempt_state.rda[t->rcu_blocked_cpu]->mynode; - spin_lock(&rnp->lock); + /* + * Remove this task from the list it blocked on. The + * task can migrate while we acquire the lock, but at + * most one time. So at most two passes through loop. 
+ */ + for (;;) { + rnp = (struct rcu_node *)t->rcu_blocked_node; + spin_lock(&rnp->lock); + if (rnp == (struct rcu_node *)t->rcu_blocked_node) + break; + spin_unlock(&rnp->lock); + } empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); list_del_init(&t->rcu_node_entry); - t->rcu_blocked_cpu = -1; + t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if @@ -261,6 +270,47 @@ static int rcu_preempted_readers(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU +/* + * Handle tasklist migration for case in which all CPUs covered by the + * specified rcu_node have gone offline. Move them up to the root + * rcu_node. The reason for not just moving them to the immediate + * parent is to remove the need for rcu_read_unlock_special() to + * make more than two attempts to acquire the target rcu_node's lock. + * + * The caller must hold rnp->lock with irqs disabled. + */ +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + int i; + struct list_head *lp; + struct list_head *lp_root; + struct rcu_node *rnp_root = rcu_get_root(rsp); + struct task_struct *tp; + + if (rnp == rnp_root) + return; /* Shouldn't happen: at least one CPU online. */ + + /* + * Move tasks up to root rcu_node. Rely on the fact that the + * root rcu_node can be at most one ahead of the rest of the + * rcu_nodes in terms of gp_num value. This fact allows us to + * move the blocked_tasks[] array directly, element by element. + */ + for (i = 0; i < 2; i++) { + lp = &rnp->blocked_tasks[i]; + lp_root = &rnp_root->blocked_tasks[i]; + while (!list_empty(lp)) { + tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); + spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&tp->rcu_node_entry); + tp->rcu_blocked_node = rnp_root; + list_add(&tp->rcu_node_entry, lp_root); + spin_unlock(&rnp_root->lock); /* irqs remain disabled */ + } + } +} + /* * Do CPU-offline processing for preemptable RCU. */ @@ -409,6 +459,15 @@ static int rcu_preempted_readers(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU +/* + * Because preemptable RCU does not exist, it never needs to migrate + * tasks that were blocked within RCU read-side critical sections. + */ +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp) +{ +} + /* * Because preemptable RCU does not exist, it never needs CPU-offline * processing. -- cgit v1.2.3-70-g09d2 From 868489660dabc0c28087cca3dbc1adbbc398c6fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Aug 2009 15:00:12 -0700 Subject: rcu: Changes from reviews: avoid casts, fix/add warnings, improve comments Changes suggested by review comments from Josh Triplett and Mathieu Desnoyers. Signed-off-by: Paul E. 
McKenney Acked-by: Josh Triplett Acked-by: Mathieu Desnoyers Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <20090827220012.GA30525@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +++- kernel/rcutree.c | 13 ++++++------- kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 10 ++++++---- 4 files changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3fe03151a8e..855fd0d3f17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1163,6 +1163,8 @@ struct sched_rt_entity { #endif }; +struct rcu_node; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -1208,7 +1210,7 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; - void *rcu_blocked_node; + struct rcu_node *rcu_blocked_node; struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d903e2f2b84..71bc79791cd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -229,7 +229,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ #ifdef CONFIG_NO_HZ -static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5); /** * rcu_enter_nohz - inform RCU that current CPU is entering nohz @@ -249,7 +248,7 @@ void rcu_enter_nohz(void) rdtp = &__get_cpu_var(rcu_dynticks); rdtp->dynticks++; rdtp->dynticks_nesting--; - WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + WARN_ON_ONCE(rdtp->dynticks & 0x1); local_irq_restore(flags); } @@ -268,7 +267,7 @@ void rcu_exit_nohz(void) rdtp = &__get_cpu_var(rcu_dynticks); rdtp->dynticks++; rdtp->dynticks_nesting++; - WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); local_irq_restore(flags); smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } @@ -287,7 +286,7 @@ void rcu_nmi_enter(void) if (rdtp->dynticks & 0x1) return; rdtp->dynticks_nmi++; - WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); + WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } @@ -306,7 +305,7 @@ void rcu_nmi_exit(void) return; smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ rdtp->dynticks_nmi++; - WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); + WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); } /** @@ -322,7 +321,7 @@ void rcu_irq_enter(void) if (rdtp->dynticks_nesting++) return; rdtp->dynticks++; - WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } @@ -341,7 +340,7 @@ void rcu_irq_exit(void) return; smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ rdtp->dynticks++; - WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + WARN_ON_ONCE(rdtp->dynticks & 0x1); /* If the interrupt queued a callback, get out of dyntick mode. */ if (__get_cpu_var(rcu_sched_data).nxtlist || diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ca560364d8c..bf8a6f9f134 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -81,6 +81,8 @@ struct rcu_dynticks { struct rcu_node { spinlock_t lock; long gpnum; /* Current grace period for this node. 
*/ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ unsigned long qsmaskinit; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 04343bee646..47789369ea5 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -92,7 +92,7 @@ static void rcu_preempt_qs(int cpu) rnp = rdp->mynode; spin_lock(&rnp->lock); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - t->rcu_blocked_node = (void *)rnp; + t->rcu_blocked_node = rnp; /* * If this CPU has already checked in, then this task @@ -176,9 +176,9 @@ static void rcu_read_unlock_special(struct task_struct *t) * most one time. So at most two passes through loop. */ for (;;) { - rnp = (struct rcu_node *)t->rcu_blocked_node; + rnp = t->rcu_blocked_node; spin_lock(&rnp->lock); - if (rnp == (struct rcu_node *)t->rcu_blocked_node) + if (rnp == t->rcu_blocked_node) break; spin_unlock(&rnp->lock); } @@ -288,8 +288,10 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; - if (rnp == rnp_root) + if (rnp == rnp_root) { + WARN_ONCE(1, "Last CPU thought to be offlined?"); return; /* Shouldn't happen: at least one CPU online. */ + } /* * Move tasks up to root rcu_node. Rely on the fact that the -- cgit v1.2.3-70-g09d2 From 84e9dabf6e6a78928c6a1a8ba235c9fb0908d0f8 Mon Sep 17 00:00:00 2001 From: Anirban Sinha Date: Fri, 28 Aug 2009 22:40:43 -0700 Subject: sched: Rename init_cfs_rq => init_tg_cfs_rq ... so that it does not share a common name with a function within the same scope. Signed-off-by: Anirban Sinha LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 523e20a6269..6244d24cafc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user) /* * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. */ struct task_group root_task_group; @@ -318,7 +318,7 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9400,11 +9400,11 @@ void __init sched_init(void) * system cpu resource, based on the weight assigned to root * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of + * (init_tg_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 
*/ init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), + &per_cpu(init_tg_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1, root_task_group.se[i]); -- cgit v1.2.3-70-g09d2 From 372e24b0cb764ec55b4cf3408a95ae40a29e5b96 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 26 Aug 2009 16:20:13 -0700 Subject: irq: Make sure irq_desc for legacy irq get correct node setting when there is no ram on node 0. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes LKML-Reference: <4A95C32D.5040605@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 065205bdd92..a81cf80554d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -161,7 +161,7 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; + node = first_online_node; /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); @@ -172,6 +172,9 @@ int __init early_irq_init(void) for (i = 0; i < legacy_count; i++) { desc[i].irq = i; +#ifdef CONFIG_SMP + desc[i].node = node; +#endif desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); alloc_desc_masks(&desc[i], node, true); -- cgit v1.2.3-70-g09d2 From 8e254c1d183f0225ad21f9049641529e56cce4da Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 31 Aug 2009 16:49:41 +0800 Subject: tracing/filters: Defer pred allocation init_preds() allocates about 5392 bytes of memory (on x86_32) for a TRACE_EVENT. With my config, at system boot total memory occupied is: 5392 * (642 + 15) == 3459KB 642 == cat available_events | wc -l 15 == number of dirs in events/ftrace That's quite a lot, so we'd better defer memory allocation util it's needed, that's when filter is used. 
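The pattern being applied is plain lazy initialization: leave the per-event filter pointer NULL at registration time and allocate it under the same mutex that already serializes filter updates, so the ~5KB of predicate storage is only paid for events that actually get a filter. A minimal user-space sketch of the idea (the struct names, mutex and allocator here are illustrative stand-ins, not the kernel objects):

#include <pthread.h>
#include <stdlib.h>

struct event_filter { int n_preds; /* predicate storage would live here */ };
struct event_call   { struct event_filter *filter; };

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Allocate filter state on first use only; returns 0 on success. */
static int init_preds(struct event_call *call)
{
	if (call->filter)		/* already set up by an earlier filter */
		return 0;
	call->filter = calloc(1, sizeof(*call->filter));
	return call->filter ? 0 : -1;
}

int apply_event_filter(struct event_call *call, const char *filter_string)
{
	int err;

	pthread_mutex_lock(&event_mutex);
	err = init_preds(call);		/* cost deferred to the first filter write */
	if (!err) {
		/* parse filter_string and populate call->filter */
	}
	pthread_mutex_unlock(&event_mutex);
	return err;
}

Readers of the filter (print_event_filter() in the diff below) then have to tolerate a NULL filter pointer, which is what the added "if (filter && ...)" checks are for.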
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi Cc: Masami Hiramatsu LKML-Reference: <4A9B8EA5.6020700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 1 - include/linux/syscalls.h | 2 -- include/trace/ftrace.h | 1 - kernel/trace/trace_events_filter.c | 50 +++++++++++++++++++++++++++++++------- kernel/trace/trace_export.c | 1 - 5 files changed, 41 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index ace2da9e0a0..755480484eb 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -133,7 +133,6 @@ struct ftrace_event_call { #define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 128 -extern int init_preds(struct ftrace_event_call *call); extern void destroy_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern int filter_current_check_discard(struct ftrace_event_call *call, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f124c899555..a8e37821cc6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -177,7 +177,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ event_enter_##sname.id = id; \ set_syscall_enter_id(num, id); \ INIT_LIST_HEAD(&event_enter_##sname.fields); \ - init_preds(&event_enter_##sname); \ return 0; \ } \ TRACE_SYS_ENTER_PROFILE(sname); \ @@ -214,7 +213,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \ event_exit_##sname.id = id; \ set_syscall_exit_id(num, id); \ INIT_LIST_HEAD(&event_exit_##sname.fields); \ - init_preds(&event_exit_##sname); \ return 0; \ } \ TRACE_SYS_EXIT_PROFILE(sname); \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 57c56a998ee..bfbc842600a 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -622,7 +622,6 @@ static int ftrace_raw_init_event_##call(void) \ return -ENODEV; \ event_##call.id = id; \ INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ return 0; \ } \ \ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9f03082c81d..c6b2edfb7fe 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -309,7 +309,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) struct event_filter *filter = call->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -322,7 +322,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, struct event_filter *filter = system->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -390,6 +390,9 @@ void destroy_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; + if (!filter) + return; + for (i = 0; i < MAX_FILTER_PRED; i++) { if (filter->preds[i]) filter_free_pred(filter->preds[i]); @@ -400,7 +403,7 @@ void destroy_preds(struct ftrace_event_call *call) call->filter = NULL; } -int init_preds(struct ftrace_event_call *call) +static int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; struct filter_pred *pred; @@ -410,7 +413,6 @@ int init_preds(struct ftrace_event_call *call) if 
(!call->filter) return -ENOMEM; - call->filter_active = 0; filter->n_preds = 0; filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); @@ -432,7 +434,28 @@ oom: return -ENOMEM; } -EXPORT_SYMBOL_GPL(init_preds); + +static int init_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call; + int err; + + list_for_each_entry(call, &ftrace_events, list) { + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + if (!call->filter) { + err = init_preds(call); + if (err) + return err; + } + } + + return 0; +} enum { FILTER_DISABLE_ALL, @@ -449,6 +472,9 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (!call->define_fields) continue; + if (strcmp(call->system, system->name) != 0) + continue; + if (flag == FILTER_INIT_NO_RESET) { call->filter->no_reset = false; continue; @@ -457,10 +483,8 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) continue; - if (!strcmp(call->system, system->name)) { - filter_disable_preds(call); - remove_filter_string(call->filter); - } + filter_disable_preds(call); + remove_filter_string(call->filter); } } @@ -1094,6 +1118,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); + err = init_preds(call); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); @@ -1139,6 +1167,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + err = init_subsystem_preds(system); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); remove_filter_string(system->filter); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 029a91f4228..df1bf6e48bb 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -135,7 +135,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ return 0; \ } \ -- cgit v1.2.3-70-g09d2 From 69d0ee7377eef808e34ba5542b554ec97244b871 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 31 Aug 2009 14:43:36 +0200 Subject: locking: Move spinlock function bodies to header file Move spinlock function bodies to header file by creating a static inline version of each variant. Use the inline version on the out-of-line code. This shouldn't make any difference besides that the spinlock code can now be used to generate inlined spinlock code. 
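The shape of the change is easy to miss in the bulk of the diff below: each lock operation's body moves into a static inline __-prefixed helper in the header, and the exported out-of-line symbol becomes a one-line wrapper around that helper. A self-contained sketch of the same split on a toy lock (the names and the atomic_flag-based lock are illustrative, not the kernel's types):

/* lock_api.h: the single copy of the body, as a static inline */
#ifndef LOCK_API_H
#define LOCK_API_H

#include <stdatomic.h>

struct my_lock { atomic_flag raw; };

static inline void __my_lock(struct my_lock *lock)
{
	while (atomic_flag_test_and_set_explicit(&lock->raw,
						 memory_order_acquire))
		;	/* spin until the flag is released */
}

static inline void __my_unlock(struct my_lock *lock)
{
	atomic_flag_clear_explicit(&lock->raw, memory_order_release);
}

#endif /* LOCK_API_H */

/* lock.c: the out-of-line, exported entry points become thin wrappers */
void my_lock(struct my_lock *lock)   { __my_lock(lock); }
void my_unlock(struct my_lock *lock) { __my_unlock(lock); }

Behaviour is identical either way; the point of the split is that a follow-up change can decide, per lock variant, whether callers use the inline body directly or keep going through the out-of-line wrapper.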
Signed-off-by: Heiko Carstens Acked-by: Arnd Bergmann Acked-by: Peter Zijlstra Cc: Nick Piggin Cc: Martin Schwidefsky Cc: Horst Hartmann Cc: Christian Ehrhardt Cc: Andrew Morton Cc: Linus Torvalds Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: LKML-Reference: <20090831124417.859022429@de.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 18 +-- include/linux/spinlock_api_smp.h | 263 +++++++++++++++++++++++++++++++++++++++ kernel/spinlock.c | 174 +++++--------------------- 3 files changed, 300 insertions(+), 155 deletions(-) (limited to 'kernel') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 4be57ab0347..da76a06556b 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -143,15 +143,6 @@ static inline void smp_mb__after_lock(void) { smp_mb(); } */ #define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) -/* - * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: - */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) -# include -#else -# include -#endif - #ifdef CONFIG_DEBUG_SPINLOCK extern void _raw_spin_lock(spinlock_t *lock); #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) @@ -380,4 +371,13 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); */ #define spin_can_lock(lock) (!spin_is_locked(lock)) +/* + * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: + */ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +# include +#else +# include +#endif + #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index d79845d034b..6b108f5fb14 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -60,4 +60,267 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); +static inline int __spin_trylock(spinlock_t *lock) +{ + preempt_disable(); + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + preempt_enable(); + return 0; +} + +static inline int __read_trylock(rwlock_t *lock) +{ + preempt_disable(); + if (_raw_read_trylock(lock)) { + rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + preempt_enable(); + return 0; +} + +static inline int __write_trylock(rwlock_t *lock) +{ + preempt_disable(); + if (_raw_write_trylock(lock)) { + rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + preempt_enable(); + return 0; +} + +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) + +static inline void __read_lock(rwlock_t *lock) +{ + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +} + +static inline unsigned long __spin_lock_irqsave(spinlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + /* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_spin_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#ifdef CONFIG_LOCKDEP + LOCK_CONTENDED(lock, 
_raw_spin_trylock, _raw_spin_lock); +#else + _raw_spin_lock_flags(lock, &flags); +#endif + return flags; +} + +static inline void __spin_lock_irq(spinlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} + +static inline void __spin_lock_bh(spinlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} + +static inline unsigned long __read_lock_irqsave(rwlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, + _raw_read_lock_flags, &flags); + return flags; +} + +static inline void __read_lock_irq(rwlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +} + +static inline void __read_lock_bh(rwlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); +} + +static inline unsigned long __write_lock_irqsave(rwlock_t *lock) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, + _raw_write_lock_flags, &flags); + return flags; +} + +static inline void __write_lock_irq(rwlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +} + +static inline void __write_lock_bh(rwlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +} + +static inline void __spin_lock(spinlock_t *lock) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} + +static inline void __write_lock(rwlock_t *lock) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); +} + +#endif /* CONFIG_PREEMPT */ + +static inline void __spin_unlock(spinlock_t *lock) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + preempt_enable(); +} + +static inline void __write_unlock(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + preempt_enable(); +} + +static inline void __read_unlock(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + preempt_enable(); +} + +static inline void __spin_unlock_irqrestore(spinlock_t *lock, + unsigned long flags) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} + +static inline void __spin_unlock_irq(spinlock_t *lock) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + local_irq_enable(); + preempt_enable(); +} + +static inline void __spin_unlock_bh(spinlock_t *lock) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} + +static inline void 
__read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} + +static inline void __read_unlock_irq(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + local_irq_enable(); + preempt_enable(); +} + +static inline void __read_unlock_bh(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} + +static inline void __write_unlock_irqrestore(rwlock_t *lock, + unsigned long flags) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + local_irq_restore(flags); + preempt_enable(); +} + +static inline void __write_unlock_irq(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + local_irq_enable(); + preempt_enable(); +} + +static inline void __write_unlock_bh(rwlock_t *lock) +{ + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} + +static inline int __spin_trylock_bh(spinlock_t *lock) +{ + local_bh_disable(); + preempt_disable(); + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + return 0; +} + #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 7932653c4eb..2c000f5c070 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -23,40 +23,19 @@ int __lockfunc _spin_trylock(spinlock_t *lock) { - preempt_disable(); - if (_raw_spin_trylock(lock)) { - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; + return __spin_trylock(lock); } EXPORT_SYMBOL(_spin_trylock); int __lockfunc _read_trylock(rwlock_t *lock) { - preempt_disable(); - if (_raw_read_trylock(lock)) { - rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; + return __read_trylock(lock); } EXPORT_SYMBOL(_read_trylock); int __lockfunc _write_trylock(rwlock_t *lock) { - preempt_disable(); - if (_raw_write_trylock(lock)) { - rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; + return __write_trylock(lock); } EXPORT_SYMBOL(_write_trylock); @@ -69,129 +48,74 @@ EXPORT_SYMBOL(_write_trylock); void __lockfunc _read_lock(rwlock_t *lock) { - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + __read_lock(lock); } EXPORT_SYMBOL(_read_lock); unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) { - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif - return flags; + return __spin_lock_irqsave(lock); } EXPORT_SYMBOL(_spin_lock_irqsave); void __lockfunc _spin_lock_irq(spinlock_t *lock) { - local_irq_disable(); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - 
LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __spin_lock_irq(lock); } EXPORT_SYMBOL(_spin_lock_irq); void __lockfunc _spin_lock_bh(spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __spin_lock_bh(lock); } EXPORT_SYMBOL(_spin_lock_bh); unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) { - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, - _raw_read_lock_flags, &flags); - return flags; + return __read_lock_irqsave(lock); } EXPORT_SYMBOL(_read_lock_irqsave); void __lockfunc _read_lock_irq(rwlock_t *lock) { - local_irq_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + __read_lock_irq(lock); } EXPORT_SYMBOL(_read_lock_irq); void __lockfunc _read_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + __read_lock_bh(lock); } EXPORT_SYMBOL(_read_lock_bh); unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) { - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, - _raw_write_lock_flags, &flags); - return flags; + return __write_lock_irqsave(lock); } EXPORT_SYMBOL(_write_lock_irqsave); void __lockfunc _write_lock_irq(rwlock_t *lock) { - local_irq_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + __write_lock_irq(lock); } EXPORT_SYMBOL(_write_lock_irq); void __lockfunc _write_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + __write_lock_bh(lock); } EXPORT_SYMBOL(_write_lock_bh); void __lockfunc _spin_lock(spinlock_t *lock) { - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __spin_lock(lock); } - EXPORT_SYMBOL(_spin_lock); void __lockfunc _write_lock(rwlock_t *lock) { - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + __write_lock(lock); } - EXPORT_SYMBOL(_write_lock); #else /* CONFIG_PREEMPT: */ @@ -320,121 +244,79 @@ EXPORT_SYMBOL(_spin_lock_nest_lock); void __lockfunc _spin_unlock(spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - preempt_enable(); + __spin_unlock(lock); } EXPORT_SYMBOL(_spin_unlock); void __lockfunc _write_unlock(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable(); + __write_unlock(lock); } EXPORT_SYMBOL(_write_unlock); void __lockfunc _read_unlock(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable(); + __read_unlock(lock); } EXPORT_SYMBOL(_read_unlock); void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __spin_unlock_irqrestore(lock, flags); } 
EXPORT_SYMBOL(_spin_unlock_irqrestore); void __lockfunc _spin_unlock_irq(spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - local_irq_enable(); - preempt_enable(); + __spin_unlock_irq(lock); } EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __spin_unlock_bh(lock); } EXPORT_SYMBOL(_spin_unlock_bh); void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __read_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_read_unlock_irqrestore); void __lockfunc _read_unlock_irq(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_enable(); - preempt_enable(); + __read_unlock_irq(lock); } EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __read_unlock_bh(lock); } EXPORT_SYMBOL(_read_unlock_bh); void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __write_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_write_unlock_irqrestore); void __lockfunc _write_unlock_irq(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_enable(); - preempt_enable(); + __write_unlock_irq(lock); } EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __write_unlock_bh(lock); } EXPORT_SYMBOL(_write_unlock_bh); int __lockfunc _spin_trylock_bh(spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); - if (_raw_spin_trylock(lock)) { - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); - return 0; + return __spin_trylock_bh(lock); } EXPORT_SYMBOL(_spin_trylock_bh); -- cgit v1.2.3-70-g09d2 From 892a7c67c12da63fa4b51728bbe5b982356a090a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 31 Aug 2009 14:43:37 +0200 Subject: locking: Allow arch-inlined spinlocks This allows an architecture to specify per lock variant if the locking code should be kept out-of-line or inlined. If an architecure wants out-of-line locking code no change is needed. To force inlining of e.g. spin_lock() the line: #define __always_inline__spin_lock needs to be added to arch/<...>/include/asm/spinlock.h If CONFIG_DEBUG_SPINLOCK or CONFIG_GENERIC_LOCKBREAK are defined the per architecture defines are (partly) ignored and still out-of-line spinlock code will be generated. 
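Condensed from the hunks that follow, the opt-in works as a three-part preprocessor convention; for the spin_lock() variant it looks like this (the architecture header line is the only thing an arch maintainer adds):

/* arch/<arch>/include/asm/spinlock.h: the architecture requests inlining */
#define __always_inline__spin_lock

/* include/linux/spinlock_api_smp.h: redirect the API name to the inline body */
#ifdef __always_inline__spin_lock
#define _spin_lock(lock)	__spin_lock(lock)
#endif

/* kernel/spinlock.c: the out-of-line copy is only built when no redirect exists */
#ifndef _spin_lock
void __lockfunc _spin_lock(spinlock_t *lock)
{
	__spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock);
#endif

With CONFIG_DEBUG_SPINLOCK set (or CONFIG_GENERIC_LOCKBREAK, for the lock-acquire variants) the redirecting defines in spinlock_api_smp.h are compiled out, so the out-of-line functions are still generated no matter what the architecture asked for.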
Signed-off-by: Heiko Carstens Acked-by: Peter Zijlstra Cc: Arnd Bergmann Cc: Nick Piggin Cc: Martin Schwidefsky Cc: Horst Hartmann Cc: Christian Ehrhardt Cc: Andrew Morton Cc: Linus Torvalds Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: LKML-Reference: <20090831124418.375299024@de.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/spinlock_api_smp.h | 119 +++++++++++++++++++++++++++++++++++++++ kernel/spinlock.c | 56 ++++++++++++++++++ 2 files changed, 175 insertions(+) (limited to 'kernel') diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 6b108f5fb14..1a411e3fab9 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -60,6 +60,125 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); +#ifndef CONFIG_DEBUG_SPINLOCK +#ifndef CONFIG_GENERIC_LOCKBREAK + +#ifdef __always_inline__spin_lock +#define _spin_lock(lock) __spin_lock(lock) +#endif + +#ifdef __always_inline__read_lock +#define _read_lock(lock) __read_lock(lock) +#endif + +#ifdef __always_inline__write_lock +#define _write_lock(lock) __write_lock(lock) +#endif + +#ifdef __always_inline__spin_lock_bh +#define _spin_lock_bh(lock) __spin_lock_bh(lock) +#endif + +#ifdef __always_inline__read_lock_bh +#define _read_lock_bh(lock) __read_lock_bh(lock) +#endif + +#ifdef __always_inline__write_lock_bh +#define _write_lock_bh(lock) __write_lock_bh(lock) +#endif + +#ifdef __always_inline__spin_lock_irq +#define _spin_lock_irq(lock) __spin_lock_irq(lock) +#endif + +#ifdef __always_inline__read_lock_irq +#define _read_lock_irq(lock) __read_lock_irq(lock) +#endif + +#ifdef __always_inline__write_lock_irq +#define _write_lock_irq(lock) __write_lock_irq(lock) +#endif + +#ifdef __always_inline__spin_lock_irqsave +#define _spin_lock_irqsave(lock) __spin_lock_irqsave(lock) +#endif + +#ifdef __always_inline__read_lock_irqsave +#define _read_lock_irqsave(lock) __read_lock_irqsave(lock) +#endif + +#ifdef __always_inline__write_lock_irqsave +#define _write_lock_irqsave(lock) __write_lock_irqsave(lock) +#endif + +#endif /* !CONFIG_GENERIC_LOCKBREAK */ + +#ifdef __always_inline__spin_trylock +#define _spin_trylock(lock) __spin_trylock(lock) +#endif + +#ifdef __always_inline__read_trylock +#define _read_trylock(lock) __read_trylock(lock) +#endif + +#ifdef __always_inline__write_trylock +#define _write_trylock(lock) __write_trylock(lock) +#endif + +#ifdef __always_inline__spin_trylock_bh +#define _spin_trylock_bh(lock) __spin_trylock_bh(lock) +#endif + +#ifdef __always_inline__spin_unlock +#define _spin_unlock(lock) __spin_unlock(lock) +#endif + +#ifdef __always_inline__read_unlock +#define _read_unlock(lock) __read_unlock(lock) +#endif + +#ifdef __always_inline__write_unlock +#define _write_unlock(lock) __write_unlock(lock) +#endif + +#ifdef __always_inline__spin_unlock_bh +#define _spin_unlock_bh(lock) __spin_unlock_bh(lock) +#endif + +#ifdef __always_inline__read_unlock_bh +#define _read_unlock_bh(lock) __read_unlock_bh(lock) +#endif + +#ifdef __always_inline__write_unlock_bh +#define _write_unlock_bh(lock) __write_unlock_bh(lock) +#endif + +#ifdef __always_inline__spin_unlock_irq +#define _spin_unlock_irq(lock) __spin_unlock_irq(lock) +#endif + +#ifdef __always_inline__read_unlock_irq +#define _read_unlock_irq(lock) __read_unlock_irq(lock) +#endif + +#ifdef __always_inline__write_unlock_irq +#define 
_write_unlock_irq(lock) __write_unlock_irq(lock) +#endif + +#ifdef __always_inline__spin_unlock_irqrestore +#define _spin_unlock_irqrestore(lock, flags) __spin_unlock_irqrestore(lock, flags) +#endif + +#ifdef __always_inline__read_unlock_irqrestore +#define _read_unlock_irqrestore(lock, flags) __read_unlock_irqrestore(lock, flags) +#endif + +#ifdef __always_inline__write_unlock_irqrestore +#define _write_unlock_irqrestore(lock, flags) __write_unlock_irqrestore(lock, flags) +#endif + +#endif /* CONFIG_DEBUG_SPINLOCK */ + static inline int __spin_trylock(spinlock_t *lock) { preempt_disable(); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 2c000f5c070..5ddab730cb2 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,23 +21,29 @@ #include #include +#ifndef _spin_trylock int __lockfunc _spin_trylock(spinlock_t *lock) { return __spin_trylock(lock); } EXPORT_SYMBOL(_spin_trylock); +#endif +#ifndef _read_trylock int __lockfunc _read_trylock(rwlock_t *lock) { return __read_trylock(lock); } EXPORT_SYMBOL(_read_trylock); +#endif +#ifndef _write_trylock int __lockfunc _write_trylock(rwlock_t *lock) { return __write_trylock(lock); } EXPORT_SYMBOL(_write_trylock); +#endif /* * If lockdep is enabled then we use the non-preemption spin-ops @@ -46,77 +52,101 @@ EXPORT_SYMBOL(_write_trylock); */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#ifndef _read_lock void __lockfunc _read_lock(rwlock_t *lock) { __read_lock(lock); } EXPORT_SYMBOL(_read_lock); +#endif +#ifndef _spin_lock_irqsave unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) { return __spin_lock_irqsave(lock); } EXPORT_SYMBOL(_spin_lock_irqsave); +#endif +#ifndef _spin_lock_irq void __lockfunc _spin_lock_irq(spinlock_t *lock) { __spin_lock_irq(lock); } EXPORT_SYMBOL(_spin_lock_irq); +#endif +#ifndef _spin_lock_bh void __lockfunc _spin_lock_bh(spinlock_t *lock) { __spin_lock_bh(lock); } EXPORT_SYMBOL(_spin_lock_bh); +#endif +#ifndef _read_lock_irqsave unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) { return __read_lock_irqsave(lock); } EXPORT_SYMBOL(_read_lock_irqsave); +#endif +#ifndef _read_lock_irq void __lockfunc _read_lock_irq(rwlock_t *lock) { __read_lock_irq(lock); } EXPORT_SYMBOL(_read_lock_irq); +#endif +#ifndef _read_lock_bh void __lockfunc _read_lock_bh(rwlock_t *lock) { __read_lock_bh(lock); } EXPORT_SYMBOL(_read_lock_bh); +#endif +#ifndef _write_lock_irqsave unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) { return __write_lock_irqsave(lock); } EXPORT_SYMBOL(_write_lock_irqsave); +#endif +#ifndef _write_lock_irq void __lockfunc _write_lock_irq(rwlock_t *lock) { __write_lock_irq(lock); } EXPORT_SYMBOL(_write_lock_irq); +#endif +#ifndef _write_lock_bh void __lockfunc _write_lock_bh(rwlock_t *lock) { __write_lock_bh(lock); } EXPORT_SYMBOL(_write_lock_bh); +#endif +#ifndef _spin_lock void __lockfunc _spin_lock(spinlock_t *lock) { __spin_lock(lock); } EXPORT_SYMBOL(_spin_lock); +#endif +#ifndef _write_lock void __lockfunc _write_lock(rwlock_t *lock) { __write_lock(lock); } EXPORT_SYMBOL(_write_lock); +#endif #else /* CONFIG_PREEMPT: */ @@ -242,83 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock); #endif +#ifndef _spin_unlock void __lockfunc _spin_unlock(spinlock_t *lock) { __spin_unlock(lock); } EXPORT_SYMBOL(_spin_unlock); +#endif +#ifndef _write_unlock void __lockfunc _write_unlock(rwlock_t *lock) { __write_unlock(lock); } EXPORT_SYMBOL(_write_unlock); +#endif +#ifndef _read_unlock void __lockfunc _read_unlock(rwlock_t *lock) { __read_unlock(lock); } 
EXPORT_SYMBOL(_read_unlock); +#endif +#ifndef _spin_unlock_irqrestore void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { __spin_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_spin_unlock_irqrestore); +#endif +#ifndef _spin_unlock_irq void __lockfunc _spin_unlock_irq(spinlock_t *lock) { __spin_unlock_irq(lock); } EXPORT_SYMBOL(_spin_unlock_irq); +#endif +#ifndef _spin_unlock_bh void __lockfunc _spin_unlock_bh(spinlock_t *lock) { __spin_unlock_bh(lock); } EXPORT_SYMBOL(_spin_unlock_bh); +#endif +#ifndef _read_unlock_irqrestore void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __read_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_read_unlock_irqrestore); +#endif +#ifndef _read_unlock_irq void __lockfunc _read_unlock_irq(rwlock_t *lock) { __read_unlock_irq(lock); } EXPORT_SYMBOL(_read_unlock_irq); +#endif +#ifndef _read_unlock_bh void __lockfunc _read_unlock_bh(rwlock_t *lock) { __read_unlock_bh(lock); } EXPORT_SYMBOL(_read_unlock_bh); +#endif +#ifndef _write_unlock_irqrestore void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __write_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_write_unlock_irqrestore); +#endif +#ifndef _write_unlock_irq void __lockfunc _write_unlock_irq(rwlock_t *lock) { __write_unlock_irq(lock); } EXPORT_SYMBOL(_write_unlock_irq); +#endif +#ifndef _write_unlock_bh void __lockfunc _write_unlock_bh(rwlock_t *lock) { __write_unlock_bh(lock); } EXPORT_SYMBOL(_write_unlock_bh); +#endif +#ifndef _spin_trylock_bh int __lockfunc _spin_trylock_bh(spinlock_t *lock) { return __spin_trylock_bh(lock); } EXPORT_SYMBOL(_spin_trylock_bh); +#endif notrace int in_lock_functions(unsigned long addr) { -- cgit v1.2.3-70-g09d2 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of intel_txt patch on non-X86 platforms. Remove the tboot code from generic code init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 4 + arch/x86/include/asm/tboot.h | 197 ------------------------------------------ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 ++++++++++--- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- include/linux/tboot.h | 162 ++++++++++++++++++++++++++++++++++ init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 13 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/x86/include/asm/tboot.h create mode 100644 include/linux/tboot.h (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..b66f2102c35 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h deleted file mode 100644 index b13929d4e5f..00000000000 --- a/arch/x86/include/asm/tboot.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * tboot.h: shared data structure with tboot and kernel and functions - * used by kernel for runtime support of Intel(R) Trusted - * Execution Technology - * - * Copyright (c) 2006-2009, Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
- * - */ - -#ifndef _ASM_TBOOT_H -#define _ASM_TBOOT_H - -#include - -/* these must have the values from 0-5 in this order */ -enum { - TB_SHUTDOWN_REBOOT = 0, - TB_SHUTDOWN_S5, - TB_SHUTDOWN_S4, - TB_SHUTDOWN_S3, - TB_SHUTDOWN_HALT, - TB_SHUTDOWN_WFS -}; - -#ifdef CONFIG_INTEL_TXT - -/* used to communicate between tboot and the launched kernel */ - -#define TB_KEY_SIZE 64 /* 512 bits */ - -#define MAX_TB_MAC_REGIONS 32 - -struct tboot_mac_region { - u64 start; /* must be 64 byte -aligned */ - u32 size; /* must be 64 byte -granular */ -} __packed; - -/* GAS - Generic Address Structure (ACPI 2.0+) */ -struct tboot_acpi_generic_address { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_width; - u64 address; -} __packed; - -/* - * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec - * (http://www.acpi.info/) - */ -struct tboot_acpi_sleep_info { - struct tboot_acpi_generic_address pm1a_cnt_blk; - struct tboot_acpi_generic_address pm1b_cnt_blk; - struct tboot_acpi_generic_address pm1a_evt_blk; - struct tboot_acpi_generic_address pm1b_evt_blk; - u16 pm1a_cnt_val; - u16 pm1b_cnt_val; - u64 wakeup_vector; - u32 vector_width; - u64 kernel_s3_resume_vector; -} __packed; - -/* - * shared memory page used for communication between tboot and kernel - */ -struct tboot { - /* - * version 3+ fields: - */ - - /* TBOOT_UUID */ - u8 uuid[16]; - - /* version number: 5 is current */ - u32 version; - - /* physical addr of tb_log_t log */ - u32 log_addr; - - /* - * physical addr of entry point for tboot shutdown and - * type of shutdown (TB_SHUTDOWN_*) being requested - */ - u32 shutdown_entry; - u32 shutdown_type; - - /* kernel-specified ACPI info for Sx shutdown */ - struct tboot_acpi_sleep_info acpi_sinfo; - - /* tboot location in memory (physical) */ - u32 tboot_base; - u32 tboot_size; - - /* memory regions (phys addrs) for tboot to MAC on S3 */ - u8 num_mac_regions; - struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; - - - /* - * version 4+ fields: - */ - - /* symmetric key for use by kernel; will be encrypted on S3 */ - u8 s3_key[TB_KEY_SIZE]; - - - /* - * version 5+ fields: - */ - - /* used to 4byte-align num_in_wfs */ - u8 reserved_align[3]; - - /* number of processors in wait-for-SIPI */ - u32 num_in_wfs; -} __packed; - -/* - * UUID for tboot data struct to facilitate matching - * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is - * represented as {} in the char array used here - */ -#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ - 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} - -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - -extern void tboot_probe(void); -extern void tboot_create_trampoline(void); -extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); -extern int tboot_wait_for_aps(int num_aps); -extern struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl); -extern int tboot_force_iommu(void); - -#else /* CONFIG_INTEL_TXT */ - -static inline int tboot_enabled(void) -{ - return 0; -} - -static inline void tboot_probe(void) -{ -} - -static inline void tboot_create_trampoline(void) -{ -} - -static inline void tboot_shutdown(u32 shutdown_type) -{ -} - -static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ -} - -static inline int tboot_wait_for_aps(int num_aps) -{ - return 0; -} - -static inline struct acpi_table_header *tboot_get_dmar_table( - 
struct acpi_table_header *dmar_tbl) -{ - return dmar_tbl; -} - -static inline int tboot_force_iommu(void) -{ - return 0; -} - -#endif /* !CONFIG_INTEL_TXT */ - -#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9de01c5d979..18ce5c04242 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e3248..6ce0d6f38f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include #include