From 061b1bd394ca8628b7c24eb4658ba3535da4249a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Sep 2008 14:46:44 -0700 Subject: Staging: add TAINT_CRAP for all drivers/staging code We need to add a flag for all code that is in the drivers/staging/ directory to prevent all other kernel developers from worrying about issues here, and to notify users that the drivers might not be as good as they are normally used to. Based on code from Andreas Gruenbacher and Jeff Mahoney to provide a TAINT flag for the support level of a kernel module in the Novell enterprise kernel release. This is the kernel portion of this feature, the ability for the flag to be set needs to be done in the build process and will happen in a follow-up patch. Cc: Andreas Gruenbacher Cc: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/module.c') diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04..152b1655bba 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1806,6 +1806,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1960,6 +1961,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2556,6 +2565,8 @@ static char *module_flags(struct module *mod, char *buf) buf[bx++] = 'P'; if (mod->taints & TAINT_FORCED_MODULE) buf[bx++] = 'F'; + if (mod->taints & TAINT_CRAP) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't -- cgit v1.2.3-70-g09d2 From 97e1c18e8d17bd87e1e383b2e9d9fc740332c8e2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 18 Jul 2008 12:16:16 -0400 Subject: tracing: Kernel Tracepoints Implementation of kernel tracepoints. Inspired from the Linux Kernel Markers. Allows complete typing verification by declaring both tracing statement inline functions and probe registration/unregistration static inline functions within the same macro "DEFINE_TRACE". No format string is required. See the tracepoint Documentation and Samples patches for usage examples. Taken from the documentation patch : "A tracepoint placed in code provides a hook to call a function (probe) that you can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or "off" (no probe is attached). When a tracepoint is "off" it has no effect, except for adding a tiny time penalty (checking a condition for a branch) and space penalty (adding a few bytes for the function call at the end of the instrumented function and adds a data structure in a separate section). When a tracepoint is "on", the function you provide is called each time the tracepoint is executed, in the execution context of the caller. When the function provided ends its execution, it returns to the caller (continuing from the tracepoint site). You can put tracepoints at important locations in the code. They are lightweight hooks that can pass an arbitrary number of parameters, which prototypes are described in a tracepoint declaration placed in a header file." Addition and removal of tracepoints is synchronized by RCU using the scheduler (and preempt_disable) as guarantees to find a quiescent state (this is really RCU "classic"). The update side uses rcu_barrier_sched() with call_rcu_sched() and the read/execute side uses "preempt_disable()/preempt_enable()". We make sure the previous array containing probes, which has been scheduled for deletion by the rcu callback, is indeed freed before we proceed to the next update. It therefore limits the rate of modification of a single tracepoint to one update per RCU period. The objective here is to permit fast batch add/removal of probes on _different_ tracepoints. Changelog : - Use #name ":" #proto as string to identify the tracepoint in the tracepoint table. This will make sure not type mismatch happens due to connexion of a probe with the wrong type to a tracepoint declared with the same name in a different header. - Add tracepoint_entry_free_old. - Change __TO_TRACE to get rid of the 'i' iterator. Masami Hiramatsu : Tested on x86-64. Performance impact of a tracepoint : same as markers, except that it adds about 70 bytes of instructions in an unlikely branch of each instrumented function (the for loop, the stack setup and the function call). It currently adds a memory read, a test and a conditional branch at the instrumentation site (in the hot path). Immediate values will eventually change this into a load immediate, test and branch, which removes the memory read which will make the i-cache impact smaller (changing the memory read for a load immediate removes 3-4 bytes per site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it also saves the d-cache hit). About the performance impact of tracepoints (which is comparable to markers), even without immediate values optimizations, tests done by Hideo Aoki on ia64 show no regression. His test case was using hackbench on a kernel where scheduler instrumentation (about 5 events in code scheduler code) was added. Quoting Hideo Aoki about Markers : I evaluated overhead of kernel marker using linux-2.6-sched-fixes git tree, which includes several markers for LTTng, using an ia64 server. While the immediate trace mark feature isn't implemented on ia64, there is no major performance regression. So, I think that we don't have any issues to propose merging marker point patches into Linus's tree from the viewpoint of performance impact. I prepared two kernels to evaluate. The first one was compiled without CONFIG_MARKERS. The second one was enabled CONFIG_MARKERS. I downloaded the original hackbench from the following URL: http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c I ran hackbench 5 times in each condition and calculated the average and difference between the kernels. The parameter of hackbench: every 50 from 50 to 800 The number of CPUs of the server: 2, 4, and 8 Below is the results. As you can see, major performance regression wasn't found in any case. Even if number of processes increases, differences between marker-enabled kernel and marker- disabled kernel doesn't increase. Moreover, if number of CPUs increases, the differences doesn't increase either. Curiously, marker-enabled kernel is better than marker-disabled kernel in more than half cases, although I guess it comes from the difference of memory access pattern. * 2 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 4.811 | 4.872 | +0.061 | +1.27 | 100 | 9.854 | 10.309 | +0.454 | +4.61 | 150 | 15.602 | 15.040 | -0.562 | -3.6 | 200 | 20.489 | 20.380 | -0.109 | -0.53 | 250 | 25.798 | 25.652 | -0.146 | -0.56 | 300 | 31.260 | 30.797 | -0.463 | -1.48 | 350 | 36.121 | 35.770 | -0.351 | -0.97 | 400 | 42.288 | 42.102 | -0.186 | -0.44 | 450 | 47.778 | 47.253 | -0.526 | -1.1 | 500 | 51.953 | 52.278 | +0.325 | +0.63 | 550 | 58.401 | 57.700 | -0.701 | -1.2 | 600 | 63.334 | 63.222 | -0.112 | -0.18 | 650 | 68.816 | 68.511 | -0.306 | -0.44 | 700 | 74.667 | 74.088 | -0.579 | -0.78 | 750 | 78.612 | 79.582 | +0.970 | +1.23 | 800 | 85.431 | 85.263 | -0.168 | -0.2 | -------------------------------------------------------------- * 4 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 2.586 | 2.584 | -0.003 | -0.1 | 100 | 5.254 | 5.283 | +0.030 | +0.56 | 150 | 8.012 | 8.074 | +0.061 | +0.76 | 200 | 11.172 | 11.000 | -0.172 | -1.54 | 250 | 13.917 | 14.036 | +0.119 | +0.86 | 300 | 16.905 | 16.543 | -0.362 | -2.14 | 350 | 19.901 | 20.036 | +0.135 | +0.68 | 400 | 22.908 | 23.094 | +0.186 | +0.81 | 450 | 26.273 | 26.101 | -0.172 | -0.66 | 500 | 29.554 | 29.092 | -0.461 | -1.56 | 550 | 32.377 | 32.274 | -0.103 | -0.32 | 600 | 35.855 | 35.322 | -0.533 | -1.49 | 650 | 39.192 | 38.388 | -0.804 | -2.05 | 700 | 41.744 | 41.719 | -0.025 | -0.06 | 750 | 45.016 | 44.496 | -0.520 | -1.16 | 800 | 48.212 | 47.603 | -0.609 | -1.26 | -------------------------------------------------------------- * 8 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 2.094 | 2.072 | -0.022 | -1.07 | 100 | 4.162 | 4.273 | +0.111 | +2.66 | 150 | 6.485 | 6.540 | +0.055 | +0.84 | 200 | 8.556 | 8.478 | -0.078 | -0.91 | 250 | 10.458 | 10.258 | -0.200 | -1.91 | 300 | 12.425 | 12.750 | +0.325 | +2.62 | 350 | 14.807 | 14.839 | +0.032 | +0.22 | 400 | 16.801 | 16.959 | +0.158 | +0.94 | 450 | 19.478 | 19.009 | -0.470 | -2.41 | 500 | 21.296 | 21.504 | +0.208 | +0.98 | 550 | 23.842 | 23.979 | +0.137 | +0.57 | 600 | 26.309 | 26.111 | -0.198 | -0.75 | 650 | 28.705 | 28.446 | -0.259 | -0.9 | 700 | 31.233 | 31.394 | +0.161 | +0.52 | 750 | 34.064 | 33.720 | -0.344 | -1.01 | 800 | 36.320 | 36.114 | -0.206 | -0.57 | -------------------------------------------------------------- Signed-off-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: 'Peter Zijlstra' Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 6 +- include/linux/module.h | 17 ++ include/linux/tracepoint.h | 127 ++++++++++ init/Kconfig | 7 + kernel/Makefile | 1 + kernel/module.c | 66 +++++- kernel/tracepoint.c | 476 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 698 insertions(+), 2 deletions(-) create mode 100644 include/linux/tracepoint.h create mode 100644 kernel/tracepoint.c (limited to 'kernel/module.c') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7440a0dcedd..3d8e472a09c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -52,7 +52,10 @@ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___markers) = .; \ *(__markers) \ - VMLINUX_SYMBOL(__stop___markers) = .; + VMLINUX_SYMBOL(__stop___markers) = .; \ + VMLINUX_SYMBOL(__start___tracepoints) = .; \ + *(__tracepoints) \ + VMLINUX_SYMBOL(__stop___tracepoints) = .; #define RO_DATA(align) \ . = ALIGN((align)); \ @@ -61,6 +64,7 @@ *(.rodata) *(.rodata.*) \ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ + *(__tracepoints_strings)/* Tracepoints: strings */ \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/module.h b/include/linux/module.h index 68e09557c95..8b611350386 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -331,6 +332,10 @@ struct module struct marker *markers; unsigned int num_markers; #endif +#ifdef CONFIG_TRACEPOINTS + struct tracepoint *tracepoints; + unsigned int num_tracepoints; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ @@ -454,6 +459,9 @@ extern void print_modules(void); extern void module_update_markers(void); +extern void module_update_tracepoints(void); +extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); + #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -558,6 +566,15 @@ static inline void module_update_markers(void) { } +static inline void module_update_tracepoints(void) +{ +} + +static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + return 0; +} + #endif /* CONFIG_MODULES */ struct device_driver; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h new file mode 100644 index 00000000000..e623a6fca5c --- /dev/null +++ b/include/linux/tracepoint.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_TRACEPOINT_H +#define _LINUX_TRACEPOINT_H + +/* + * Kernel Tracepoint API. + * + * See Documentation/tracepoint.txt. + * + * (C) Copyright 2008 Mathieu Desnoyers + * + * Heavily inspired from the Linux Kernel Markers. + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include +#include + +struct module; +struct tracepoint; + +struct tracepoint { + const char *name; /* Tracepoint name */ + int state; /* State. */ + void **funcs; +} __attribute__((aligned(8))); + + +#define TPPROTO(args...) args +#define TPARGS(args...) args + +#ifdef CONFIG_TRACEPOINTS + +/* + * it_func[0] is never NULL because there is at least one element in the array + * when the array itself is non NULL. + */ +#define __DO_TRACE(tp, proto, args) \ + do { \ + void **it_func; \ + \ + rcu_read_lock_sched(); \ + it_func = rcu_dereference((tp)->funcs); \ + if (it_func) { \ + do { \ + ((void(*)(proto))(*it_func))(args); \ + } while (*(++it_func)); \ + } \ + rcu_read_unlock_sched(); \ + } while (0) + +/* + * Make sure the alignment of the structure in the __tracepoints section will + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + */ +#define DEFINE_TRACE(name, proto, args) \ + static inline void trace_##name(proto) \ + { \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) \ + = #name ":" #proto; \ + static struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"), aligned(8))) = \ + { __tpstrtab_##name, 0, NULL }; \ + if (unlikely(__tracepoint_##name.state)) \ + __DO_TRACE(&__tracepoint_##name, \ + TPPROTO(proto), TPARGS(args)); \ + } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ + return tracepoint_probe_register(#name ":" #proto, \ + (void *)probe); \ + } \ + static inline void unregister_trace_##name(void (*probe)(proto))\ + { \ + tracepoint_probe_unregister(#name ":" #proto, \ + (void *)probe); \ + } + +extern void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end); + +#else /* !CONFIG_TRACEPOINTS */ +#define DEFINE_TRACE(name, proto, args) \ + static inline void _do_trace_##name(struct tracepoint *tp, proto) \ + { } \ + static inline void trace_##name(proto) \ + { } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ + return -ENOSYS; \ + } \ + static inline void unregister_trace_##name(void (*probe)(proto))\ + { } + +static inline void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ } +#endif /* CONFIG_TRACEPOINTS */ + +/* + * Connect a probe to a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_register(const char *name, void *probe); + +/* + * Disconnect a probe from a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_unregister(const char *name, void *probe); + +struct tracepoint_iter { + struct module *module; + struct tracepoint *tracepoint; +}; + +extern void tracepoint_iter_start(struct tracepoint_iter *iter); +extern void tracepoint_iter_next(struct tracepoint_iter *iter); +extern void tracepoint_iter_stop(struct tracepoint_iter *iter); +extern void tracepoint_iter_reset(struct tracepoint_iter *iter); +extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end); + +#endif diff --git a/init/Kconfig b/init/Kconfig index c11da38837e..70082678a91 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -771,6 +771,13 @@ config PROFILING Say Y here to enable the extended profiling support mechanisms used by profilers such as OProfile. +config TRACEPOINTS + bool "Activate tracepoints" + default y + help + Place an empty function call at each tracepoint site. Can be + dynamically changed for a probe function. + config MARKERS bool "Activate markers" help diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df7c3e..8f9ce7ec21b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FTRACE) += trace/ diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04..661d73db786 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -1831,6 +1832,8 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int tracepointsindex; + unsigned int tracepointsstringsindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2117,6 +2120,9 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); + tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, + "__tracepoints_strings"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2144,6 +2150,12 @@ static noinline struct module *load_module(void __user *umod, mod->num_markers = sechdrs[markersindex].sh_size / sizeof(*mod->markers); #endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; + mod->num_tracepoints = + sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); +#endif + /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2162,11 +2174,16 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + if (!mod->taints) { #ifdef CONFIG_MARKERS - if (!mod->taints) marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif +#ifdef CONFIG_TRACEPOINTS + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); +#endif + } err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2717,3 +2734,50 @@ void module_update_markers(void) mutex_unlock(&module_mutex); } #endif + +#ifdef CONFIG_TRACEPOINTS +void module_update_tracepoints(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!mod->taints) + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. + */ +int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!iter_mod->taints) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints, + iter_mod->tracepoints + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c new file mode 100644 index 00000000000..42e86ddbd2a --- /dev/null +++ b/kernel/tracepoint.c @@ -0,0 +1,476 @@ +/* + * Copyright (C) 2008 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct tracepoint __start___tracepoints[]; +extern struct tracepoint __stop___tracepoints[]; + +/* Set to 1 to enable tracepoint debug output */ +static const int tracepoint_debug; + +/* + * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the + * builtin and module tracepoints and the hash table. + */ +static DEFINE_MUTEX(tracepoints_mutex); + +/* + * Tracepoint hash table, containing the active tracepoints. + * Protected by tracepoints_mutex. + */ +#define TRACEPOINT_HASH_BITS 6 +#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) + +/* + * Note about RCU : + * It is used to to delay the free of multiple probes array until a quiescent + * state is reached. + * Tracepoint entries modifications are protected by the tracepoints_mutex. + */ +struct tracepoint_entry { + struct hlist_node hlist; + void **funcs; + int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + char name[0]; +}; + +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; + +static void free_old_closure(struct rcu_head *head) +{ + struct tracepoint_entry *entry = container_of(head, + struct tracepoint_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +{ + if (!old) + return; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); +} + +static void debug_print_probes(struct tracepoint_entry *entry) +{ + int i; + + if (!tracepoint_debug) + return; + + for (i = 0; entry->funcs[i]; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); +} + +static void * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0; + void **old, **new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->funcs; + if (old) { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) + if (old[nr_probes] == probe) + return ERR_PTR(-EEXIST); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (old) + memcpy(new, old, nr_probes * sizeof(void *)); + new[nr_probes] = probe; + entry->refcount = nr_probes + 1; + entry->funcs = new; + debug_print_probes(entry); + return old; +} + +static void * +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0, nr_del = 0, i; + void **old, **new; + + old = entry->funcs; + + debug_print_probes(entry); + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) { + if ((!probe || old[nr_probes] == probe)) + nr_del++; + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->funcs = NULL; + entry->refcount = 0; + debug_print_probes(entry); + return old; + } else { + int j = 0; + /* N -> M, (N > 1, M > 0) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i]; i++) + if ((probe && old[i] != probe)) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->funcs = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get tracepoint if the tracepoint is present in the tracepoint hash table. + * Must be called with tracepoints_mutex held. + * Returns NULL if not present. + */ +static struct tracepoint_entry *get_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + u32 hash = jhash(name, strlen(name), 0); + + head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the tracepoint to the tracepoint hash table. Must be called with + * tracepoints_mutex held. + */ +static struct tracepoint_entry *add_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + size_t name_len = strlen(name) + 1; + u32 hash = jhash(name, name_len-1, 0); + + head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + printk(KERN_NOTICE + "tracepoint %s busy\n", name); + return ERR_PTR(-EEXIST); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(&e->name[0], name, name_len); + e->funcs = NULL; + e->refcount = 0; + e->rcu_pending = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the tracepoint from the tracepoint hash table. Must be called with + * mutex_lock held. + */ +static int remove_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + int found = 0; + size_t len = strlen(name) + 1; + u32 hash = jhash(name, len-1, 0); + + head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->refcount) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; +} + +/* + * Sets the probe callback corresponding to one tracepoint. + */ +static void set_tracepoint(struct tracepoint_entry **entry, + struct tracepoint *elem, int active) +{ + WARN_ON(strcmp((*entry)->name, elem->name) != 0); + + /* + * rcu_assign_pointer has a smp_wmb() which makes sure that the new + * probe callbacks array is consistent before setting a pointer to it. + * This array is referenced by __DO_TRACE from + * include/linux/tracepoints.h. A matching smp_read_barrier_depends() + * is used. + */ + rcu_assign_pointer(elem->funcs, (*entry)->funcs); + elem->state = active; +} + +/* + * Disable a tracepoint and its probe callback. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. + */ +static void disable_tracepoint(struct tracepoint *elem) +{ + elem->state = 0; +} + +/** + * tracepoint_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of tracepoints. + */ +void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ + struct tracepoint *iter; + struct tracepoint_entry *mark_entry; + + mutex_lock(&tracepoints_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_tracepoint(iter->name); + if (mark_entry) { + set_tracepoint(&mark_entry, iter, + !!mark_entry->refcount); + } else { + disable_tracepoint(iter); + } + } + mutex_unlock(&tracepoints_mutex); +} + +/* + * Update probes, removing the faulty probes. + */ +static void tracepoint_update_probes(void) +{ + /* Core kernel tracepoints */ + tracepoint_update_probe_range(__start___tracepoints, + __stop___tracepoints); + /* tracepoints in modules. */ + module_update_tracepoints(); +} + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @name: tracepoint name + * @probe: probe handler + * + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. + */ +int tracepoint_probe_register(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + int ret = 0; + void *old; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } + } + /* + * If we detect that a call_rcu is pending for this tracepoint, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + goto end; + } + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + WARN_ON(!entry); + tracepoint_entry_free_old(entry, old); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register); + +/** + * tracepoint_probe_unregister - Disconnect a probe from a tracepoint + * @name: tracepoint name + * @probe: probe function pointer + * + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int tracepoint_probe_unregister(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + int ret = -ENOENT; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier(); + old = tracepoint_entry_remove_probe(entry, probe); + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + tracepoint_entry_free_old(entry, old); + remove_tracepoint(name); /* Ignore busy error message */ + ret = 0; +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +/** + * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. + * @tracepoint: current tracepoints (in), next tracepoint (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next tracepoint has been found (1) or not (0). + * Will return the first tracepoint in the range if the input tracepoint is + * NULL. + */ +int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end) +{ + if (!*tracepoint && begin != end) { + *tracepoint = begin; + return 1; + } + if (*tracepoint >= begin && *tracepoint < end) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); + +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + if (!iter->module) { + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints, __stop___tracepoints); + if (found) + goto end; + } + /* tracepoints in modules. */ + found = module_get_iter_tracepoints(iter); +end: + if (!found) + tracepoint_iter_reset(iter); +} + +void tracepoint_iter_start(struct tracepoint_iter *iter) +{ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_start); + +void tracepoint_iter_next(struct tracepoint_iter *iter) +{ + iter->tracepoint++; + /* + * iter->tracepoint may be invalid because we blindly incremented it. + * Make sure it is valid by marshalling on the tracepoints, getting the + * tracepoints from following modules if necessary. + */ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_next); + +void tracepoint_iter_stop(struct tracepoint_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(tracepoint_iter_stop); + +void tracepoint_iter_reset(struct tracepoint_iter *iter) +{ + iter->module = NULL; + iter->tracepoint = NULL; +} +EXPORT_SYMBOL_GPL(tracepoint_iter_reset); -- cgit v1.2.3-70-g09d2 From 90d595fe5ca4b685465c068907e6e554760abea8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:09 -0400 Subject: ftrace: enable mcount recording for modules This patch enables the loading of the __mcount_section of modules and changing all the callers of mcount into nops. The modification is done before the init_module function is called, so again, we do not need to use kstop_machine to make these changes. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 +++ kernel/module.c | 11 +++++++++++ kernel/trace/ftrace.c | 5 +++++ 3 files changed, 19 insertions(+) (limited to 'kernel/module.c') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d4d6ab453b7..4936489f9ed 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -164,8 +164,11 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); +extern void ftrace_init_module(unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } +static inline void +ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/module.c b/kernel/module.c index 661d73db786..d753fd9d83e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -1834,6 +1835,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int markersstringsindex; unsigned int tracepointsindex; unsigned int tracepointsstringsindex; + unsigned int mcountindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2124,6 +2126,9 @@ static noinline struct module *load_module(void __user *umod, tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints_strings"); + mcountindex = find_sec(hdr, sechdrs, secstrings, + "__mcount_loc"); + /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { const char *strtab = (char *)sechdrs[strindex].sh_addr; @@ -2184,6 +2189,12 @@ static noinline struct module *load_module(void __user *umod, mod->tracepoints + mod->num_tracepoints); #endif } + + if (mcountindex) { + void *mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + } + err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index df96d5990c0..ea45bb1c0fd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,11 @@ static int ftrace_convert_nops(unsigned long *start, return 0; } +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ + ftrace_convert_nops(start, end); +} + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; -- cgit v1.2.3-70-g09d2 From fed1939c64d2288938fdc1c367d49082da65e195 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 22:47:19 -0400 Subject: ftrace: remove old pointers to mcount When a mcount pointer is recorded into a table, it is used to add or remove calls to mcount (replacing them with nops). If the code is removed via removing a module, the pointers still exist. At modifying the code a check is always made to make sure the code being replaced is the code expected. In-other-words, the code being replaced is compared to what it is expected to be before being replaced. There is a very small chance that the code being replaced just happens to look like code that calls mcount (very small since the call to mcount is relative). To remove this chance, this patch adds ftrace_release to allow module unloading to remove the pointers to mcount within the module. Another change for init calls is made to not trace calls marked with __init. The tracing can not be started until after init is done anyway. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ include/linux/init.h | 2 +- kernel/module.c | 12 ++++++++---- kernel/trace/ftrace.c | 32 ++++++++++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel/module.c') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4936489f9ed..6b232a2460c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -165,10 +165,12 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); extern void ftrace_init_module(unsigned long *start, unsigned long *end); +extern void ftrace_release(void *start, unsigned long size); #else static inline void ftrace_init(void) { } static inline void ftrace_init_module(unsigned long *start, unsigned long *end) { } +static inline void ftrace_release(void *start, unsigned long size) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/init.h b/include/linux/init.h index 93538b696e3..27f61f6b3cb 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -40,7 +40,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold +#define __init __section(.init.text) __cold notrace #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) diff --git a/kernel/module.c b/kernel/module.c index d753fd9d83e..7576c2d9462 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1431,6 +1431,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* release any pointers to mcount in this module */ + ftrace_release(mod->module_core, mod->core_size); + /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1839,6 +1842,7 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *mseg; struct exception_table_entry *extable; mm_segment_t old_fs; @@ -2190,10 +2194,9 @@ static noinline struct module *load_module(void __user *umod, #endif } - if (mcountindex) { - void *mseg = (void *)sechdrs[mcountindex].sh_addr; - ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); - } + /* sechdrs[0].sh_size is always zero */ + mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2264,6 +2267,7 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); + ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); module_free(mod, mod->module_init); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8affb6d00ec..eadd0eaea9b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -294,13 +294,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node) static void ftrace_free_rec(struct dyn_ftrace *rec) { - /* no locking, only called from kstop_machine */ - rec->ip = (unsigned long)ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } +void ftrace_release(void *start, unsigned long size) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = s + size; + int i; + + if (!start) + return; + + /* No interrupt should call this */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } + } + spin_unlock(&ftrace_lock); + +} + static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -1527,7 +1551,9 @@ static int ftrace_convert_nops(unsigned long *start, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + spin_lock(&ftrace_lock); ftrace_record_ip(addr); + spin_unlock(&ftrace_lock); ftrace_shutdown_replenish(); } @@ -1541,6 +1567,8 @@ static int ftrace_convert_nops(unsigned long *start, void ftrace_init_module(unsigned long *start, unsigned long *end) { + if (start == end) + return; ftrace_convert_nops(start, end); } -- cgit v1.2.3-70-g09d2 From e94320939f44e0cbaccc3f259a5778abced4949c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 23 Sep 2008 23:51:11 +0400 Subject: modules: fix module "notes" kobject leak Fix "notes" kobject leak It happens every rmmod if KALLSYMS=y and SYSFS=y. # modprobe foo kobject: 'foo' (ffffffffa00743d0): kobject_add_internal: parent: 'module', set: 'module' kobject: 'holders' (ffff88017e7c5770): kobject_add_internal: parent: 'foo', set: '' kobject: 'foo' (ffffffffa00743d0): kobject_uevent_env kobject: 'foo' (ffffffffa00743d0): fill_kobj_path: path = '/module/foo' kobject: 'notes' (ffff88017fa9b668): kobject_add_internal: parent: 'foo', set: '' ^^^^^ # rmmod foo kobject: 'holders' (ffff88017e7c5770): kobject_cleanup kobject: 'holders' (ffff88017e7c5770): auto cleanup kobject_del kobject: 'holders' (ffff88017e7c5770): calling ktype release kobject: (ffff88017e7c5770): dynamic_kobj_release kobject: 'holders': free name kobject: 'foo' (ffffffffa00743d0): kobject_cleanup kobject: 'foo' (ffffffffa00743d0): does not have a release() function, it is broken and must be fixed. kobject: 'foo' (ffffffffa00743d0): auto cleanup 'remove' event kobject: 'foo' (ffffffffa00743d0): kobject_uevent_env kobject: 'foo' (ffffffffa00743d0): fill_kobj_path: path = '/module/foo' kobject: 'foo' (ffffffffa00743d0): auto cleanup kobject_del kobject: 'foo': free name [whooops] Signed-off-by: Alexey Dobriyan Cc: stable Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/module.c') diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04..d5fcd24e5af 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1173,7 +1173,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, while (i-- > 0) sysfs_remove_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); - kobject_del(notes_attrs->dir); + kobject_put(notes_attrs->dir); } kfree(notes_attrs); } -- cgit v1.2.3-70-g09d2 From 346e15beb5343c2eb8216d820f2ed8f150822b08 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 12 Aug 2008 16:46:19 -0400 Subject: driver core: basic infrastructure for per-module dynamic debug messages Base infrastructure to enable per-module debug messages. I've introduced CONFIG_DYNAMIC_PRINTK_DEBUG, which when enabled centralizes control of debugging statements on a per-module basis in one /proc file, currently, /dynamic_printk/modules. When, CONFIG_DYNAMIC_PRINTK_DEBUG, is not set, debugging statements can still be enabled as before, often by defining 'DEBUG' for the proper compilation unit. Thus, this patch set has no affect when CONFIG_DYNAMIC_PRINTK_DEBUG is not set. The infrastructure currently ties into all pr_debug() and dev_dbg() calls. That is, if CONFIG_DYNAMIC_PRINTK_DEBUG is set, all pr_debug() and dev_dbg() calls can be dynamically enabled/disabled on a per-module basis. Future plans include extending this functionality to subsystems, that define their own debug levels and flags. Usage: Dynamic debugging is controlled by the debugfs file, /dynamic_printk/modules. This file contains a list of the modules that can be enabled. The format of the file is as follows: . . . : Name of the module in which the debug call resides : whether the messages are enabled or not For example: snd_hda_intel enabled=0 fixup enabled=1 driver enabled=0 Enable a module: $echo "set enabled=1 " > dynamic_printk/modules Disable a module: $echo "set enabled=0 " > dynamic_printk/modules Enable all modules: $echo "set enabled=1 all" > dynamic_printk/modules Disable all modules: $echo "set enabled=0 all" > dynamic_printk/modules Finally, passing "dynamic_printk" at the command line enables debugging for all modules. This mode can be turned off via the above disable command. [gkh: minor cleanups and tweaks to make the build work quietly] Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 5 + include/asm-generic/vmlinux.lds.h | 10 +- include/linux/device.h | 6 +- include/linux/dynamic_printk.h | 93 ++++++++ include/linux/kernel.h | 7 +- include/linux/module.h | 1 - kernel/module.c | 31 +++ lib/Kconfig.debug | 55 +++++ lib/Makefile | 2 + lib/dynamic_printk.c | 418 ++++++++++++++++++++++++++++++++++++ net/netfilter/nf_conntrack_pptp.c | 2 +- scripts/Makefile.lib | 11 +- scripts/basic/Makefile | 2 +- scripts/basic/hash.c | 64 ++++++ 14 files changed, 700 insertions(+), 7 deletions(-) create mode 100644 include/linux/dynamic_printk.h create mode 100644 lib/dynamic_printk.c create mode 100644 scripts/basic/hash.c (limited to 'kernel/module.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2443f5bb436..b429c84ceef 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1713,6 +1713,11 @@ and is between 256 and 4096 characters. It is defined in the file autoconfiguration. Ranges are in pairs (memory base and size). + dynamic_printk + Enables pr_debug()/dev_dbg() calls if + CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. These can also + be switched on/off via /dynamic_printk/modules + print-fatal-signals= [KNL] debug: print fatal signals print-fatal-signals=1: print segfault info to diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7440a0dcedd..74c5faf26c0 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -268,7 +268,15 @@ CPU_DISCARD(init.data) \ CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.data) \ - MEM_DISCARD(init.rodata) + MEM_DISCARD(init.rodata) \ + /* implement dynamic printk debug */ \ + VMLINUX_SYMBOL(__start___verbose_strings) = .; \ + *(__verbose_strings) \ + VMLINUX_SYMBOL(__stop___verbose_strings) = .; \ + . = ALIGN(8); \ + VMLINUX_SYMBOL(__start___verbose) = .; \ + *(__verbose) \ + VMLINUX_SYMBOL(__stop___verbose) = .; #define INIT_TEXT \ *(.init.text) \ diff --git a/include/linux/device.h b/include/linux/device.h index 60f6456691a..fb034461b39 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -550,7 +550,11 @@ extern const char *dev_driver_string(const struct device *dev); #define dev_info(dev, format, arg...) \ dev_printk(KERN_INFO , dev , format , ## arg) -#ifdef DEBUG +#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG) +#define dev_dbg(dev, format, ...) do { \ + dynamic_dev_dbg(dev, format, ##__VA_ARGS__); \ + } while (0) +#elif defined(DEBUG) #define dev_dbg(dev, format, arg...) \ dev_printk(KERN_DEBUG , dev , format , ## arg) #else diff --git a/include/linux/dynamic_printk.h b/include/linux/dynamic_printk.h new file mode 100644 index 00000000000..2d528d00907 --- /dev/null +++ b/include/linux/dynamic_printk.h @@ -0,0 +1,93 @@ +#ifndef _DYNAMIC_PRINTK_H +#define _DYNAMIC_PRINTK_H + +#define DYNAMIC_DEBUG_HASH_BITS 6 +#define DEBUG_HASH_TABLE_SIZE (1 << DYNAMIC_DEBUG_HASH_BITS) + +#define TYPE_BOOLEAN 1 + +#define DYNAMIC_ENABLED_ALL 0 +#define DYNAMIC_ENABLED_NONE 1 +#define DYNAMIC_ENABLED_SOME 2 + +extern int dynamic_enabled; + +/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which + * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They + * use independent hash functions, to reduce the chance of false positives. + */ +extern long long dynamic_printk_enabled; +extern long long dynamic_printk_enabled2; + +struct mod_debug { + char *modname; + char *logical_modname; + char *flag_names; + int type; + int hash; + int hash2; +} __attribute__((aligned(8))); + +int register_dynamic_debug_module(char *mod_name, int type, char *share_name, + char *flags, int hash, int hash2); + +#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG) +extern int unregister_dynamic_debug_module(char *mod_name); +extern int __dynamic_dbg_enabled_helper(char *modname, int type, + int value, int hash); + +#define __dynamic_dbg_enabled(module, type, value, level, hash) ({ \ + int __ret = 0; \ + if (unlikely((dynamic_printk_enabled & (1LL << DEBUG_HASH)) && \ + (dynamic_printk_enabled2 & (1LL << DEBUG_HASH2)))) \ + __ret = __dynamic_dbg_enabled_helper(module, type, \ + value, hash);\ + __ret; }) + +#define dynamic_pr_debug(fmt, ...) do { \ + static char mod_name[] \ + __attribute__((section("__verbose_strings"))) \ + = KBUILD_MODNAME; \ + static struct mod_debug descriptor \ + __used \ + __attribute__((section("__verbose"), aligned(8))) = \ + { mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\ + if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN, \ + 0, 0, DEBUG_HASH)) \ + printk(KERN_DEBUG KBUILD_MODNAME ":" fmt, \ + ##__VA_ARGS__); \ + } while (0) + +#define dynamic_dev_dbg(dev, format, ...) do { \ + static char mod_name[] \ + __attribute__((section("__verbose_strings"))) \ + = KBUILD_MODNAME; \ + static struct mod_debug descriptor \ + __used \ + __attribute__((section("__verbose"), aligned(8))) = \ + { mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\ + if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN, \ + 0, 0, DEBUG_HASH)) \ + dev_printk(KERN_DEBUG, dev, \ + KBUILD_MODNAME ": " format, \ + ##__VA_ARGS__); \ + } while (0) + +#else + +static inline int unregister_dynamic_debug_module(const char *mod_name) +{ + return 0; +} +static inline int __dynamic_dbg_enabled_helper(char *modname, int type, + int value, int hash) +{ + return 0; +} + +#define __dynamic_dbg_enabled(module, type, value, level, hash) ({ 0; }) +#define dynamic_pr_debug(fmt, ...) do { } while (0) +#define dynamic_dev_dbg(dev, format, ...) do { } while (0) +#endif + +#endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 75d81f157d2..ededb6e83b4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -303,8 +304,12 @@ static inline char *pack_hex_byte(char *buf, u8 byte) #define pr_info(fmt, arg...) \ printk(KERN_INFO fmt, ##arg) -#ifdef DEBUG /* If you are writing a driver, please use dev_dbg instead */ +#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG) +#define pr_debug(fmt, ...) do { \ + dynamic_pr_debug(fmt, ##__VA_ARGS__); \ + } while (0) +#elif defined(DEBUG) #define pr_debug(fmt, arg...) \ printk(KERN_DEBUG fmt, ##arg) #else diff --git a/include/linux/module.h b/include/linux/module.h index 68e09557c95..a41555cbe00 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -345,7 +345,6 @@ struct module /* Reference counts */ struct module_ref ref[NR_CPUS]; #endif - }; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} diff --git a/kernel/module.c b/kernel/module.c index d5fcd24e5af..c5270066729 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + unregister_dynamic_debug_module(mod->name); free_module(mod); out: @@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +{ + struct mod_debug *debug_info; + unsigned long pos, end; + unsigned int num_verbose; + + pos = sechdrs[verboseindex].sh_addr; + num_verbose = sechdrs[verboseindex].sh_size / + sizeof(struct mod_debug); + end = pos + (num_verbose * sizeof(struct mod_debug)); + + for (; pos < end; pos += sizeof(struct mod_debug)) { + debug_info = (struct mod_debug *)pos; + register_dynamic_debug_module(debug_info->modname, + debug_info->type, debug_info->logical_modname, + debug_info->flag_names, debug_info->hash, + debug_info->hash2); + } +} +#else +static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, + unsigned int verboseindex) +{ +} +#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -1831,6 +1859,7 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int verboseindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2117,6 +2146,7 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2167,6 +2197,7 @@ static noinline struct module *load_module(void __user *umod, marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif + dynamic_printk_setup(sechdrs, verboseindex); err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index aa81d284844..31d784dd80d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -807,6 +807,61 @@ menuconfig BUILD_DOCSRC Say N if you are unsure. +config DYNAMIC_PRINTK_DEBUG + bool "Enable dynamic printk() call support" + default n + depends on PRINTK + select PRINTK_DEBUG + help + + Compiles debug level messages into the kernel, which would not + otherwise be available at runtime. These messages can then be + enabled/disabled on a per module basis. This mechanism implicitly + enables all pr_debug() and dev_dbg() calls. The impact of this + compile option is a larger kernel text size of about 2%. + + Usage: + + Dynamic debugging is controlled by the debugfs file, + dynamic_printk/modules. This file contains a list of the modules that + can be enabled. The format of the file is the module name, followed + by a set of flags that can be enabled. The first flag is always the + 'enabled' flag. For example: + + + . + . + . + + : Name of the module in which the debug call resides + : whether the messages are enabled or not + + From a live system: + + snd_hda_intel enabled=0 + fixup enabled=0 + driver enabled=0 + + Enable a module: + + $echo "set enabled=1 " > dynamic_printk/modules + + Disable a module: + + $echo "set enabled=0 " > dynamic_printk/modules + + Enable all modules: + + $echo "set enabled=1 all" > dynamic_printk/modules + + Disable all modules: + + $echo "set enabled=0 all" > dynamic_printk/modules + + Finally, passing "dynamic_printk" at the command line enables + debugging for all modules. This mode can be turned off via the above + disable command. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/Makefile b/lib/Makefile index 44001af76a7..16feaab057b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -81,6 +81,8 @@ obj-$(CONFIG_HAVE_LMB) += lmb.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o +obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/dynamic_printk.c b/lib/dynamic_printk.c new file mode 100644 index 00000000000..d640f87bdc9 --- /dev/null +++ b/lib/dynamic_printk.c @@ -0,0 +1,418 @@ +/* + * lib/dynamic_printk.c + * + * make pr_debug()/dev_dbg() calls runtime configurable based upon their + * their source module. + * + * Copyright (C) 2008 Red Hat, Inc., Jason Baron + */ + +#include +#include +#include +#include +#include +#include + +extern struct mod_debug __start___verbose[]; +extern struct mod_debug __stop___verbose[]; + +struct debug_name { + struct hlist_node hlist; + struct hlist_node hlist2; + int hash1; + int hash2; + char *name; + int enable; + int type; +}; + +static int nr_entries; +static int num_enabled; +int dynamic_enabled = DYNAMIC_ENABLED_NONE; +static struct hlist_head module_table[DEBUG_HASH_TABLE_SIZE] = + { [0 ... DEBUG_HASH_TABLE_SIZE-1] = HLIST_HEAD_INIT }; +static struct hlist_head module_table2[DEBUG_HASH_TABLE_SIZE] = + { [0 ... DEBUG_HASH_TABLE_SIZE-1] = HLIST_HEAD_INIT }; +static DECLARE_MUTEX(debug_list_mutex); + +/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which + * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They + * use independent hash functions, to reduce the chance of false positives. + */ +long long dynamic_printk_enabled; +EXPORT_SYMBOL_GPL(dynamic_printk_enabled); +long long dynamic_printk_enabled2; +EXPORT_SYMBOL_GPL(dynamic_printk_enabled2); + +/* returns the debug module pointer. */ +static struct debug_name *find_debug_module(char *module_name) +{ + int i; + struct hlist_head *head; + struct hlist_node *node; + struct debug_name *element; + + element = NULL; + for (i = 0; i < DEBUG_HASH_TABLE_SIZE; i++) { + head = &module_table[i]; + hlist_for_each_entry_rcu(element, node, head, hlist) + if (!strcmp(element->name, module_name)) + return element; + } + return NULL; +} + +/* returns the debug module pointer. */ +static struct debug_name *find_debug_module_hash(char *module_name, int hash) +{ + struct hlist_head *head; + struct hlist_node *node; + struct debug_name *element; + + element = NULL; + head = &module_table[hash]; + hlist_for_each_entry_rcu(element, node, head, hlist) + if (!strcmp(element->name, module_name)) + return element; + return NULL; +} + +/* caller must hold mutex*/ +static int __add_debug_module(char *mod_name, int hash, int hash2) +{ + struct debug_name *new; + char *module_name; + int ret = 0; + + if (find_debug_module(mod_name)) { + ret = -EINVAL; + goto out; + } + module_name = kmalloc(strlen(mod_name) + 1, GFP_KERNEL); + if (!module_name) { + ret = -ENOMEM; + goto out; + } + module_name = strcpy(module_name, mod_name); + module_name[strlen(mod_name)] = '\0'; + new = kzalloc(sizeof(struct debug_name), GFP_KERNEL); + if (!new) { + kfree(module_name); + ret = -ENOMEM; + goto out; + } + INIT_HLIST_NODE(&new->hlist); + INIT_HLIST_NODE(&new->hlist2); + new->name = module_name; + new->hash1 = hash; + new->hash2 = hash2; + hlist_add_head_rcu(&new->hlist, &module_table[hash]); + hlist_add_head_rcu(&new->hlist2, &module_table2[hash2]); + nr_entries++; +out: + return ret; +} + +int unregister_dynamic_debug_module(char *mod_name) +{ + struct debug_name *element; + int ret = 0; + + down(&debug_list_mutex); + element = find_debug_module(mod_name); + if (!element) { + ret = -EINVAL; + goto out; + } + hlist_del_rcu(&element->hlist); + hlist_del_rcu(&element->hlist2); + synchronize_rcu(); + kfree(element->name); + if (element->enable) + num_enabled--; + kfree(element); + nr_entries--; +out: + up(&debug_list_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(unregister_dynamic_debug_module); + +int register_dynamic_debug_module(char *mod_name, int type, char *share_name, + char *flags, int hash, int hash2) +{ + struct debug_name *elem; + int ret = 0; + + down(&debug_list_mutex); + elem = find_debug_module(mod_name); + if (!elem) { + if (__add_debug_module(mod_name, hash, hash2)) + goto out; + elem = find_debug_module(mod_name); + if (dynamic_enabled == DYNAMIC_ENABLED_ALL && + !strcmp(mod_name, share_name)) { + elem->enable = true; + num_enabled++; + } + } + elem->type |= type; +out: + up(&debug_list_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(register_dynamic_debug_module); + +int __dynamic_dbg_enabled_helper(char *mod_name, int type, int value, int hash) +{ + struct debug_name *elem; + int ret = 0; + + if (dynamic_enabled == DYNAMIC_ENABLED_ALL) + return 1; + rcu_read_lock(); + elem = find_debug_module_hash(mod_name, hash); + if (elem && elem->enable) + ret = 1; + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(__dynamic_dbg_enabled_helper); + +static void set_all(bool enable) +{ + struct debug_name *e; + struct hlist_node *node; + int i; + long long enable_mask; + + for (i = 0; i < DEBUG_HASH_TABLE_SIZE; i++) { + if (module_table[i].first != NULL) { + hlist_for_each_entry(e, node, &module_table[i], hlist) { + e->enable = enable; + } + } + } + if (enable) + enable_mask = ULLONG_MAX; + else + enable_mask = 0; + dynamic_printk_enabled = enable_mask; + dynamic_printk_enabled2 = enable_mask; +} + +static int disabled_hash(int i, bool first_table) +{ + struct debug_name *e; + struct hlist_node *node; + + if (first_table) { + hlist_for_each_entry(e, node, &module_table[i], hlist) { + if (e->enable) + return 0; + } + } else { + hlist_for_each_entry(e, node, &module_table2[i], hlist2) { + if (e->enable) + return 0; + } + } + return 1; +} + +static ssize_t pr_debug_write(struct file *file, const char __user *buf, + size_t length, loff_t *ppos) +{ + char *buffer, *s, *value_str, *setting_str; + int err, value; + struct debug_name *elem = NULL; + int all = 0; + + if (length > PAGE_SIZE || length < 0) + return -EINVAL; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + err = -EFAULT; + if (copy_from_user(buffer, buf, length)) + goto out; + + err = -EINVAL; + if (length < PAGE_SIZE) + buffer[length] = '\0'; + else if (buffer[PAGE_SIZE-1]) + goto out; + + err = -EINVAL; + down(&debug_list_mutex); + + if (strncmp("set", buffer, 3)) + goto out_up; + s = buffer + 3; + setting_str = strsep(&s, "="); + if (s == NULL) + goto out_up; + setting_str = strstrip(setting_str); + value_str = strsep(&s, " "); + if (s == NULL) + goto out_up; + s = strstrip(s); + if (!strncmp(s, "all", 3)) + all = 1; + else + elem = find_debug_module(s); + if (!strncmp(setting_str, "enable", 6)) { + value = !!simple_strtol(value_str, NULL, 10); + if (all) { + if (value) { + set_all(true); + num_enabled = nr_entries; + dynamic_enabled = DYNAMIC_ENABLED_ALL; + } else { + set_all(false); + num_enabled = 0; + dynamic_enabled = DYNAMIC_ENABLED_NONE; + } + err = 0; + } else { + if (elem) { + if (value && (elem->enable == 0)) { + dynamic_printk_enabled |= + (1LL << elem->hash1); + dynamic_printk_enabled2 |= + (1LL << elem->hash2); + elem->enable = 1; + num_enabled++; + dynamic_enabled = DYNAMIC_ENABLED_SOME; + err = 0; + printk(KERN_DEBUG + "debugging enabled for module %s", + elem->name); + } else if (!value && (elem->enable == 1)) { + elem->enable = 0; + num_enabled--; + if (disabled_hash(elem->hash1, true)) + dynamic_printk_enabled &= + ~(1LL << elem->hash1); + if (disabled_hash(elem->hash2, false)) + dynamic_printk_enabled2 &= + ~(1LL << elem->hash2); + if (num_enabled) + dynamic_enabled = + DYNAMIC_ENABLED_SOME; + else + dynamic_enabled = + DYNAMIC_ENABLED_NONE; + err = 0; + printk(KERN_DEBUG + "debugging disabled for module " + "%s", elem->name); + } + } + } + } + if (!err) + err = length; +out_up: + up(&debug_list_mutex); +out: + free_page((unsigned long)buffer); + return err; +} + +static void *pr_debug_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos < DEBUG_HASH_TABLE_SIZE) ? pos : NULL; +} + +static void *pr_debug_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= DEBUG_HASH_TABLE_SIZE) + return NULL; + return pos; +} + +static void pr_debug_seq_stop(struct seq_file *s, void *v) +{ + /* Nothing to do */ +} + +static int pr_debug_seq_show(struct seq_file *s, void *v) +{ + struct hlist_head *head; + struct hlist_node *node; + struct debug_name *elem; + unsigned int i = *(loff_t *) v; + + rcu_read_lock(); + head = &module_table[i]; + hlist_for_each_entry_rcu(elem, node, head, hlist) { + seq_printf(s, "%s enabled=%d", elem->name, elem->enable); + seq_printf(s, "\n"); + } + rcu_read_unlock(); + return 0; +} + +static struct seq_operations pr_debug_seq_ops = { + .start = pr_debug_seq_start, + .next = pr_debug_seq_next, + .stop = pr_debug_seq_stop, + .show = pr_debug_seq_show +}; + +static int pr_debug_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &pr_debug_seq_ops); +} + +static const struct file_operations pr_debug_operations = { + .open = pr_debug_open, + .read = seq_read, + .write = pr_debug_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init dynamic_printk_init(void) +{ + struct dentry *dir, *file; + struct mod_debug *iter; + unsigned long value; + + dir = debugfs_create_dir("dynamic_printk", NULL); + if (!dir) + return -ENOMEM; + file = debugfs_create_file("modules", 0644, dir, NULL, + &pr_debug_operations); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + for (value = (unsigned long)__start___verbose; + value < (unsigned long)__stop___verbose; + value += sizeof(struct mod_debug)) { + iter = (struct mod_debug *)value; + register_dynamic_debug_module(iter->modname, + iter->type, + iter->logical_modname, + iter->flag_names, iter->hash, iter->hash2); + } + return 0; +} +module_init(dynamic_printk_init); +/* may want to move this earlier so we can get traces as early as possible */ + +static int __init dynamic_printk_setup(char *str) +{ + if (str) + return -ENOENT; + set_all(true); + return 0; +} +/* Use early_param(), so we can get debug output as early as possible */ +early_param("dynamic_printk", dynamic_printk_setup); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 373e51e91ce..1bc3001d182 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -65,7 +65,7 @@ void struct nf_conntrack_expect *exp) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); -#ifdef DEBUG +#if defined(DEBUG) || defined(CONFIG_DYNAMIC_PRINTK_DEBUG) /* PptpControlMessageType names */ const char *const pptp_msg_name[] = { "UNKNOWN_MESSAGE", diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index ea48b82a370..b4ca38a2115 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -96,6 +96,14 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))" modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") +#hash values +ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\ + -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))" +else +debug_flags = +endif + orig_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) _c_flags = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags)) _a_flags = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o) @@ -121,7 +129,8 @@ endif c_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(KBUILD_CPPFLAGS) \ $(__c_flags) $(modkern_cflags) \ - -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags) + -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags) \ + $(debug_flags) a_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(KBUILD_CPPFLAGS) \ $(__a_flags) $(modkern_aflags) diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 4c324a1f1e0..09559951df1 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,7 +9,7 @@ # fixdep: Used to generate dependency information during build process # docproc: Used in Documentation/DocBook -hostprogs-y := fixdep docproc +hostprogs-y := fixdep docproc hash always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/basic/hash.c b/scripts/basic/hash.c new file mode 100644 index 00000000000..3299ad7fc8c --- /dev/null +++ b/scripts/basic/hash.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Jason Baron + * + */ + +#include +#include +#include + +#define DYNAMIC_DEBUG_HASH_BITS 6 + +static const char *program; + +static void usage(void) +{ + printf("Usage: %s \n", program); + exit(1); +} + +/* djb2 hashing algorithm by Dan Bernstein. From: + * http://www.cse.yorku.ca/~oz/hash.html + */ + +unsigned int djb2_hash(char *str) +{ + unsigned long hash = 5381; + int c; + + c = *str; + while (c) { + hash = ((hash << 5) + hash) + c; + c = *++str; + } + return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1)); +} + +unsigned int r5_hash(char *str) +{ + unsigned long hash = 0; + int c; + + c = *str; + while (c) { + hash = (hash + (c << 4) + (c >> 4)) * 11; + c = *++str; + } + return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1)); +} + +int main(int argc, char *argv[]) +{ + program = argv[0]; + + if (argc != 3) + usage(); + if (!strcmp(argv[1], "djb2")) + printf("%d\n", djb2_hash(argv[2])); + else if (!strcmp(argv[1], "r5")) + printf("%d\n", r5_hash(argv[2])); + else + usage(); + exit(0); +} + -- cgit v1.2.3-70-g09d2 From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:01:41 -0700 Subject: Make the taint flags reliable It's somewhat unlikely that it happens, but right now a race window between interrupts or machine checks or oopses could corrupt the tainted bitmap because it is modified in a non atomic fashion. Convert the taint variable to an unsigned long and use only atomic bit operations on it. Unfortunately this means the intvec sysctl functions cannot be used on it anymore. It turned out the taint sysctl handler could actually be simplified a bit (since it only increases capabilities) so this patch actually removes code. [akpm@linux-foundation.org: remove unneeded include] Signed-off-by: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 14 +++++----- include/linux/kernel.h | 25 +++++++++--------- kernel/module.c | 12 ++++----- kernel/panic.c | 63 ++++++++++++++++++++++++++++++++------------ kernel/softlockup.c | 2 +- kernel/sysctl.c | 67 ++++++++++++++++++++--------------------------- 6 files changed, 101 insertions(+), 82 deletions(-) (limited to 'kernel/module.c') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c3aca7cb34..7ed9e070a6e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -282,6 +282,8 @@ static void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static int __cpuinitdata unsafe_smp; + /* * Activate a secondary processor. */ @@ -397,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) goto valid_k7; /* If we get here, not a certified SMP capable AMD system. */ - add_taint(TAINT_UNSAFE_SMP); + unsafe_smp = 1; } valid_k7: @@ -414,12 +416,10 @@ static void __cpuinit smp_checks(void) * Don't taint if we are running SMP kernel on a single non-MP * approved Athlon */ - if (tainted & TAINT_UNSAFE_SMP) { - if (num_online_cpus()) - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - else - tainted &= ~TAINT_UNSAFE_SMP; + if (unsafe_smp && num_online_cpus() > 1) { + printk(KERN_INFO "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + add_taint(TAINT_UNSAFE_SMP); } } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 75d81f157d2..e971c55f45a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -235,9 +235,10 @@ extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; -extern int tainted; extern const char *print_tainted(void); -extern void add_taint(unsigned); +extern void add_taint(unsigned flag); +extern int test_taint(unsigned flag); +extern unsigned long get_taint(void); extern int root_mountflags; /* Values used for system_state */ @@ -250,16 +251,16 @@ extern enum system_states { SYSTEM_SUSPEND_DISK, } system_state; -#define TAINT_PROPRIETARY_MODULE (1<<0) -#define TAINT_FORCED_MODULE (1<<1) -#define TAINT_UNSAFE_SMP (1<<2) -#define TAINT_FORCED_RMMOD (1<<3) -#define TAINT_MACHINE_CHECK (1<<4) -#define TAINT_BAD_PAGE (1<<5) -#define TAINT_USER (1<<6) -#define TAINT_DIE (1<<7) -#define TAINT_OVERRIDDEN_ACPI_TABLE (1<<8) -#define TAINT_WARN (1<<9) +#define TAINT_PROPRIETARY_MODULE 0 +#define TAINT_FORCED_MODULE 1 +#define TAINT_UNSAFE_SMP 2 +#define TAINT_FORCED_RMMOD 3 +#define TAINT_MACHINE_CHECK 4 +#define TAINT_BAD_PAGE 5 +#define TAINT_USER 6 +#define TAINT_DIE 7 +#define TAINT_OVERRIDDEN_ACPI_TABLE 8 +#define TAINT_WARN 9 extern void dump_stack(void) __cold; diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04..dd9ac6ad5cb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -923,7 +923,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1033,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1634,7 +1634,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -2552,9 +2552,9 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; /* * TAINT_FORCED_RMMOD: could be added. diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89..028013f7afd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -159,31 +159,60 @@ EXPORT_SYMBOL(panic); * The string is overwritten by the next call to print_taint(). */ +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, +}; + const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee93a8..3953e4aed73 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cfc5295f1e8..ec88fcc9a0d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -149,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -379,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -2228,49 +2227,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { -- cgit v1.2.3-70-g09d2 From 5e458cc0f4770eea45d3c07110f01b3a94c72aa5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:13 -0500 Subject: module: simplify load_module. Linus' recent catch of stack overflow in load_module lead me to look at the code. A couple of helpers to get a section address and get objects from a section can help clean things up a little. (And in case you're wondering, the stack size also dropped from 328 to 284 bytes). Signed-off-by: Rusty Russell --- include/linux/module.h | 2 +- kernel/module.c | 235 +++++++++++++++++++++---------------------------- 2 files changed, 100 insertions(+), 137 deletions(-) (limited to 'kernel/module.c') diff --git a/include/linux/module.h b/include/linux/module.h index 5d2970cdce9..eddf27db442 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -277,7 +277,7 @@ struct module /* Exception table */ unsigned int num_exentries; - const struct exception_table_entry *extable; + struct exception_table_entry *extable; /* Startup function. */ int (*init)(void); diff --git a/kernel/module.c b/kernel/module.c index 0d8d21ee792..3d256681ab6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -132,6 +132,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr, return 0; } +/* Find a module section, or NULL. */ +static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, + const char *secstrings, const char *name) +{ + /* Section 0 has sh_addr 0. */ + return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; +} + +/* Find a module section, or NULL. Fill in number of "objects" in section. */ +static void *section_objs(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = sechdrs[sec].sh_size / object_size; + return (void *)sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -1789,32 +1812,20 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG -static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) { - struct mod_debug *debug_info; - unsigned long pos, end; - unsigned int num_verbose; - - pos = sechdrs[verboseindex].sh_addr; - num_verbose = sechdrs[verboseindex].sh_size / - sizeof(struct mod_debug); - end = pos + (num_verbose * sizeof(struct mod_debug)); +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG + unsigned int i; - for (; pos < end; pos += sizeof(struct mod_debug)) { - debug_info = (struct mod_debug *)pos; - register_dynamic_debug_module(debug_info->modname, - debug_info->type, debug_info->logical_modname, - debug_info->flag_names, debug_info->hash, - debug_info->hash2); + for (i = 0; i < num; i++) { + register_dynamic_debug_module(debug[i].modname, + debug[i].type, + debug[i].logical_modname, + debug[i].flag_names, + debug[i].hash, debug[i].hash2); } -} -#else -static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, - unsigned int verboseindex) -{ -} #endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +} static void *module_alloc_update_bounds(unsigned long size) { @@ -1843,37 +1854,14 @@ static noinline struct module *load_module(void __user *umod, unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; - unsigned int setupindex; - unsigned int exindex; - unsigned int exportindex; - unsigned int modindex; - unsigned int obsparmindex; - unsigned int infoindex; - unsigned int gplindex; - unsigned int crcindex; - unsigned int gplcrcindex; - unsigned int versindex; - unsigned int pcpuindex; - unsigned int gplfutureindex; - unsigned int gplfuturecrcindex; + unsigned int modindex, versindex, infoindex, pcpuindex; unsigned int unwindex = 0; -#ifdef CONFIG_UNUSED_SYMBOLS - unsigned int unusedindex; - unsigned int unusedcrcindex; - unsigned int unusedgplindex; - unsigned int unusedgplcrcindex; -#endif - unsigned int markersindex; - unsigned int markersstringsindex; - unsigned int verboseindex; - unsigned int tracepointsindex; - unsigned int tracepointsstringsindex; - unsigned int mcountindex; + unsigned int num_kp, num_mcount; + struct kernel_param *kp; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - void *mseg; - struct exception_table_entry *extable; + unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -1937,6 +1925,7 @@ static noinline struct module *load_module(void __user *umod, err = -ENOEXEC; goto free_hdr; } + /* This is temporary: point mod into copy of data. */ mod = (void *)sechdrs[modindex].sh_addr; if (symindex == 0) { @@ -1946,22 +1935,6 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } - /* Optional sections */ - exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); - gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); - gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); - gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); -#ifdef CONFIG_UNUSED_SYMBOLS - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); - unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); - unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); -#endif - setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); - exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); - obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); @@ -2117,42 +2090,57 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto cleanup; - /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ - mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); - mod->syms = (void *)sechdrs[exportindex].sh_addr; - if (crcindex) - mod->crcs = (void *)sechdrs[crcindex].sh_addr; - mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); - mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; - if (gplcrcindex) - mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; - mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / - sizeof(*mod->gpl_future_syms); - mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; - if (gplfuturecrcindex) - mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; + /* Now we've got everything in the final locations, we can + * find optional sections. */ + kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), + &num_kp); + mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); + mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_gpl_future"); #ifdef CONFIG_UNUSED_SYMBOLS - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); - mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; - if (unusedcrcindex) - mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; - mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; - if (unusedgplcrcindex) - mod->unused_gpl_crcs - = (void *)sechdrs[unusedgplcrcindex].sh_addr; + mod->unused_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_unused_gpl"); +#endif + +#ifdef CONFIG_MARKERS + mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", + sizeof(*mod->markers), &mod->num_markers); +#endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(hdr, sechdrs, secstrings, + "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); #endif #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) - || (mod->num_gpl_syms && !gplcrcindex) - || (mod->num_gpl_future_syms && !gplfuturecrcindex) + if ((mod->num_syms && !mod->crcs) + || (mod->num_gpl_syms && !mod->gpl_crcs) + || (mod->num_gpl_future_syms && !mod->gpl_future_crcs) #ifdef CONFIG_UNUSED_SYMBOLS - || (mod->num_unused_syms && !unusedcrcindex) - || (mod->num_unused_gpl_syms && !unusedgplcrcindex) + || (mod->num_unused_syms && !mod->unused_crcs) + || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); @@ -2161,16 +2149,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } #endif - markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); - markersstringsindex = find_sec(hdr, sechdrs, secstrings, - "__markers_strings"); - verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); - tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); - tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, - "__tracepoints_strings"); - - mcountindex = find_sec(hdr, sechdrs, secstrings, - "__mcount_loc"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2193,28 +2171,16 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto cleanup; } -#ifdef CONFIG_MARKERS - mod->markers = (void *)sechdrs[markersindex].sh_addr; - mod->num_markers = - sechdrs[markersindex].sh_size / sizeof(*mod->markers); -#endif -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; - mod->num_tracepoints = - sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); -#endif - /* Find duplicate symbols */ err = verify_export_symbols(mod); - if (err < 0) goto cleanup; /* Set up and sort exception table */ - mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); - mod->extable = extable = (void *)sechdrs[exindex].sh_addr; - sort_extable(extable, extable + mod->num_exentries); + mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + sort_extable(mod->extable, mod->extable + mod->num_exentries); /* Finally, copy percpu area over. */ percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, @@ -2223,11 +2189,17 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); if (!mod->taints) { + struct mod_debug *debug; + unsigned int num_debug; + #ifdef CONFIG_MARKERS marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif - dynamic_printk_setup(sechdrs, verboseindex); + debug = section_objs(hdr, sechdrs, secstrings, "__verbose", + sizeof(*debug), &num_debug); + dynamic_printk_setup(debug, num_debug); + #ifdef CONFIG_TRACEPOINTS tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); @@ -2235,8 +2207,9 @@ static noinline struct module *load_module(void __user *umod, } /* sechdrs[0].sh_size is always zero */ - mseg = (void *)sechdrs[mcountindex].sh_addr; - ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", + sizeof(*mseg), &num_mcount); + ftrace_init_module(mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2261,7 +2234,7 @@ static noinline struct module *load_module(void __user *umod, set_fs(old_fs); mod->args = args; - if (obsparmindex) + if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", mod->name); @@ -2270,21 +2243,11 @@ static noinline struct module *load_module(void __user *umod, * strong_try_module_get() will fail. */ stop_machine(__link_module, mod, NULL); - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->name, mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); + err = parse_args(mod->name, mod->args, kp, num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param)); + err = mod_sysfs_setup(mod, kp, num_kp); if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); -- cgit v1.2.3-70-g09d2 From d72b37513cdfbd3f53f3d485a8c403cc96d2c95f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Aug 2008 10:09:00 +0200 Subject: Remove stop_machine during module load v2 Remove stop_machine during module load v2 module loading currently does a stop_machine on each module load to insert the module into the global module lists. Especially on larger systems this can be quite expensive. It does that to handle concurrent lock lessmodule list readers like kallsyms. I don't think stop_machine() is actually needed to insert something into a list though. There are no concurrent writers because the module mutex is taken. And the RCU list functions know how to insert a node into a list with the right memory ordering so that concurrent readers don't go off into the wood. So remove the stop_machine for the module list insert and just do a list_add_rcu() instead. Module removal will still do a stop_machine of course, it needs that for other reasons. v2: Revised readers based on Paul's comments. All readers that only rely on disabled preemption need to be changed to list_for_each_rcu(). Done that. The others are ok because they have the modules mutex. Also added a possible missing preempt disable for print_modules(). [cc Paul McKenney for review. It's not RCU, but quite similar.] Acked-by: Paul E. McKenney Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel/module.c') diff --git a/kernel/module.c b/kernel/module.c index 3d256681ab6..c0f1826e2d9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) /* List of modules, protected by module_mutex or preempt_disable - * (add/delete uses stop_machine). */ + * (delete uses stop_machine/add uses RCU list operations). */ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); @@ -241,7 +242,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) return true; - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, NOT_GPL_ONLY, false }, @@ -1416,17 +1417,6 @@ static void mod_kobject_remove(struct module *mod) mod_sysfs_fini(mod); } -/* - * link the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __link_module(void *_mod) -{ - struct module *mod = _mod; - list_add(&mod->list, &modules); - return 0; -} - /* * unlink the module with the whole machine is stopped with interrupts off * - this defends against kallsyms not taking locks @@ -2239,9 +2229,13 @@ static noinline struct module *load_module(void __user *umod, mod->name); /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since - * strong_try_module_get() will fail. */ - stop_machine(__link_module, mod, NULL); + * info during argument parsing. Noone should access us, since + * strong_try_module_get() will fail. + * lockdep/oops can run asynchronous, so use the RCU list insertion + * function to insert in a way safe to concurrent readers. + * The mutex protects against concurrent writers. + */ + list_add_rcu(&mod->list, &modules); err = parse_args(mod->name, mod->args, kp, num_kp, NULL); if (err < 0) @@ -2436,7 +2430,7 @@ const char *module_address_lookup(unsigned long addr, const char *ret = NULL; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { if (modname) @@ -2459,7 +2453,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { const char *sym; @@ -2483,7 +2477,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { const char *sym; @@ -2510,7 +2504,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -2553,7 +2547,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) if ((ret = mod_find_symname(mod, name)) != 0) break; } @@ -2656,7 +2650,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (mod->num_exentries == 0) continue; @@ -2682,7 +2676,7 @@ int is_module_address(unsigned long addr) preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_core, mod->core_size)) { preempt_enable(); return 1; @@ -2703,7 +2697,7 @@ struct module *__module_text_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) if (within(addr, mod->module_init, mod->init_text_size) || within(addr, mod->module_core, mod->core_text_size)) return mod; @@ -2728,8 +2722,11 @@ void print_modules(void) char buf[8]; printk("Modules linked in:"); - list_for_each_entry(mod, &modules, list) + /* Most callers should already have preempt disabled, but make sure */ + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) printk(" %s%s", mod->name, module_flags(mod, buf)); + preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); printk("\n"); -- cgit v1.2.3-70-g09d2 From 3b5d5c6b0ccba733a313f8752ebc3f8015628ba3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Oct 2008 13:19:27 +0400 Subject: proc: move /proc/modules boilerplate to kernel/module.c Signed-off-by: Alexey Dobriyan --- fs/proc/proc_misc.c | 17 --------------- kernel/module.c | 59 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 35 deletions(-) (limited to 'kernel/module.c') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 7c22831efd9..f6d25db9892 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -57,20 +57,6 @@ #include #include "internal.h" -#ifdef CONFIG_MODULES -extern const struct seq_operations modules_op; -static int modules_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &modules_op); -} -static const struct file_operations proc_modules_operations = { - .open = modules_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - #ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) @@ -209,9 +195,6 @@ void __init proc_misc_init(void) proc_symlink("mounts", NULL, "self/mounts"); /* And now for trickier ones */ -#ifdef CONFIG_MODULES - proc_create("modules", 0, NULL, &proc_modules_operations); -#endif #ifdef CONFIG_SCHEDSTATS proc_create("schedstat", 0, NULL, &proc_schedstat_operations); #endif diff --git a/kernel/module.c b/kernel/module.c index 0d8d21ee792..6fded84d702 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -2599,23 +2601,6 @@ unsigned long module_kallsyms_lookup_name(const char *name) } #endif /* CONFIG_KALLSYMS */ -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -2649,6 +2634,24 @@ static char *module_flags(struct module *mod, char *buf) return buf; } +#ifdef CONFIG_PROC_FS +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); @@ -2679,13 +2682,33 @@ static int m_show(struct seq_file *m, void *p) Where refcount is a number or -, and deps is a comma-separated list of depends or -. */ -const struct seq_operations modules_op = { +static const struct seq_operations modules_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show }; +static int modules_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &modules_op); +} + +static const struct file_operations proc_modules_operations = { + .open = modules_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &proc_modules_operations); + return 0; +} +module_init(proc_modules_init); +#endif + /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { -- cgit v1.2.3-70-g09d2