aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kmemcheck.txt773
-rw-r--r--MAINTAINERS8
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/include/asm/dma-mapping.h7
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h9
-rw-r--r--arch/x86/include/asm/string_32.h8
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c23
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/stacktrace.c7
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c18
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c4
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c640
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pgtable.c12
-rw-r--r--crypto/xor.c7
-rw-r--r--drivers/ieee1394/csr1212.c2
-rw-r--r--drivers/ieee1394/nodemgr.c5
-rw-r--r--drivers/misc/c2port/core.c2
-rw-r--r--include/linux/c2port.h3
-rw-r--r--include/linux/fs.h5
-rw-r--r--include/linux/gfp.h14
-rw-r--r--include/linux/interrupt.h14
-rw-r--r--include/linux/kmemcheck.h153
-rw-r--r--include/linux/mm_types.h8
-rw-r--r--include/linux/ring_buffer.h4
-rw-r--r--include/linux/skbuff.h7
-rw-r--r--include/linux/slab.h7
-rw-r--r--include/linux/slab_def.h81
-rw-r--r--include/linux/stacktrace.h3
-rw-r--r--include/net/inet_sock.h14
-rw-r--r--include/net/inet_timewait_sock.h5
-rw-r--r--include/net/sock.h2
-rw-r--r--init/do_mounts.c3
-rw-r--r--init/main.c1
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/signal.c11
-rw-r--r--kernel/softirq.c11
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/trace/ring_buffer.c3
-rw-r--r--lib/Kconfig.debug6
-rw-r--r--lib/Kconfig.kmemcheck91
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/Makefile1
-rw-r--r--mm/kmemcheck.c122
-rw-r--r--mm/page_alloc.c18
-rw-r--r--mm/slab.c108
-rw-r--r--mm/slub.c38
-rw-r--r--net/core/skbuff.c8
-rw-r--r--net/core/sock.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c3
71 files changed, 2899 insertions, 128 deletions
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
new file mode 100644
index 00000000000..363044609da
--- /dev/null
+++ b/Documentation/kmemcheck.txt
@@ -0,0 +1,773 @@
+GETTING STARTED WITH KMEMCHECK
+==============================
+
+Vegard Nossum <vegardno@ifi.uio.no>
+
+
+Contents
+========
+0. Introduction
+1. Downloading
+2. Configuring and compiling
+3. How to use
+3.1. Booting
+3.2. Run-time enable/disable
+3.3. Debugging
+3.4. Annotating false positives
+4. Reporting errors
+5. Technical description
+
+
+0. Introduction
+===============
+
+kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
+is a dynamic checker that detects and warns about some uses of uninitialized
+memory.
+
+Userspace programmers might be familiar with Valgrind's memcheck. The main
+difference between memcheck and kmemcheck is that memcheck works for userspace
+programs only, and kmemcheck works for the kernel only. The implementations
+are of course vastly different. Because of this, kmemcheck is not as accurate
+as memcheck, but it turns out to be good enough in practice to discover real
+programmer errors that the compiler is not able to find through static
+analysis.
+
+Enabling kmemcheck on a kernel will probably slow it down to the extent that
+the machine will not be usable for normal workloads such as e.g. an
+interactive desktop. kmemcheck will also cause the kernel to use about twice
+as much memory as normal. For this reason, kmemcheck is strictly a debugging
+feature.
+
+
+1. Downloading
+==============
+
+kmemcheck can only be downloaded using git. If you want to write patches
+against the current code, you should use the kmemcheck development branch of
+the tip tree. It is also possible to use the linux-next tree, which also
+includes the latest version of kmemcheck.
+
+Assuming that you've already cloned the linux-2.6.git repository, all you
+have to do is add the -tip tree as a remote, like this:
+
+ $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
+
+To actually download the tree, fetch the remote:
+
+ $ git fetch tip
+
+And to check out a new local branch with the kmemcheck code:
+
+ $ git checkout -b kmemcheck tip/kmemcheck
+
+General instructions for the -tip tree can be found here:
+http://people.redhat.com/mingo/tip.git/readme.txt
+
+
+2. Configuring and compiling
+============================
+
+kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
+configuration variables must have specific settings in order for the kmemcheck
+menu to even appear in "menuconfig". These are:
+
+ o CONFIG_CC_OPTIMIZE_FOR_SIZE=n
+
+ This option is located under "General setup" / "Optimize for size".
+
+ Without this, gcc will use certain optimizations that usually lead to
+ false positive warnings from kmemcheck. An example of this is a 16-bit
+ field in a struct, where gcc may load 32 bits, then discard the upper
+ 16 bits. kmemcheck sees only the 32-bit load, and may trigger a
+ warning for the upper 16 bits (if they're uninitialized).
+
+ o CONFIG_SLAB=y or CONFIG_SLUB=y
+
+ This option is located under "General setup" / "Choose SLAB
+ allocator".
+
+ o CONFIG_FUNCTION_TRACER=n
+
+ This option is located under "Kernel hacking" / "Tracers" / "Kernel
+ Function Tracer"
+
+ When function tracing is compiled in, gcc emits a call to another
+ function at the beginning of every function. This means that when the
+ page fault handler is called, the ftrace framework will be called
+ before kmemcheck has had a chance to handle the fault. If ftrace then
+ modifies memory that was tracked by kmemcheck, the result is an
+ endless recursive page fault.
+
+ o CONFIG_DEBUG_PAGEALLOC=n
+
+ This option is located under "Kernel hacking" / "Debug page memory
+ allocations".
+
+In addition, I highly recommend turning on CONFIG_DEBUG_INFO=y. This is also
+located under "Kernel hacking". With this, you will be able to get line number
+information from the kmemcheck warnings, which is extremely valuable in
+debugging a problem. This option is not mandatory, however, because it slows
+down the compilation process and produces a much bigger kernel image.
+
+Now the kmemcheck menu should be visible (under "Kernel hacking" / "kmemcheck:
+trap use of uninitialized memory"). Here follows a description of the
+kmemcheck configuration variables:
+
+ o CONFIG_KMEMCHECK
+
+ This must be enabled in order to use kmemcheck at all...
+
+ o CONFIG_KMEMCHECK_[DISABLED | ENABLED | ONESHOT]_BY_DEFAULT
+
+ This option controls the status of kmemcheck at boot-time. "Enabled"
+ will enable kmemcheck right from the start, "disabled" will boot the
+ kernel as normal (but with the kmemcheck code compiled in, so it can
+ be enabled at run-time after the kernel has booted), and "one-shot" is
+ a special mode which will turn kmemcheck off automatically after
+ detecting the first use of uninitialized memory.
+
+ If you are using kmemcheck to actively debug a problem, then you
+ probably want to choose "enabled" here.
+
+ The one-shot mode is mostly useful in automated test setups because it
+ can prevent floods of warnings and increase the chances of the machine
+ surviving in case something is really wrong. In other cases, the one-
+ shot mode could actually be counter-productive because it would turn
+ itself off at the very first error -- in the case of a false positive
+ too -- and this would come in the way of debugging the specific
+ problem you were interested in.
+
+ If you would like to use your kernel as normal, but with a chance to
+ enable kmemcheck in case of some problem, it might be a good idea to
+ choose "disabled" here. When kmemcheck is disabled, most of the run-
+ time overhead is not incurred, and the kernel will be almost as fast
+ as normal.
+
+ o CONFIG_KMEMCHECK_QUEUE_SIZE
+
+ Select the maximum number of error reports to store in an internal
+ (fixed-size) buffer. Since errors can occur virtually anywhere and in
+ any context, we need a temporary storage area which is guaranteed not
+ to generate any other page faults when accessed. The queue will be
+ emptied as soon as a tasklet may be scheduled. If the queue is full,
+ new error reports will be lost.
+
+ The default value of 64 is probably fine. If some code produces more
+ than 64 errors within an irqs-off section, then the code is likely to
+ produce many, many more, too, and these additional reports seldom give
+ any more information (the first report is usually the most valuable
+ anyway).
+
+ This number might have to be adjusted if you are not using serial
+ console or similar to capture the kernel log. If you are using the
+ "dmesg" command to save the log, then getting a lot of kmemcheck
+ warnings might overflow the kernel log itself, and the earlier reports
+ will get lost in that way instead. Try setting this to 10 or so on
+ such a setup.
+
+ o CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT
+
+ Select the number of shadow bytes to save along with each entry of the
+ error-report queue. These bytes indicate what parts of an allocation
+ are initialized, uninitialized, etc. and will be displayed when an
+ error is detected to help the debugging of a particular problem.
+
+ The number entered here is actually the logarithm of the number of
+ bytes that will be saved. So if you pick for example 5 here, kmemcheck
+ will save 2^5 = 32 bytes.
+
+ The default value should be fine for debugging most problems. It also
+ fits nicely within 80 columns.
+
+ o CONFIG_KMEMCHECK_PARTIAL_OK
+
+ This option (when enabled) works around certain GCC optimizations that
+ produce 32-bit reads from 16-bit variables where the upper 16 bits are
+ thrown away afterwards.
+
+ The default value (enabled) is recommended. This may of course hide
+ some real errors, but disabling it would probably produce a lot of
+ false positives.
+
+ o CONFIG_KMEMCHECK_BITOPS_OK
+
+ This option silences warnings that would be generated for bit-field
+ accesses where not all the bits are initialized at the same time. This
+ may also hide some real bugs.
+
+ This option is probably obsolete, or it should be replaced with
+ the kmemcheck-/bitfield-annotations for the code in question. The
+ default value is therefore fine.
+
+Now compile the kernel as usual.
+
+
+3. How to use
+=============
+
+3.1. Booting
+============
+
+First some information about the command-line options. There is only one
+option specific to kmemcheck, and this is called "kmemcheck". It can be used
+to override the default mode as chosen by the CONFIG_KMEMCHECK_*_BY_DEFAULT
+option. Its possible settings are:
+
+ o kmemcheck=0 (disabled)
+ o kmemcheck=1 (enabled)
+ o kmemcheck=2 (one-shot mode)
+
+If SLUB debugging has been enabled in the kernel, it may take precedence over
+kmemcheck in such a way that the slab caches which are under SLUB debugging
+will not be tracked by kmemcheck. In order to ensure that this doesn't happen
+(even though it shouldn't by default), use SLUB's boot option "slub_debug",
+like this: slub_debug=-
+
+In fact, this option may also be used for fine-grained control over SLUB vs.
+kmemcheck. For example, if the command line includes "kmemcheck=1
+slub_debug=,dentry", then SLUB debugging will be used only for the "dentry"
+slab cache, and with kmemcheck tracking all the other caches. This is advanced
+usage, however, and is not generally recommended.
+
+
+3.2. Run-time enable/disable
+============================
+
+When the kernel has booted, it is possible to enable or disable kmemcheck at
+run-time. WARNING: This feature is still experimental and may cause false
+positive warnings to appear. Therefore, try not to use this. If you find that
+it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
+will be happy to take bug reports.
+
+Use the file /proc/sys/kernel/kmemcheck for this purpose, e.g.:
+
+ $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
+
+The numbers are the same as for the kmemcheck= command-line option.
+
+
+3.3. Debugging
+==============
+
+A typical report will look something like this:
+
+WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
+80000000000000000000000000000000000000000088ffff0000000000000000
+ i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
+ ^
+
+Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
+RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
+RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002
+RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
+RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
+RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
+R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
+R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
+FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
+CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
+CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
+ [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
+ [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
+ [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
+ [<ffffffff8100c7b5>] int_signal+0x12/0x17
+ [<ffffffffffffffff>] 0xffffffffffffffff
+
+The single most valuable information in this report is the RIP (or EIP on 32-
+bit) value. This will help us pinpoint exactly which instruction that caused
+the warning.
+
+If your kernel was compiled with CONFIG_DEBUG_INFO=y, then all we have to do
+is give this address to the addr2line program, like this:
+
+ $ addr2line -e vmlinux -i ffffffff8104ede8
+ arch/x86/include/asm/string_64.h:12
+ include/asm-generic/siginfo.h:287
+ kernel/signal.c:380
+ kernel/signal.c:410
+
+The "-e vmlinux" tells addr2line which file to look in. IMPORTANT: This must
+be the vmlinux of the kernel that produced the warning in the first place! If
+not, the line number information will almost certainly be wrong.
+
+The "-i" tells addr2line to also print the line numbers of inlined functions.
+In this case, the flag was very important, because otherwise, it would only
+have printed the first line, which is just a call to memcpy(), which could be
+called from a thousand places in the kernel, and is therefore not very useful.
+These inlined functions would not show up in the stack trace above, simply
+because the kernel doesn't load the extra debugging information. This
+technique can of course be used with ordinary kernel oopses as well.
+
+In this case, it's the caller of memcpy() that is interesting, and it can be
+found in include/asm-generic/siginfo.h, line 287:
+
+281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
+282 {
+283 if (from->si_code < 0)
+284 memcpy(to, from, sizeof(*to));
+285 else
+286 /* _sigchld is currently the largest know union member */
+287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
+288 }
+
+Since this was a read (kmemcheck usually warns about reads only, though it can
+warn about writes to unallocated or freed memory as well), it was probably the
+"from" argument which contained some uninitialized bytes. Following the chain
+of calls, we move upwards to see where "from" was allocated or initialized,
+kernel/signal.c, line 380:
+
+359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+360 {
+...
+367 list_for_each_entry(q, &list->list, list) {
+368 if (q->info.si_signo == sig) {
+369 if (first)
+370 goto still_pending;
+371 first = q;
+...
+377 if (first) {
+378 still_pending:
+379 list_del_init(&first->list);
+380 copy_siginfo(info, &first->info);
+381 __sigqueue_free(first);
+...
+392 }
+393 }
+
+Here, it is &first->info that is being passed on to copy_siginfo(). The
+variable "first" was found on a list -- passed in as the second argument to
+collect_signal(). We continue our journey through the stack, to figure out
+where the item on "list" was allocated or initialized. We move to line 410:
+
+395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
+396 siginfo_t *info)
+397 {
+...
+410 collect_signal(sig, pending, info);
+...
+414 }
+
+Now we need to follow the "pending" pointer, since that is being passed on to
+collect_signal() as "list". At this point, we've run out of lines from the
+"addr2line" output. Not to worry, we just paste the next addresses from the
+kmemcheck stack dump, i.e.:
+
+ [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
+ [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
+ [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
+ [<ffffffff8100c7b5>] int_signal+0x12/0x17
+
+ $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
+ ffffffff8100b87d ffffffff8100c7b5
+ kernel/signal.c:446
+ kernel/signal.c:1806
+ arch/x86/kernel/signal.c:805
+ arch/x86/kernel/signal.c:871
+ arch/x86/kernel/entry_64.S:694
+
+Remember that since these addresses were found on the stack and not as the
+RIP value, they actually point to the _next_ instruction (they are return
+addresses). This becomes obvious when we look at the code for line 446:
+
+422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+423 {
+...
+431 signr = __dequeue_signal(&tsk->signal->shared_pending,
+432 mask, info);
+433 /*
+434 * itimer signal ?
+435 *
+436 * itimers are process shared and we restart periodic
+437 * itimers in the signal delivery path to prevent DoS
+438 * attacks in the high resolution timer case. This is
+439 * compliant with the old way of self restarting
+440 * itimers, as the SIGALRM is a legacy signal and only
+441 * queued once. Changing the restart behaviour to
+442 * restart the timer in the signal dequeue path is
+443 * reducing the timer noise on heavy loaded !highres
+444 * systems too.
+445 */
+446 if (unlikely(signr == SIGALRM)) {
+...
+489 }
+
+So instead of looking at 446, we should be looking at 431, which is the line
+that executes just before 446. Here we see that what we are looking for is
+&tsk->signal->shared_pending.
+
+Our next task is now to figure out which function that puts items on this
+"shared_pending" list. A crude, but efficient tool, is git grep:
+
+ $ git grep -n 'shared_pending' kernel/
+ ...
+ kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending;
+ kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending;
+ ...
+
+There were more results, but none of them were related to list operations,
+and these were the only assignments. We inspect the line numbers more closely
+and find that this is indeed where items are being added to the list:
+
+816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+817 int group)
+818 {
+...
+828 pending = group ? &t->signal->shared_pending : &t->pending;
+...
+851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
+852 (is_si_special(info) ||
+853 info->si_code >= 0)));
+854 if (q) {
+855 list_add_tail(&q->list, &pending->list);
+...
+890 }
+
+and:
+
+1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+1310 {
+....
+1339 pending = group ? &t->signal->shared_pending : &t->pending;
+1340 list_add_tail(&q->list, &pending->list);
+....
+1347 }
+
+In the first case, the list element we are looking for, "q", is being returned
+from the function __sigqueue_alloc(), which looks like an allocation function.
+Let's take a look at it:
+
+187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
+188 int override_rlimit)
+189 {
+190 struct sigqueue *q = NULL;
+191 struct user_struct *user;
+192
+193 /*
+194 * We won't get problems with the target's UID changing under us
+195 * because changing it requires RCU be used, and if t != current, the
+196 * caller must be holding the RCU readlock (by way of a spinlock) and
+197 * we use RCU protection here
+198 */
+199 user = get_uid(__task_cred(t)->user);
+200 atomic_inc(&user->sigpending);
+201 if (override_rlimit ||
+202 atomic_read(&user->sigpending) <=
+203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
+204 q = kmem_cache_alloc(sigqueue_cachep, flags);
+205 if (unlikely(q == NULL)) {
+206 atomic_dec(&user->sigpending);
+207 free_uid(user);
+208 } else {
+209 INIT_LIST_HEAD(&q->list);
+210 q->flags = 0;
+211 q->user = user;
+212 }
+213
+214 return q;
+215 }
+
+We see that this function initializes q->list, q->flags, and q->user. It seems
+that now is the time to look at the definition of "struct sigqueue", e.g.:
+
+14 struct sigqueue {
+15 struct list_head list;
+16 int flags;
+17 siginfo_t info;
+18 struct user_struct *user;
+19 };
+
+And, you might remember, it was a memcpy() on &first->info that caused the
+warning, so this makes perfect sense. It also seems reasonable to assume that
+it is the caller of __sigqueue_alloc() that has the responsibility of filling
+out (initializing) this member.
+
+But just which fields of the struct were uninitialized? Let's look at
+kmemcheck's report again:
+
+WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
+80000000000000000000000000000000000000000088ffff0000000000000000
+ i i i i u u u u i i i