author     Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 15:20:36 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 15:20:36 -0700
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d  /Documentation/DocBook/kernel-locking.tmpl
tags       Linux-2.6.12-rc2, v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'Documentation/DocBook/kernel-locking.tmpl')
-rw-r--r--  Documentation/DocBook/kernel-locking.tmpl | 2088
1 file changed, 2088 insertions(+), 0 deletions(-)
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
new file mode 100644
index 00000000000..90dc2de8e0a
--- /dev/null
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -0,0 +1,2088 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LKLockingGuide">
+ <bookinfo>
+ <title>Unreliable Guide To Locking</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Rusty</firstname>
+ <surname>Russell</surname>
+ <affiliation>
+ <address>
+ <email>rusty@rustcorp.com.au</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2003</year>
+ <holder>Rusty Russell</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+     Welcome to Rusty's Remarkably Unreliable Guide to Kernel
+ Locking issues. This document describes the locking systems in
+ the Linux Kernel in 2.6.
+ </para>
+ <para>
+     With the wide availability of HyperThreading and <firstterm
+     linkend="gloss-preemption">preemption</firstterm> in the Linux
+ Kernel, everyone hacking on the kernel needs to know the
+ fundamentals of concurrency and locking for
+ <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
+ </para>
+ </chapter>
+
+ <chapter id="races">
+ <title>The Problem With Concurrency</title>
+ <para>
+ (Skip this if you know what a Race Condition is).
+ </para>
+ <para>
+ In a normal program, you can increment a counter like so:
+ </para>
+ <programlisting>
+ very_important_count++;
+ </programlisting>
+
+ <para>
+     If two instances of this code run at the same time, this is what
+     one would expect to happen:
+ </para>
+
+ <table>
+ <title>Expected Results</title>
+
+ <tgroup cols="2" align="left">
+
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (6)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (7)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (7)</entry>
+ </row>
+ </tbody>
+
+ </tgroup>
+ </table>
+
+ <para>
+ This is what might happen:
+ </para>
+
+ <table>
+ <title>Possible Results</title>
+
+ <tgroup cols="2" align="left">
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (5)</entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (6)</entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (6)</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
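+
+   <para>
+     The increment looks like a single operation in C, but the compiler
+     typically turns it into a separate read, add, and write (and even
+     where it is a single instruction, the memory access is still a
+     read-modify-write). A rough sketch of the equivalent C, with a
+     <varname>tmp</varname> variable invented for illustration:
+   </para>
+
+   <programlisting>
+   int tmp;
+
+   tmp = very_important_count;  /* read: load from memory */
+   tmp = tmp + 1;               /* modify: add in a register */
+   very_important_count = tmp;  /* write: store back to memory */
+   </programlisting>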
+
+ <sect1 id="race-condition">
+ <title>Race Conditions and Critical Regions</title>
+ <para>
+ This overlap, where the result depends on the
+ relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
+ The piece of code containing the concurrency issue is called a
+      <firstterm>critical region</firstterm>. Especially since Linux started
+      running on SMP machines, race conditions have become one of the major
+      issues in kernel design and implementation.
+ </para>
+ <para>
+ Preemption can have the same effect, even if there is only one
+ CPU: by preempting one task during the critical region, we have
+ exactly the same race condition. In this case the thread which
+ preempts might run the critical region itself.
+ </para>
+ <para>
+ The solution is to recognize when these simultaneous accesses
+ occur, and use locks to make sure that only one instance can
+ enter the critical region at any time. There are many
+ friendly primitives in the Linux kernel to help you do this.
+ And then there are the unfriendly primitives, but I'll pretend
+ they don't exist.
+ </para>
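+    <para>
+      As a taste of what is to come, here is a minimal sketch of the
+      counter protected by a spinlock, using the 2.6 API (the lock
+      name is invented for this example):
+    </para>
+
+    <programlisting>
+    static spinlock_t very_important_lock = SPIN_LOCK_UNLOCKED;
+
+    spin_lock(&amp;very_important_lock);
+    very_important_count++;  /* only one holder can be here at a time */
+    spin_unlock(&amp;very_important_lock);
+    </programlisting>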
+ </sect1>
+ </chapter>
+
+ <chapter id="locks">
+ <title>Locking in the Linux Kernel</title>
+
+ <para>
+ If I could give you one piece of advice: never sleep with anyone
+ crazier than yourself. But if I had to give you advice on
+ locking: <emphasis>keep it simple</emphasis>.
+ </para>
+
+ <para>
+ Be reluctant to introduce new locks.
+ </para>
+
+ <para>
+ Strangely enough, this last one is the exact reverse of my advice when
+ you <emphasis>have</emphasis> slept with someone crazier than yourself.
+ And you should think about getting a big dog.
+ </para>
+
+ <sect1 id="lock-intro">
+ <title>Two Main Types of Kernel Locks: Spinlocks and Semaphores</title>
+
+ <para>
+ There are two main types of kernel locks. The fundamental type
+ is the spinlock
+ (<filename class="headerfile">include/asm/spinlock.h</filename>),
+ which is a very simple single-holder lock: if you can't get the
+ spinlock, you keep trying (spinning) until you can. Spinlocks are
+ very small and fast, and can be used anywhere.
+ </para>
+ <para>
+ The second type is a semaphore
+ (<filename class="headerfile">include/asm/semaphore.h</filename>): it
+ can have more than one holder at any time (the number decided at
+ initialization time), although it is most commonly used as a
+ single-holder lock (a mutex). If you can't get a semaphore,
+ your task will put itself on the queue, and be woken up when the
+ semaphore is released. This means the CPU will do something
+ else while you are waiting, but there are many cases when you
+ simply can't sleep (see <xref linkend="sleeping-things"/>), and so
+ have to use a spinlock instead.
+ </para>
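+    <para>
+      For orientation, here is a minimal sketch of declaring and using
+      each type (the names are invented for this example):
+    </para>
+
+    <programlisting>
+    static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;  /* spins: usable anywhere */
+    static DECLARE_MUTEX(my_sem);  /* semaphore initialized to 1: sleeps */
+
+    spin_lock(&amp;my_lock);
+    /* ... short critical region, must not sleep ... */
+    spin_unlock(&amp;my_lock);
+
+    down(&amp;my_sem);
+    /* ... critical region, may sleep ... */
+    up(&amp;my_sem);
+    </programlisting>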
+ <para>
+ Neither type of lock is recursive: see
+ <xref linkend="deadlock"/>.
+ </para>
+ </sect1>
+
+ <sect1 id="uniprocessor">
+ <title>Locks and Uniprocessor Kernels</title>
+
+ <para>
+ For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
+      without <symbol>CONFIG_PREEMPT</symbol>, spinlocks do not exist at
+ all. This is an excellent design decision: when no-one else can
+ run at the same time, there is no reason to have a lock.
+ </para>
+
+ <para>
+ If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
+ but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
+ simply disable preemption, which is sufficient to prevent any
+ races. For most purposes, we can think of preemption as
+ equivalent to SMP, and not worry about it separately.
+ </para>
+
+ <para>
+ You should always test your locking code with <symbol>CONFIG_SMP</symbol>
+ and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
+ will still catch some kinds of locking bugs.
+ </para>
+
+ <para>
+ Semaphores still exist, because they are required for
+ synchronization between <firstterm linkend="gloss-usercontext">user
+ contexts</firstterm>, as we will see below.
+ </para>
+ </sect1>
+
+ <sect1 id="usercontextlocking">
+ <title>Locking Only In User Context</title>
+
+ <para>
+ If you have a data structure which is only ever accessed from
+ user context, then you can use a simple semaphore
+      (<filename class="headerfile">include/asm/semaphore.h</filename>) to protect it. This
+ is the most trivial case: you initialize the semaphore to the number
+ of resources available (usually 1), and call
+ <function>down_interruptible()</function> to grab the semaphore, and
+ <function>up()</function> to release it. There is also a
+      <function>down()</function>, which should be avoided: unlike
+      <function>down_interruptible()</function>, it keeps sleeping even
+      if a signal is received.
+ </para>
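+
+    <para>
+      A minimal sketch of the usual pattern (the data and lock names
+      are invented for this example):
+    </para>
+
+    <programlisting>
+    static DECLARE_MUTEX(mydata_lock);
+    static int mydata;  /* protected by mydata_lock */
+
+    int set_mydata(int val)
+    {
+            if (down_interruptible(&amp;mydata_lock))
+                    return -ERESTARTSYS;  /* a signal interrupted the wait */
+            mydata = val;
+            up(&amp;mydata_lock);
+            return 0;
+    }
+    </programlisting>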
+
+ <para>
+ Example: <filename>linux/net/core/netfilter.c</filename> allows
+ registration of new <function>setsockopt()</function> and
+ <function>getsockopt()</function> calls, with
+ <function>nf_register_sockopt()</function>. Registration and
+ de-registration are only done on module load and unload (and boot
+ time, where there is no concurrency), and the list of registrations
+ is only consulted for an unknown <function>setsockopt()</function>
+ or <function>getsockopt()</function> system call. The
+ <varname>nf_sockopt_mutex</varname> is perfect to protect this,
+ especially since the setsockopt and getsockopt calls may well
+ sleep.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-bh">
+ <title>Locking Between User Context and Softirqs</title>
+
+ <para>
+ If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
+ data with user context, you have two problems. Firstly, the current
+ user context can be interrupted by a softirq, and secondly, the
+ critical region could be entered from another CPU. This is where
+ <function>spin_lock_bh()</function>
+ (<filename class="headerfile">include/linux/spinlock.h</filename>) is
+ used. It disables softirqs on that CPU, then grabs the lock.
+ <function>spin_unlock_bh()</function> does the reverse. (The
+ '_bh' suffix is a historical reference to "Bottom Halves", the
+ old name for software interrupts. It should really be
+      called 'spin_lock_softirq()' in a perfect world).
+ </para>
+
+ <para>
+ Note that you can also use <function>spin_lock_irq()</function>
+ or <function>spin_lock_irqsave()</function> here, which stop
+ hardware interrupts as well: see <xref linkend="hardirq-context"/>.
+ </para>
+
+ <para>
+      This works perfectly for
+      <firstterm linkend="gloss-up"><acronym>UP</acronym></firstterm>
+      as well: the spin lock vanishes, and this macro
+ simply becomes <function>local_bh_disable()</function>
+ (<filename class="headerfile">include/linux/interrupt.h</filename>), which
+ protects you from the softirq being run.
+ </para>
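+
+    <para>
+      Schematically, the two sides look like this (a sketch; the lock
+      and the queue are invented for this example):
+    </para>
+
+    <programlisting>
+    static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
+
+    /* User context: keep the softirq out on this CPU, and other
+       CPUs out via the lock. */
+    spin_lock_bh(&amp;queue_lock);
+    /* ... add work to the shared queue ... */
+    spin_unlock_bh(&amp;queue_lock);
+
+    /* Softirq side: a plain spin_lock() suffices, since user context
+       cannot interrupt a softirq on this CPU. */
+    spin_lock(&amp;queue_lock);
+    /* ... remove work from the shared queue ... */
+    spin_unlock(&amp;queue_lock);
+    </programlisting>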
+ </sect1>
+
+ <sect1 id="lock-user-tasklet">
+ <title>Locking Between User Context and Tasklets</title>
+
+ <para>
+ This is exactly the same as above, because <firstterm
+ linkend="gloss-tasklet">tasklets</firstterm> are actually run
+ from a softirq.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-timers">
+ <title>Locking Between User Context and Timers</title>
+
+ <para>
+ This, too, is exactly the same as above, because <firstterm
+ linkend="gloss-timers">timers</firstterm> are actually run from
+ a softirq. From a locking point of view, tasklets and timers
+ are identical.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-tasklets">
+ <title>Locking Between Tasklets/Timers</title>
+
+ <para>
+ Sometimes a tasklet or timer might want to share data with
+ another tasklet or timer.
+ </para>
+
+ <sect2 id="lock-tasklets-same">
+ <title>The Same Tasklet/Timer</title>
+ <para>
+ Since a tasklet is never run on two CPUs at once, you don't
+ need to worry about your tasklet being reentrant (running
+ twice at once), even on SMP.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-tasklets-different">
+ <title>Different Tasklets/Timers</title>
+ <para>
+ If another tasklet/timer wants
+        to share data with your tasklet or timer, you will both need to use
+ <function>spin_lock()</function> and
+ <function>spin_unlock()</function> calls.
+ <function>spin_lock_bh()</function> is
+ unnecessary here, as you are already in a tasklet, and
+        no other softirq (and hence no tasklet or timer) will be run on
+        the same CPU.
+ </para>
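+
+      <para>
+        For example, both tasklet functions might guard some shared
+        data like this (a sketch; the names are invented):
+      </para>
+
+      <programlisting>
+      static spinlock_t shared_lock = SPIN_LOCK_UNLOCKED;
+
+      static void tasklet_a_func(unsigned long data)
+      {
+              spin_lock(&amp;shared_lock);
+              /* ... touch the shared data; tasklet_b_func does the same ... */
+              spin_unlock(&amp;shared_lock);
+      }
+      </programlisting>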
+ </sect2>
+ </sect1>
+
+ <sect1 id="lock-softirqs">
+ <title>Locking Between Softirqs</title>
+
+ <para>
+ Often a softirq might
+ want to share data with itself or a tasklet/timer.
+ </para>
+
+ <sect2 id="lock-softirqs-same">
+ <title>The Same Softirq</title>
+
+ <para>
+ The same softirq can run on the other CPUs: you can use a
+ per-CPU array (see <xref linkend="per-cpu"/>) for better
+ performance. If you're going so far as to use a softirq,
+ you probably care about scalable performance enough
+ to justify the extra complexity.
+ </para>
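+
+      <para>
+        A sketch of the per-CPU approach, assuming the 2.6 per-cpu API
+        (the names are invented for this example):
+      </para>
+
+      <programlisting>
+      static DEFINE_PER_CPU(struct list_head, my_queue);  /* initialized at boot */
+
+      /* Inside the softirq: preemption is already off, so this CPU's
+         copy is ours alone, and no lock is needed. */
+      struct list_head *q = &amp;__get_cpu_var(my_queue);
+      </programlisting>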
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-softirqs-different">
+ <title>Different Softirqs</title>
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data, whether it
+ be a timer, tasklet, different softirq or the same or another
+ softirq: any of them could be running on a different CPU.
+ </para>
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="hardirq-context">
+ <title>Hard IRQ Context</title>
+
+ <para>
+ Hardware interrupts usually communicate with a
+ tasklet or softirq. Frequently this involves putting work in a
+ queue, which the softirq will take out.
+ </para>
+
+ <sect1 id="hardirq-softirq">
+ <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
+
+ <para>
+ If a hardware irq handler shares data with a softirq, you have
+ two concerns. Firstly, the softirq processing can be
+ interrupted by a hardware interrupt, and secondly, the
+ critical region could be entered by a hardware interrupt on
+ another CPU. This is where <function>spin_lock_irq()</function> is
+      used. It is defined to disable interrupts on that CPU, then grab
+ the lock. <function>spin_unlock_irq()</function> does the reverse.
+ </para>
+
+ <para>
+      The irq handler does not need to use
+ <function>spin_lock_irq()</function>, because the softirq cannot
+ run while the irq handler is running: it can use
+ <function>spin_lock()</function>, which is slightly faster. The
+ only exception would be if a different hardware irq handler uses
+ the same lock: <function>spin_lock_irq()</function> will stop
+ that from interrupting us.
+ </para>
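+
+    <para>
+      A sketch of the two sides (the names are invented; the handler
+      signature is the 2.6 one):
+    </para>
+
+    <programlisting>
+    static spinlock_t dev_lock = SPIN_LOCK_UNLOCKED;
+
+    /* Hard irq handler: the softirq cannot run while we do, so a
+       plain spin_lock() is enough. */
+    static irqreturn_t my_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
+    {
+            spin_lock(&amp;dev_lock);
+            /* ... queue work for the softirq ... */
+            spin_unlock(&amp;dev_lock);
+            return IRQ_HANDLED;
+    }
+
+    /* In the softirq: must keep the hard irq out. */
+    spin_lock_irq(&amp;dev_lock);
+    /* ... take the work out of the queue ... */
+    spin_unlock_irq(&amp;dev_lock);
+    </programlisting>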
+
+ <para>
+ This works perfectly for UP as well: the spin lock vanishes,
+ and this macro simply becomes <function>local_irq_disable()</function>
+ (<filename class="headerfile">include/asm/smp.h</filename>), which
+ protects you from the softirq/tasklet/BH being run.
+ </para>
+
+ <para>
+ <function>spin_lock_irqsave()</function>
+ (<filename>include/linux/spinlock.h</filename>) is a variant
+ which saves whether interrupts were on or off in a flags word,
+ which is passed to <function>spin_unlock_irqrestore()</function>. This
+      means that the same code can be used inside a hard irq handler (where
+ interrupts are already off) and in softirqs (where the irq
+ disabling is required).
+ </para>
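+
+    <para>
+      The pattern looks like this (<varname>flags</varname> must be a
+      local <type>unsigned long</type>; the lock name is invented):
+    </para>
+
+    <programlisting>
+    unsigned long flags;
+
+    spin_lock_irqsave(&amp;my_lock, flags);
+    /* ... critical region, safe to enter from any context ... */
+    spin_unlock_irqrestore(&amp;my_lock, flags);
+    </programlisting>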
+
+ <para>
+ Note that softirqs (and hence tasklets and timers) are run on
+ return from hardware interrupts, so
+ <function>spin_lock_irq()</function> also stops these. In that
+ sense, <function>spin_lock_irqsave()</function> is the most
+ general and powerful locking function.
+ </para>
+
+ </sect1>
+ <sect1 id="hardirq-hardirq">
+ <title>Locking Between Two Hard IRQ Handlers</title>
+ <para>
+ It is rare to have to share data between two IRQ handlers, but
+ if you do, <function>spin_lock_irqsave()</function> should be
+ used: it is architecture-specific whether all interrupts are
+ disabled inside irq handlers themselves.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="cheatsheet">
+ <title>Cheat Sheet For Locking</title>
+ <para>
+ Pete Zaitcev gives the following summary:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ If you are in a process context (any syscall) and want to
+	lock other processes out, use a semaphore. You can take a semaphore
+	and sleep (<function>copy_from_user()</function> or
+ <function>kmalloc(x,GFP_KERNEL)</function>).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Otherwise (== data can be touched in an interrupt), use
+ <function>spin_lock_irqsave()</function> and
+ <function>spin_unlock_irqrestore()</function>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+	Avoid holding a spinlock for more than 5 lines of code and
+	across any function call (except accessors like
+	<function>readb()</function>).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+  <sect1 id="minimum-lock-requirements">
+ <title>Table of Minimum Requirements</title>
+
+ <para> The following table lists the <emphasis>minimum</emphasis>
+ locking requirements between various contexts. In some cases,
+ the same context can only be running on one CPU at a time, so
+ no locking is required for that context (eg. a particular
+   thread can only run on one CPU at a time, but if it needs to
+   share data with another thread, locking is required).
+ </para>
+ <para>
+ Remember the advice above: you can always use
+ <function>spin_lock_irqsave()</function>, which is a superset
+ of all other spinlock primitives.
+ </para>
+ <table>
+<title>Table of Locking Requirements</title>
+<tgroup cols="11">
+<tbody>
+<row>
+<entry></entry>
+<entry>IRQ Handler A</entry>
+<entry>IRQ Handler B</entry>
+<entry>Softirq A</entry>
+<entry>Softirq B</entry>
+<entry>Tasklet A</entry>
+<entry>Tasklet B</entry>
+<entry>Timer A</entry>
+<entry>Timer B</entry>
+<entry>User Context A</entry>
+<entry>User Context B</entry>
+</row>
+
+<row>
+<entry>IRQ Handler A</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>IRQ Handler B</entry>
+<entry>spin_lock_irqsave</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Softirq A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+</row>
+
+<row>
+<entry>Softirq B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+</row>
+
+<row>
+<entry>Tasklet A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Tasklet B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>down_interruptible</entry>
+<entry>None</entry>
+</row>
+
+</tbody>
+</tgroup>
+</table>
+</sect1>
+</chapter>
+
+ <chapter id="Examples">
+ <title>Common Examples</title>
+ <para>
+Let's step through a simple example: a cache of number to name
+mappings. The cache keeps a count of how often each of the objects is
+used, and when it gets full, throws out the least used one.
+
+ </para>
+
+ <sect1 id="examples-usercontext">
+ <title>All In User Context</title>
+ <para>
+For our first example, we assume that all operations are in user
+context (ie. from system calls), so we can sleep. This means we can
+use a semaphore to protect the cache and all the objects within
+it. Here's the code:
+ </para>
+
+ <programlisting>
+#include &lt;linux/list.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/string.h&gt;
+#include &lt;asm/semaphore.h&gt;
+#include &lt;asm/errno.h&gt;
+
+struct object
+{
+ struct list_head list;
+ int id;
+ char name[32];
+ int popularity;
+};
+
+/* Protects the cache, cache_num, and the objects within it */
+static DECLARE_MUTEX(cache_lock);
+static LIST_HEAD(cache);
+static unsigned int cache_num = 0;
+#define MAX_CACHE_SIZE 10
+
+/* Must be holding cache_lock */
+static struct object *__cache_find(int id)
+{
+ struct object *i;
+
+ list_for_each_entry(i, &amp;cache, list)
+ if (i-&gt;id == id) {
+ i-&gt;popularity++;
+ return i;
+ }
+ return NULL;
+}
+
+/* Must be holding cache_lock */
+static void __cache_delete(struct object *obj)
+{
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
+ kfree(obj);
+ cache_num--;
+}
+
+/* Must be holding cache_lock */
+static void __cache_add(struct object *obj)
+{
+ list_add(&amp;obj-&gt;list, &amp;cache);
+	if (++cache_num &gt; MAX_CACHE_SIZE) {
+ struct object *i, *outcast = NULL;
+ list_for_each_entry(i, &amp;cache, list) {
+ if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
+ outcast = i;
+ }
+ __cache_delete(outcast);
+ }
+}
+
+int cache_add(int id, const char *name)
+{
+ struct object *obj;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+ down(&amp;cache_lock);
+ __cache_add(obj);
+ up(&amp;cache_lock);
+ return 0;
+}
+
+void cache_delete(int id)
+{
+ down(&amp;cache_lock);
+ __cache_delete(__cache_find(id));
+ up(&amp;cache_lock);
+}
+
+int cache_find(int id, char *name)
+{
+ struct object *obj;
+ int ret = -ENOENT;
+
+ down(&amp;cache_lock);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+ up(&amp;cache_lock);
+ return ret;
+}
+</programlisting>
+
+ <para>
+Note that we always make sure we have the cache_lock when we add,
+delete, or look up the cache: both the cache infrastructure itself and
+the contents of the objects are protected by the lock. In this case
+it's easy, since we copy the data for the user, and never let them
+access the objects directly.
+ </para>
+ <para>
+There is a slight (and common) optimization here: in
+<function>cache_add</function> we set up the fields of the object
+before grabbing the lock. This is safe, as no-one else can access it
+until we put it in cache.
+ </para>
+ </sect1>
+
+ <sect1 id="examples-interrupt">
+ <title>Accessing From Interrupt Context</title>
+ <para>
+Now consider the case where <function>cache_find</function> can be
+called from interrupt context: either a hardware interrupt or a
+softirq. An example would be a timer which deletes objects from the
+cache.
+ </para>
+ <para>
+The change is shown below, in standard patch format: lines
+prefixed with <symbol>-</symbol> are taken away, and lines
+prefixed with <symbol>+</symbol> are added.
+ </para>
+<programlisting>
+--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
++++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
+@@ -12,7 +12,7 @@
+ int popularity;
+ };
+
+-static DECLARE_MUTEX(cache_lock);
++static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+ static LIST_HEAD(cache);
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+@@ -55,6 +55,7 @@
+ int cache_add(int id, const char *name)
+ {
+ struct object *obj;
++ unsigned long flags;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+@@ -63,30 +64,33 @@
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+- down(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return 0;
+ }
+
+ void cache_delete(int id)
+ {
+- down(&amp;cache_lock);
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_delete(__cache_find(id));
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+ int cache_find(int id, char *name)
+ {
+ struct object *obj;
+ int ret = -ENOENT;
++ unsigned long flags;
+
+- down(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return ret;
+ }
+</programlisting>
+
+ <para>
+Note that <function>spin_lock_irqsave()</function> will turn off
+interrupts if they are on, and otherwise (if we are already in an
+interrupt handler) does nothing; hence these functions are safe to
+call from any context.
+ </para>
+ <para>
+Unfortunately, <function>cache_add</function> calls
+<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
+flag, which is only legal in user context. I have assumed that
+<function>cache_add</function> is still only called in user context,
+otherwise this should become a parameter to
+<function>cache_add</function>.
+ </para>
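+ <para>
+If <function>cache_add</function> did need to work from interrupt
+context, a sketch of that parameterization (not part of the example
+above) might look like:
+ </para>
+
+<programlisting>
+int cache_add(int id, const char *name, unsigned int gfp_flags)
+{
+        struct object *obj;
+
+        /* Callers pass GFP_KERNEL from user context,
+           GFP_ATOMIC from interrupt context. */
+        obj = kmalloc(sizeof(*obj), gfp_flags);
+        if (!obj)
+                return -ENOMEM;
+        /* ... the rest as in cache_add above ... */
+        return 0;
+}
+</programlisting>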
+ </sect1>
+ <sect1 id="examples-refcnt">
+ <title>Exposing Objects Outside This File</title>
+ <para>
+If our objects contained more information, it might not be sufficient
+to copy the information in and out: other parts of the code might want
+to keep pointers to these objects, for example, rather than looking up
+the id every time. This produces two problems.
+ </para>
+ <para>
+The first problem is that we use the <symbol>cache_lock</symbol> to
+protect objects: we'd need to make this non-static so the rest of the
+code can use it. This makes locking trickier, as it is no longer all
+in one place.
+ </para>
+ <para>
+The second problem is the lifetime problem: if another structure keeps
+a pointer to an object, it presumably expects that pointer to remain
+valid. Unfortunately, this is only guaranteed while you hold the
+lock, otherwise someone might call <function>cache_delete</function>
+and even worse, add another object, re-using the same address.
+ </para>
+ <para>
+As there is only one lock, you can't hold it forever: no-one else would
+get any work done.
+ </para>
+ <para>
+The solution to this problem is to use a reference count: everyone who
+has a pointer to the object increases it when they first get the
+object, and drops the reference count when they're finished with it.
+Whoever drops it to zero knows it is unused, and can actually delete it.
+ </para>
+ <para>
+Here is the code:
+ </para>
+
+<programlisting>
+--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
++++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
+@@ -7,6 +7,7 @@
+ struct object
+ {
+ struct list_head list;
++ unsigned int refcnt;
+ int id;
+ char name[32];
+ int popularity;
+@@ -17,6 +18,35 @@
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+
++static void __object_put(struct object *obj)
++{
++ if (--obj-&gt;refcnt == 0)
++ kfree(obj);
++}
++
++static void __object_get(struct object *obj)
++{
++ obj-&gt;refcnt++;
++}
++
++void object_put(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_put(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
++void object_get(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_get(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
+ /* Must be holding cache_lock */
+ static struct object *__cache_find(int id)
+ {
+@@ -35,6 +65,7 @@
+ {
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
++ __object_put(obj);
+ cache_num--;
+ }
+
+@@ -63,6 +94,7 @@
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
++ obj-&gt;refcnt = 1; /* The cache holds a reference */
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -79,18 +111,15 @@
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+-int cache_find(int id, char *name)
++struct object *cache_find(int id)
+ {
+ struct object *obj;
+- int ret = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+- if (obj) {
+- ret = 0;
+- strcpy(name, obj-&gt;name);
+- }
++ if (obj)
++ __object_get(obj);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+- r