author     Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 15:20:36 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 15:20:36 -0700
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d  /Documentation/DocBook/kernel-locking.tmpl
tags       Linux-2.6.12-rc2, v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'Documentation/DocBook/kernel-locking.tmpl')
-rw-r--r--  Documentation/DocBook/kernel-locking.tmpl | 2088
1 file changed, 2088 insertions(+), 0 deletions(-)
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
new file mode 100644
index 00000000000..90dc2de8e0a
--- /dev/null
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -0,0 +1,2088 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LKLockingGuide">
+ <bookinfo>
+ <title>Unreliable Guide To Locking</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Rusty</firstname>
+ <surname>Russell</surname>
+ <affiliation>
+ <address>
+ <email>rusty@rustcorp.com.au</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2003</year>
+ <holder>Rusty Russell</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+     Welcome to Rusty's Remarkably Unreliable Guide to Kernel
+ Locking issues. This document describes the locking systems in
+ the Linux Kernel in 2.6.
+ </para>
+ <para>
+     With the wide availability of HyperThreading and <firstterm
+     linkend="gloss-preemption">preemption</firstterm> in the Linux
+ Kernel, everyone hacking on the kernel needs to know the
+ fundamentals of concurrency and locking for
+ <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
+ </para>
+ </chapter>
+
+ <chapter id="races">
+ <title>The Problem With Concurrency</title>
+ <para>
+ (Skip this if you know what a Race Condition is).
+ </para>
+ <para>
+ In a normal program, you can increment a counter like so:
+ </para>
+ <programlisting>
+ very_important_count++;
+ </programlisting>
+
+ <para>
+     If two instances of this code run at the same time, this is what
+     one would expect to happen:
+ </para>
+
+ <table>
+ <title>Expected Results</title>
+
+ <tgroup cols="2" align="left">
+
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (6)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (7)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (7)</entry>
+ </row>
+ </tbody>
+
+ </tgroup>
+ </table>
+
+ <para>
+ This is what might happen:
+ </para>
+
+ <table>
+ <title>Possible Results</title>
+
+ <tgroup cols="2" align="left">
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (5)</entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (6)</entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (6)</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
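+
+   <para>
+     The increment looks like a single operation in C, but the compiler
+     typically turns it into a separate read, add, and write (and even
+     where it is a single instruction, the memory access is still a
+     read-modify-write). A rough sketch of the equivalent C, with a
+     <varname>tmp</varname> variable invented for illustration:
+   </para>
+
+   <programlisting>
+   int tmp;
+
+   tmp = very_important_count;  /* read: load from memory */
+   tmp = tmp + 1;               /* modify: add in a register */
+   very_important_count = tmp;  /* write: store back to memory */
+   </programlisting>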
+
+ <sect1 id="race-condition">
+ <title>Race Conditions and Critical Regions</title>
+ <para>
+ This overlap, where the result depends on the
+ relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
+ The piece of code containing the concurrency issue is called a
+      <firstterm>critical region</firstterm>. Especially since Linux started
+      running on SMP machines, race conditions have become one of the major
+      issues in kernel design and implementation.
+ </para>
+ <para>
+ Preemption can have the same effect, even if there is only one
+ CPU: by preempting one task during the critical region, we have
+ exactly the same race condition. In this case the thread which
+ preempts might run the critical region itself.
+ </para>
+ <para>
+ The solution is to recognize when these simultaneous accesses
+ occur, and use locks to make sure that only one instance can
+ enter the critical region at any time. There are many
+ friendly primitives in the Linux kernel to help you do this.
+ And then there are the unfriendly primitives, but I'll pretend
+ they don't exist.
+ </para>
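+    <para>
+      As a taste of what is to come, here is a minimal sketch of the
+      counter protected by a spinlock, using the 2.6 API (the lock
+      name is invented for this example):
+    </para>
+
+    <programlisting>
+    static spinlock_t very_important_lock = SPIN_LOCK_UNLOCKED;
+
+    spin_lock(&amp;very_important_lock);
+    very_important_count++;  /* only one holder can be here at a time */
+    spin_unlock(&amp;very_important_lock);
+    </programlisting>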
+ </sect1>
+ </chapter>
+
+ <chapter id="locks">
+ <title>Locking in the Linux Kernel</title>
+
+ <para>
+ If I could give you one piece of advice: never sleep with anyone
+ crazier than yourself. But if I had to give you advice on
+ locking: <emphasis>keep it simple</emphasis>.
+ </para>
+
+ <para>
+ Be reluctant to introduce new locks.
+ </para>
+
+ <para>
+ Strangely enough, this last one is the exact reverse of my advice when
+ you <emphasis>have</emphasis> slept with someone crazier than yourself.
+ And you should think about getting a big dog.
+ </para>
+
+ <sect1 id="lock-intro">
+ <title>Two Main Types of Kernel Locks: Spinlocks and Semaphores</title>
+
+ <para>
+ There are two main types of kernel locks. The fundamental type
+ is the spinlock
+ (<filename class="headerfile">include/asm/spinlock.h</filename>),
+ which is a very simple single-holder lock: if you can't get the
+ spinlock, you keep trying (spinning) until you can. Spinlocks are
+ very small and fast, and can be used anywhere.
+ </para>
+ <para>
+ The second type is a semaphore
+ (<filename class="headerfile">include/asm/semaphore.h</filename>): it
+ can have more than one holder at any time (the number decided at
+ initialization time), although it is most commonly used as a
+ single-holder lock (a mutex). If you can't get a semaphore,
+ your task will put itself on the queue, and be woken up when the
+ semaphore is released. This means the CPU will do something
+ else while you are waiting, but there are many cases when you
+ simply can't sleep (see <xref linkend="sleeping-things"/>), and so
+ have to use a spinlock instead.
+ </para>
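+    <para>
+      For orientation, here is a minimal sketch of declaring and using
+      each type (the names are invented for this example):
+    </para>
+
+    <programlisting>
+    static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;  /* spins: usable anywhere */
+    static DECLARE_MUTEX(my_sem);  /* semaphore initialized to 1: sleeps */
+
+    spin_lock(&amp;my_lock);
+    /* ... short critical region, must not sleep ... */
+    spin_unlock(&amp;my_lock);
+
+    down(&amp;my_sem);
+    /* ... critical region, may sleep ... */
+    up(&amp;my_sem);
+    </programlisting>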
+ <para>
+ Neither type of lock is recursive: see
+ <xref linkend="deadlock"/>.
+ </para>
+ </sect1>
+
+ <sect1 id="uniprocessor">
+ <title>Locks and Uniprocessor Kernels</title>
+
+ <para>
+ For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
+      without <symbol>CONFIG_PREEMPT</symbol>, spinlocks do not exist at
+ all. This is an excellent design decision: when no-one else can
+ run at the same time, there is no reason to have a lock.
+ </para>
+
+ <para>
+ If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
+ but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
+ simply disable preemption, which is sufficient to prevent any
+ races. For most purposes, we can think of preemption as
+ equivalent to SMP, and not worry about it separately.
+ </para>
+
+ <para>
+ You should always test your locking code with <symbol>CONFIG_SMP</symbol>
+ and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
+ will still catch some kinds of locking bugs.
+ </para>
+
+ <para>
+ Semaphores still exist, because they are required for
+ synchronization between <firstterm linkend="gloss-usercontext">user
+ contexts</firstterm>, as we will see below.
+ </para>
+ </sect1>
+
+ <sect1 id="usercontextlocking">
+ <title>Locking Only In User Context</title>
+
+ <para>
+ If you have a data structure which is only ever accessed from
+ user context, then you can use a simple semaphore
+      (<filename class="headerfile">include/asm/semaphore.h</filename>) to protect it. This
+ is the most trivial case: you initialize the semaphore to the number
+ of resources available (usually 1), and call
+ <function>down_interruptible()</function> to grab the semaphore, and
+ <function>up()</function> to release it. There is also a
+      <function>down()</function>, which should be avoided: unlike
+      <function>down_interruptible()</function>, it keeps sleeping even
+      if a signal is received.
+ </para>
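+
+    <para>
+      A minimal sketch of the usual pattern (the data and lock names
+      are invented for this example):
+    </para>
+
+    <programlisting>
+    static DECLARE_MUTEX(mydata_lock);
+    static int mydata;  /* protected by mydata_lock */
+
+    int set_mydata(int val)
+    {
+            if (down_interruptible(&amp;mydata_lock))
+                    return -ERESTARTSYS;  /* a signal interrupted the wait */
+            mydata = val;
+            up(&amp;mydata_lock);
+            return 0;
+    }
+    </programlisting>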
+
+ <para>
+ Example: <filename>linux/net/core/netfilter.c</filename> allows
+ registration of new <function>setsockopt()</function> and
+ <function>getsockopt()</function> calls, with
+ <function>nf_register_sockopt()</function>. Registration and
+ de-registration are only done on module load and unload (and boot
+ time, where there is no concurrency), and the list of registrations
+ is only consulted for an unknown <function>setsockopt()</function>
+ or <function>getsockopt()</function> system call. The
+ <varname>nf_sockopt_mutex</varname> is perfect to protect this,
+ especially since the setsockopt and getsockopt calls may well
+ sleep.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-bh">
+ <title>Locking Between User Context and Softirqs</title>
+
+ <para>
+ If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
+ data with user context, you have two problems. Firstly, the current
+ user context can be interrupted by a softirq, and secondly, the
+ critical region could be entered from another CPU. This is where
+ <function>spin_lock_bh()</function>
+ (<filename class="headerfile">include/linux/spinlock.h</filename>) is
+ used. It disables softirqs on that CPU, then grabs the lock.
+ <function>spin_unlock_bh()</function> does the reverse. (The
+ '_bh' suffix is a historical reference to "Bottom Halves", the
+ old name for software interrupts. It should really be
+      called 'spin_lock_softirq()' in a perfect world).
+ </para>
+
+ <para>
+ Note that you can also use <function>spin_lock_irq()</function>
+ or <function>spin_lock_irqsave()</function> here, which stop
+ hardware interrupts as well: see <xref linkend="hardirq-context"/>.
+ </para>
+
+ <para>
+      This works perfectly for
+      <firstterm linkend="gloss-up"><acronym>UP</acronym></firstterm>
+      as well: the spin lock vanishes, and this macro
+ simply becomes <function>local_bh_disable()</function>
+ (<filename class="headerfile">include/linux/interrupt.h</filename>), which
+ protects you from the softirq being run.
+ </para>
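+
+    <para>
+      Schematically, the two sides look like this (a sketch; the lock
+      and the queue are invented for this example):
+    </para>
+
+    <programlisting>
+    static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
+
+    /* User context: keep the softirq out on this CPU, and other
+       CPUs out via the lock. */
+    spin_lock_bh(&amp;queue_lock);
+    /* ... add work to the shared queue ... */
+    spin_unlock_bh(&amp;queue_lock);
+
+    /* Softirq side: a plain spin_lock() suffices, since user context
+       cannot interrupt a softirq on this CPU. */
+    spin_lock(&amp;queue_lock);
+    /* ... remove work from the shared queue ... */
+    spin_unlock(&amp;queue_lock);
+    </programlisting>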
+ </sect1>
+
+ <sect1 id="lock-user-tasklet">
+ <title>Locking Between User Context and Tasklets</title>
+
+ <para>
+ This is exactly the same as above, because <firstterm
+ linkend="gloss-tasklet">tasklets</firstterm> are actually run
+ from a softirq.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-timers">
+ <title>Locking Between User Context and Timers</title>
+
+ <para>
+ This, too, is exactly the same as above, because <firstterm
+ linkend="gloss-timers">timers</firstterm> are actually run from
+ a softirq. From a locking point of view, tasklets and timers
+ are identical.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-tasklets">
+ <title>Locking Between Tasklets/Timers</title>
+
+ <para>
+ Sometimes a tasklet or timer might want to share data with
+ another tasklet or timer.
+ </para>
+
+ <sect2 id="lock-tasklets-same">
+ <title>The Same Tasklet/Timer</title>
+ <para>
+ Since a tasklet is never run on two CPUs at once, you don't
+ need to worry about your tasklet being reentrant (running
+ twice at once), even on SMP.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-tasklets-different">
+ <title>Different Tasklets/Timers</title>
+ <para>
+ If another tasklet/timer wants
+        to share data with your tasklet or timer, you will both need to use
+ <function>spin_lock()</function> and
+ <function>spin_unlock()</function> calls.
+ <function>spin_lock_bh()</function> is
+ unnecessary here, as you are already in a tasklet, and
+        no other softirq (and hence no tasklet or timer) will be run on
+        the same CPU.
+ </para>
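+
+      <para>
+        For example, both tasklet functions might guard some shared
+        data like this (a sketch; the names are invented):
+      </para>
+
+      <programlisting>
+      static spinlock_t shared_lock = SPIN_LOCK_UNLOCKED;
+
+      static void tasklet_a_func(unsigned long data)
+      {
+              spin_lock(&amp;shared_lock);
+              /* ... touch the shared data; tasklet_b_func does the same ... */
+              spin_unlock(&amp;shared_lock);
+      }
+      </programlisting>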
+ </sect2>
+ </sect1>
+
+ <sect1 id="lock-softirqs">
+ <title>Locking Between Softirqs</title>
+
+ <para>
+ Often a softirq might
+ want to share data with itself or a tasklet/timer.
+ </para>
+
+ <sect2 id="lock-softirqs-same">
+ <title>The Same Softirq</title>
+
+ <para>
+ The same softirq can run on the other CPUs: you can use a
+ per-CPU array (see <xref linkend="per-cpu"/>) for better
+ performance. If you're going so far as to use a softirq,
+ you probably care about scalable performance enough
+ to justify the extra complexity.
+ </para>
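+
+      <para>
+        A sketch of the per-CPU approach, assuming the 2.6 per-cpu API
+        (the names are invented for this example):
+      </para>
+
+      <programlisting>
+      static DEFINE_PER_CPU(struct list_head, my_queue);  /* initialized at boot */
+
+      /* Inside the softirq: preemption is already off, so this CPU's
+         copy is ours alone, and no lock is needed. */
+      struct list_head *q = &amp;__get_cpu_var(my_queue);
+      </programlisting>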
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-softirqs-different">
+ <title>Different Softirqs</title>
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data, whether it
+ be a timer, tasklet, different softirq or the same or another
+ softirq: any of them could be running on a different CPU.
+ </para>
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="hardirq-context">
+ <title>Hard IRQ Context</title>
+
+ <para>
+ Hardware interrupts usually communicate with a
+ tasklet or softirq. Frequently this involves putting work in a
+ queue, which the softirq will take out.
+ </para>
+
+ <sect1 id="hardirq-softirq">
+ <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
+
+ <para>
+ If a hardware irq handler shares data with a softirq, you have
+ two concerns. Firstly, the softirq processing can be
+ interrupted by a hardware interrupt, and secondly, the
+ critical region could be entered by a hardware interrupt on
+ another CPU. This is where <function>spin_lock_irq()</function> is
+      used. It is defined to disable interrupts on that CPU, then grab
+ the lock. <function>spin_unlock_irq()</function> does the reverse.
+ </para>
+
+ <para>
+      The irq handler does not need to use
+ <function>spin_lock_irq()</function>, because the softirq cannot
+ run while the irq handler is running: it can use
+ <function>spin_lock()</function>, which is slightly faster. The
+ only exception would be if a different hardware irq handler uses
+ the same lock: <function>spin_lock_irq()</function> will stop
+ that from interrupting us.
+ </para>
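+
+    <para>
+      A sketch of the two sides (the names are invented; the handler
+      signature is the 2.6 one):
+    </para>
+
+    <programlisting>
+    static spinlock_t dev_lock = SPIN_LOCK_UNLOCKED;
+
+    /* Hard irq handler: the softirq cannot run while we do, so a
+       plain spin_lock() is enough. */
+    static irqreturn_t my_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
+    {
+            spin_lock(&amp;dev_lock);
+            /* ... queue work for the softirq ... */
+            spin_unlock(&amp;dev_lock);
+            return IRQ_HANDLED;
+    }
+
+    /* In the softirq: must keep the hard irq out. */
+    spin_lock_irq(&amp;dev_lock);
+    /* ... take the work out of the queue ... */
+    spin_unlock_irq(&amp;dev_lock);
+    </programlisting>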
+
+ <para>
+ This works perfectly for UP as well: the spin lock vanishes,
+ and this macro simply becomes <function>local_irq_disable()</function>
+ (<filename class="headerfile">include/asm/smp.h</filename>), which
+ protects you from the softirq/tasklet/BH being run.
+ </para>
+
+ <para>
+ <function>spin_lock_irqsave()</function>
+ (<filename>include/linux/spinlock.h</filename>) is a variant
+ which saves whether interrupts were on or off in a flags word,
+ which is passed to <function>spin_unlock_irqrestore()</function>. This
+      means that the same code can be used inside a hard irq handler (where
+ interrupts are already off) and in softirqs (where the irq
+ disabling is required).
+ </para>
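+
+    <para>
+      The pattern looks like this (<varname>flags</varname> must be a
+      local <type>unsigned long</type>; the lock name is invented):
+    </para>
+
+    <programlisting>
+    unsigned long flags;
+
+    spin_lock_irqsave(&amp;my_lock, flags);
+    /* ... critical region, safe to enter from any context ... */
+    spin_unlock_irqrestore(&amp;my_lock, flags);
+    </programlisting>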
+
+ <para>
+ Note that softirqs (and hence tasklets and timers) are run on
+ return from hardware interrupts, so
+ <function>spin_lock_irq()</function> also stops these. In that
+ sense, <function>spin_lock_irqsave()</function> is the most
+ general and powerful locking function.
+ </para>
+
+ </sect1>
+ <sect1 id="hardirq-hardirq">
+ <title>Locking Between Two Hard IRQ Handlers</title>
+ <para>
+ It is rare to have to share data between two IRQ handlers, but
+ if you do, <function>spin_lock_irqsave()</function> should be
+ used: it is architecture-specific whether all interrupts are
+ disabled inside irq handlers themselves.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="cheatsheet">
+ <title>Cheat Sheet For Locking</title>
+ <para>
+ Pete Zaitcev gives the following summary:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ If you are in a process context (any syscall) and want to
+	lock other processes out, use a semaphore. You can take a semaphore
+	and sleep (<function>copy_from_user()</function> or
+ <function>kmalloc(x,GFP_KERNEL)</function>).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Otherwise (== data can be touched in an interrupt), use
+ <function>spin_lock_irqsave()</function> and
+ <function>spin_unlock_irqrestore()</function>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+	Avoid holding a spinlock for more than 5 lines of code and
+	across any function call (except accessors like
+	<function>readb()</function>).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+  <sect1 id="minimum-lock-requirements">
+ <title>Table of Minimum Requirements</title>
+
+ <para> The following table lists the <emphasis>minimum</emphasis>
+ locking requirements between various contexts. In some cases,
+ the same context can only be running on one CPU at a time, so
+ no locking is required for that context (eg. a particular
+   thread can only run on one CPU at a time, but if it needs to
+   share data with another thread, locking is required).
+ </para>
+ <para>
+ Remember the advice above: you can always use
+ <function>spin_lock_irqsave()</function>, which is a superset
+ of all other spinlock primitives.
+ </para>
+ <table>
+<title>Table of Locking Requirements</title>
+<tgroup cols="11">
+<tbody>
+<row>
+<entry></entry>
+<entry>IRQ Handler A</entry>
+<entry>IRQ Handler B</entry>
+<entry>Softirq A</entry>
+<entry>Softirq B</entry>
+<entry>Tasklet A</entry>
+<entry>Tasklet B</entry>
+<entry>Timer A</entry>
+<entry>Timer B</entry>
+<entry>User Context A</entry>
+<entry>User Context B</entry>
+</row>
+
+<row>
+<entry>IRQ Handler A</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>IRQ Handler B</entry>
+<entry>spin_lock_irqsave</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Softirq A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+</row>
+
+<row>
+<entry>Softirq B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+</row>
+
+<row>
+<entry>Tasklet A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Tasklet B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>down_interruptible</entry>
+<entry>None</entry>
+</row>
+
+</tbody>
+</tgroup>
+</table>
+</sect1>
+</chapter>
+
+ <chapter id="Examples">
+ <title>Common Examples</title>
+ <para>
+Let's step through a simple example: a cache of number to name
+mappings. The cache keeps a count of how often each of the objects is
+used, and when it gets full, throws out the least used one.
+
+ </para>
+
+ <sect1 id="examples-usercontext">
+ <title>All In User Context</title>
+ <para>
+For our first example, we assume that all operations are in user
+context (ie. from system calls), so we can sleep. This means we can
+use a semaphore to protect the cache and all the objects within
+it. Here's the code:
+ </para>
+
+ <programlisting>
+#include &lt;linux/list.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/string.h&gt;
+#include &lt;asm/semaphore.h&gt;
+#include &lt;asm/errno.h&gt;
+
+struct object
+{
+ struct list_head list;
+ int id;
+ char name[32];
+ int popularity;
+};
+
+/* Protects the cache, cache_num, and the objects within it */
+static DECLARE_MUTEX(cache_lock);
+static LIST_HEAD(cache);
+static unsigned int cache_num = 0;
+#define MAX_CACHE_SIZE 10
+
+/* Must be holding cache_lock */
+static struct object *__cache_find(int id)
+{
+ struct object *i;
+
+ list_for_each_entry(i, &amp;cache, list)
+ if (i-&gt;id == id) {
+ i-&gt;popularity++;
+ return i;
+ }
+ return NULL;
+}
+
+/* Must be holding cache_lock */
+static void __cache_delete(struct object *obj)
+{
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
+ kfree(obj);
+ cache_num--;
+}
+
+/* Must be holding cache_lock */
+static void __cache_add(struct object *obj)
+{
+ list_add(&amp;obj-&gt;list, &amp;cache);
+	if (++cache_num &gt; MAX_CACHE_SIZE) {
+ struct object *i, *outcast = NULL;
+ list_for_each_entry(i, &amp;cache, list) {
+ if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
+ outcast = i;
+ }
+ __cache_delete(outcast);
+ }
+}
+
+int cache_add(int id, const char *name)
+{
+ struct object *obj;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+ down(&amp;cache_lock);
+ __cache_add(obj);
+ up(&amp;cache_lock);
+ return 0;
+}
+
+void cache_delete(int id)
+{
+ down(&amp;cache_lock);
+ __cache_delete(__cache_find(id));
+ up(&amp;cache_lock);
+}
+
+int cache_find(int id, char *name)
+{
+ struct object *obj;
+ int ret = -ENOENT;
+
+ down(&amp;cache_lock);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+ up(&amp;cache_lock);
+ return ret;
+}
+</programlisting>
+
+ <para>
+Note that we always make sure we have the cache_lock when we add,
+delete, or look up the cache: both the cache infrastructure itself and
+the contents of the objects are protected by the lock. In this case
+it's easy, since we copy the data for the user, and never let them
+access the objects directly.
+ </para>
+ <para>
+There is a slight (and common) optimization here: in
+<function>cache_add</function> we set up the fields of the object
+before grabbing the lock. This is safe, as no-one else can access it
+until we put it in cache.
+ </para>
+ </sect1>
+
+ <sect1 id="examples-interrupt">
+ <title>Accessing From Interrupt Context</title>
+ <para>
+Now consider the case where <function>cache_find</function> can be
+called from interrupt context: either a hardware interrupt or a
+softirq. An example would be a timer which deletes objects from the
+cache.
+ </para>
+ <para>
+The change is shown below, in standard patch format: lines
+prefixed with <symbol>-</symbol> are taken away, and lines
+prefixed with <symbol>+</symbol> are added.
+ </para>
+<programlisting>
+--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
++++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
+@@ -12,7 +12,7 @@
+ int popularity;
+ };
+
+-static DECLARE_MUTEX(cache_lock);
++static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+ static LIST_HEAD(cache);
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+@@ -55,6 +55,7 @@
+ int cache_add(int id, const char *name)
+ {
+ struct object *obj;
++ unsigned long flags;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+@@ -63,30 +64,33 @@
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+- down(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return 0;
+ }
+
+ void cache_delete(int id)
+ {
+- down(&amp;cache_lock);
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_delete(__cache_find(id));
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+ int cache_find(int id, char *name)
+ {
+ struct object *obj;
+ int ret = -ENOENT;
++ unsigned long flags;
+
+- down(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return ret;
+ }
+</programlisting>
+
+ <para>
+Note that <function>spin_lock_irqsave()</function> will turn off
+interrupts if they are on, and otherwise (if we are already in an
+interrupt handler) does nothing; hence these functions are safe to
+call from any context.
+ </para>
+ <para>
+Unfortunately, <function>cache_add</function> calls
+<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
+flag, which is only legal in user context. I have assumed that
+<function>cache_add</function> is still only called in user context,
+otherwise this should become a parameter to
+<function>cache_add</function>.
+ </para>
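+ <para>
+If <function>cache_add</function> did need to work from interrupt
+context, a sketch of that parameterization (not part of the example
+above) might look like:
+ </para>
+
+<programlisting>
+int cache_add(int id, const char *name, unsigned int gfp_flags)
+{
+        struct object *obj;
+
+        /* Callers pass GFP_KERNEL from user context,
+           GFP_ATOMIC from interrupt context. */
+        obj = kmalloc(sizeof(*obj), gfp_flags);
+        if (!obj)
+                return -ENOMEM;
+        /* ... the rest as in cache_add above ... */
+        return 0;
+}
+</programlisting>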
+ </sect1>
+ <sect1 id="examples-refcnt">
+ <title>Exposing Objects Outside This File</title>
+ <para>
+If our objects contained more information, it might not be sufficient
+to copy the information in and out: other parts of the code might want
+to keep pointers to these objects, for example, rather than looking up
+the id every time. This produces two problems.
+ </para>
+ <para>
+The first problem is that we use the <symbol>cache_lock</symbol> to
+protect objects: we'd need to make this non-static so the rest of the
+code can use it. This makes locking trickier, as it is no longer all
+in one place.
+ </para>
+ <para>
+The second problem is the lifetime problem: if another structure keeps
+a pointer to an object, it presumably expects that pointer to remain
+valid. Unfortunately, this is only guaranteed while you hold the
+lock, otherwise someone might call <function>cache_delete</function>
+and even worse, add another object, re-using the same address.
+ </para>
+ <para>
+As there is only one lock, you can't hold it forever: no-one else would
+get any work done.
+ </para>
+ <para>
+The solution to this problem is to use a reference count: everyone who
+has a pointer to the object increases it when they first get the
+object, and drops the reference count when they're finished with it.
+Whoever drops it to zero knows it is unused, and can actually delete it.
+ </para>
+ <para>
+Here is the code:
+ </para>
+
+<programlisting>
+--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
++++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
+@@ -7,6 +7,7 @@
+ struct object
+ {
+ struct list_head list;
++ unsigned int refcnt;
+ int id;
+ char name[32];
+ int popularity;
+@@ -17,6 +18,35 @@
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+
++static void __object_put(struct object *obj)
++{
++ if (--obj-&gt;refcnt == 0)
++ kfree(obj);
++}
++
++static void __object_get(struct object *obj)
++{
++ obj-&gt;refcnt++;
++}
++
++void object_put(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_put(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
++void object_get(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_get(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
+ /* Must be holding cache_lock */
+ static struct object *__cache_find(int id)
+ {
+@@ -35,6 +65,7 @@
+ {
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
++ __object_put(obj);
+ cache_num--;
+ }
+
+@@ -63,6 +94,7 @@
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
++ obj-&gt;refcnt = 1; /* The cache holds a reference */
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -79,18 +111,15 @@
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+-int cache_find(int id, char *name)
++struct object *cache_find(int id)
+ {
+ struct object *obj;
+- int ret = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+- if (obj) {
+- ret = 0;
+- strcpy(name, obj-&gt;name);
+- }
++ if (obj)
++ __object_get(obj);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+- r