aboutsummaryrefslogtreecommitdiff
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX7
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb33
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-acpi99
-rw-r--r--Documentation/BUG-HUNTING39
-rw-r--r--Documentation/DocBook/kernel-api.tmpl2
-rw-r--r--Documentation/DocBook/kernel-locking.tmpl32
-rw-r--r--Documentation/DocBook/s390-drivers.tmpl21
-rw-r--r--Documentation/Smack.txt493
-rw-r--r--Documentation/SubmittingPatches16
-rw-r--r--Documentation/acpi/method-tracing.txt26
-rw-r--r--Documentation/arm/Sharp-LH/IOBarrier2
-rw-r--r--Documentation/debugging-modules.txt4
-rw-r--r--Documentation/driver-model/platform.txt6
-rw-r--r--Documentation/fb/deferred_io.txt6
-rw-r--r--Documentation/feature-removal-schedule.txt23
-rw-r--r--Documentation/filesystems/configfs/configfs.txt2
-rw-r--r--Documentation/filesystems/porting6
-rw-r--r--Documentation/filesystems/proc.txt104
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt2
-rw-r--r--Documentation/filesystems/relay.txt2
-rw-r--r--Documentation/frv/README.txt (renamed from Documentation/fujitsu/frv/README.txt)0
-rw-r--r--Documentation/frv/atomic-ops.txt (renamed from Documentation/fujitsu/frv/atomic-ops.txt)0
-rw-r--r--Documentation/frv/booting.txt (renamed from Documentation/fujitsu/frv/booting.txt)2
-rw-r--r--Documentation/frv/clock.txt (renamed from Documentation/fujitsu/frv/clock.txt)0
-rw-r--r--Documentation/frv/configuring.txt (renamed from Documentation/fujitsu/frv/configuring.txt)0
-rw-r--r--Documentation/frv/features.txt (renamed from Documentation/fujitsu/frv/features.txt)0
-rw-r--r--Documentation/frv/gdbinit (renamed from Documentation/fujitsu/frv/gdbinit)0
-rw-r--r--Documentation/frv/gdbstub.txt (renamed from Documentation/fujitsu/frv/gdbstub.txt)0
-rw-r--r--Documentation/frv/kernel-ABI.txt (renamed from Documentation/fujitsu/frv/kernel-ABI.txt)0
-rw-r--r--Documentation/frv/mmu-layout.txt (renamed from Documentation/fujitsu/frv/mmu-layout.txt)0
-rw-r--r--Documentation/gpio.txt133
-rw-r--r--Documentation/i2c/chips/pca95393
-rw-r--r--Documentation/ia64/aliasing-test.c15
-rw-r--r--Documentation/ide/ChangeLog.ide-tape.1995-2002257
-rw-r--r--Documentation/ide/ide-tape.txt146
-rw-r--r--Documentation/initrd.txt2
-rw-r--r--Documentation/ja_JP/stable_kernel_rules.txt79
-rw-r--r--Documentation/kernel-parameters.txt35
-rw-r--r--Documentation/kprobes.txt81
-rw-r--r--Documentation/kref.txt20
-rw-r--r--Documentation/lguest/lguest.c231
-rw-r--r--Documentation/md.txt10
-rw-r--r--Documentation/networking/decnet.txt2
-rw-r--r--Documentation/pci.txt37
-rw-r--r--Documentation/pcmcia/driver-changes.txt4
-rw-r--r--Documentation/pm_qos_interface.txt59
-rw-r--r--Documentation/power/basic-pm-debugging.txt216
-rw-r--r--Documentation/power/devices.txt49
-rw-r--r--Documentation/power/drivers-testing.txt30
-rw-r--r--Documentation/power/notifiers.txt8
-rw-r--r--Documentation/power/swsusp.txt5
-rw-r--r--Documentation/power/userland-swsusp.txt82
-rw-r--r--Documentation/power_supply_class.txt6
-rw-r--r--Documentation/rtc.txt42
-rw-r--r--Documentation/smp.txt22
-rw-r--r--Documentation/sysctl/fs.txt10
-rw-r--r--Documentation/sysctl/vm.txt7
-rw-r--r--Documentation/thinkpad-acpi.txt116
-rw-r--r--Documentation/unaligned-memory-access.txt226
-rw-r--r--Documentation/usb/gadget_printer.txt510
-rw-r--r--Documentation/usb/iuu_phoenix.txt84
-rw-r--r--Documentation/w1/masters/00-INDEX2
-rw-r--r--Documentation/w1/masters/w1-gpio33
-rw-r--r--Documentation/x86_64/00-INDEX16
64 files changed, 3000 insertions, 505 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index c3014df066c..33f55917f23 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -14,6 +14,7 @@ Following translations are available on the WWW:
- this file.
ABI/
- info on kernel <-> userspace ABI and relative interface stability.
+
BUG-HUNTING
- brute force method of doing binary search of patches to find bug.
Changes
@@ -66,6 +67,8 @@ VGA-softcursor.txt
- how to change your VGA cursor from a blinking underscore.
accounting/
- documentation on accounting and taskstats.
+acpi/
+ - info on ACPI-specific hooks in the kernel.
aoe/
- description of AoE (ATA over Ethernet) along with config examples.
applying-patches.txt
@@ -154,7 +157,7 @@ firmware_class/
- request_firmware() hotplug interface info.
floppy.txt
- notes and driver options for the floppy disk driver.
-fujitsu/
+frv/
- Fujitsu FR-V Linux documentation.
gpio.txt
- overview of GPIO (General Purpose Input/Output) access conventions.
@@ -364,8 +367,6 @@ sharedsubtree.txt
- a description of shared subtrees for namespaces.
smart-config.txt
- description of the Smart Config makefile feature.
-smp.txt
- - a few notes on symmetric multi-processing.
sony-laptop.txt
- Sony Notebook Control Driver (SNC) Readme.
sonypi.txt
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index 9734577d171..11a3c1682ce 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -52,3 +52,36 @@ Description:
facility is inherently dangerous, it is disabled by default
for all devices except hubs. For more information, see
Documentation/usb/persist.txt.
+
+What: /sys/bus/usb/device/.../power/connected_duration
+Date: January 2008
+KernelVersion: 2.6.25
+Contact: Sarah Sharp <sarah.a.sharp@intel.com>
+Description:
+ If CONFIG_PM and CONFIG_USB_SUSPEND are enabled, then this file
+ is present. When read, it returns the total time (in msec)
+ that the USB device has been connected to the machine. This
+ file is read-only.
+Users:
+ PowerTOP <power@bughost.org>
+ http://www.lesswatts.org/projects/powertop/
+
+What: /sys/bus/usb/device/.../power/active_duration
+Date: January 2008
+KernelVersion: 2.6.25
+Contact: Sarah Sharp <sarah.a.sharp@intel.com>
+Description:
+ If CONFIG_PM and CONFIG_USB_SUSPEND are enabled, then this file
+ is present. When read, it returns the total time (in msec)
+ that the USB device has been active, i.e. not in a suspended
+ state. This file is read-only.
+
+ Tools can use this file and the connected_duration file to
+ compute the percentage of time that a device has been active.
+ For example,
+ echo $((100 * `cat active_duration` / `cat connected_duration`))
+ will give an integer percentage. Note that this does not
+ account for counter wrap.
+Users:
+ PowerTOP <power@bughost.org>
+ http://www.lesswatts.org/projects/powertop/
diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi
new file mode 100644
index 00000000000..9470ed9afcc
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-acpi
@@ -0,0 +1,99 @@
+What: /sys/firmware/acpi/interrupts/
+Date: February 2008
+Contact: Len Brown <lenb@kernel.org>
+Description:
+ All ACPI interrupts are handled via a single IRQ,
+ the System Control Interrupt (SCI), which appears
+ as "acpi" in /proc/interrupts.
+
+ However, one of the main functions of ACPI is to make
+ the platform understand random hardware without
+ special driver support. So while the SCI handles a few
+ well known (fixed feature) interrupts sources, such
+ as the power button, it can also handle a variable
+ number of a "General Purpose Events" (GPE).
+
+ A GPE vectors to a specified handler in AML, which
+ can do a anything the BIOS writer wants from
+ OS context. GPE 0x12, for example, would vector
+ to a level or edge handler called _L12 or _E12.
+ The handler may do its business and return.
+ Or the handler may send send a Notify event
+ to a Linux device driver registered on an ACPI device,
+ such as a battery, or a processor.
+
+ To figure out where all the SCI's are coming from,
+ /sys/firmware/acpi/interrupts contains a file listing
+ every possible source, and the count of how many
+ times it has triggered.
+
+ $ cd /sys/firmware/acpi/interrupts
+ $ grep . *
+ error:0
+ ff_gbl_lock:0
+ ff_pmtimer:0
+ ff_pwr_btn:0
+ ff_rt_clk:0
+ ff_slp_btn:0
+ gpe00:0
+ gpe01:0
+ gpe02:0
+ gpe03:0
+ gpe04:0
+ gpe05:0
+ gpe06:0
+ gpe07:0
+ gpe08:0
+ gpe09:174
+ gpe0A:0
+ gpe0B:0
+ gpe0C:0
+ gpe0D:0
+ gpe0E:0
+ gpe0F:0
+ gpe10:0
+ gpe11:60
+ gpe12:0
+ gpe13:0
+ gpe14:0
+ gpe15:0
+ gpe16:0
+ gpe17:0
+ gpe18:0
+ gpe19:7
+ gpe1A:0
+ gpe1B:0
+ gpe1C:0
+ gpe1D:0
+ gpe1E:0
+ gpe1F:0
+ gpe_all:241
+ sci:241
+
+ sci - The total number of times the ACPI SCI
+ has claimed an interrupt.
+
+ gpe_all - count of SCI caused by GPEs.
+
+ gpeXX - count for individual GPE source
+
+ ff_gbl_lock - Global Lock
+
+ ff_pmtimer - PM Timer
+
+ ff_pwr_btn - Power Button
+
+ ff_rt_clk - Real Time Clock
+
+ ff_slp_btn - Sleep Button
+
+ error - an interrupt that can't be accounted for above.
+
+ Root has permission to clear any of these counters. Eg.
+ # echo 0 > gpe11
+
+ All counters can be cleared by clearing the total "sci":
+ # echo 0 > sci
+
+ None of these counters has an effect on the function
+ of the system, they are simply statistics.
diff --git a/Documentation/BUG-HUNTING b/Documentation/BUG-HUNTING
index 35f5bd24333..65022a87bf1 100644
--- a/Documentation/BUG-HUNTING
+++ b/Documentation/BUG-HUNTING
@@ -53,7 +53,7 @@ Finding it the old way
[Sat Mar 2 10:32:33 PST 1996 KERNEL_BUG-HOWTO lm@sgi.com (Larry McVoy)]
-This is how to track down a bug if you know nothing about kernel hacking.
+This is how to track down a bug if you know nothing about kernel hacking.
It's a brute force approach but it works pretty well.
You need:
@@ -66,12 +66,12 @@ You will then do:
. Rebuild a revision that you believe works, install, and verify that.
. Do a binary search over the kernels to figure out which one
- introduced the bug. I.e., suppose 1.3.28 didn't have the bug, but
+ introduced the bug. I.e., suppose 1.3.28 didn't have the bug, but
you know that 1.3.69 does. Pick a kernel in the middle and build
that, like 1.3.50. Build & test; if it works, pick the mid point
between .50 and .69, else the mid point between .28 and .50.
. You'll narrow it down to the kernel that introduced the bug. You
- can probably do better than this but it gets tricky.
+ can probably do better than this but it gets tricky.
. Narrow it down to a subdirectory
@@ -81,27 +81,27 @@ You will then do:
directories:
Copy the non-working directory next to the working directory
- as "dir.63".
+ as "dir.63".
One directory at time, try moving the working directory to
- "dir.62" and mv dir.63 dir"time, try
+ "dir.62" and mv dir.63 dir"time, try
mv dir dir.62
mv dir.63 dir
find dir -name '*.[oa]' -print | xargs rm -f
And then rebuild and retest. Assuming that all related
- changes were contained in the sub directory, this should
- isolate the change to a directory.
+ changes were contained in the sub directory, this should
+ isolate the change to a directory.
Problems: changes in header files may have occurred; I've
- found in my case that they were self explanatory - you may
+ found in my case that they were self explanatory - you may
or may not want to give up when that happens.
. Narrow it down to a file
- You can apply the same technique to each file in the directory,
- hoping that the changes in that file are self contained.
-
+ hoping that the changes in that file are self contained.
+
. Narrow it down to a routine
- You can take the old file and the new file and manually create
@@ -130,7 +130,7 @@ You will then do:
that makes the difference.
Finally, you take all the info that you have, kernel revisions, bug
-description, the extent to which you have narrowed it down, and pass
+description, the extent to which you have narrowed it down, and pass
that off to whomever you believe is the maintainer of that section.
A post to linux.dev.kernel isn't such a bad idea if you've done some
work to narrow it down.
@@ -214,6 +214,23 @@ And recompile the kernel with CONFIG_DEBUG_INFO enabled:
gdb vmlinux
(gdb) p vt_ioctl
(gdb) l *(0x<address of vt_ioctl> + 0xda8)
+or, as one command
+ (gdb) l *(vt_ioctl + 0xda8)
+
+If you have a call trace, such as :-
+>Call Trace:
+> [<ffffffff8802c8e9>] :jbd:log_wait_commit+0xa3/0xf5
+> [<ffffffff810482d9>] autoremove_wake_function+0x0/0x2e
+> [<ffffffff8802770b>] :jbd:journal_stop+0x1be/0x1ee
+> ...
+this shows the problem in the :jbd: module. You can load that module in gdb
+and list the relevant code.
+ gdb fs/jbd/jbd.ko
+ (gdb) p log_wait_commit
+ (gdb) l *(0x<address> + 0xa3)
+or
+ (gdb) l *(log_wait_commit + 0xa3)
+
Another very useful option of the Kernel Hacking section in menuconfig is
Debug memory allocations. This will help you see whether data has been
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 77436d73501..059aaf20951 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -165,6 +165,7 @@ X!Ilib/string.c
!Emm/vmalloc.c
!Imm/page_alloc.c
!Emm/mempool.c
+!Emm/dmapool.c
!Emm/page-writeback.c
!Emm/truncate.c
</sect1>
@@ -371,7 +372,6 @@ X!Iinclude/linux/device.h
!Edrivers/base/class.c
!Edrivers/base/firmware_class.c
!Edrivers/base/transport_class.c
-!Edrivers/base/dmapool.c
<!-- Cannot be included, because
attribute_container_add_class_device_adapter
and attribute_container_classdev_to_container
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index 01825ee7db6..2e9d6b41f03 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -717,7 +717,7 @@ used, and when it gets full, throws out the least used one.
<para>
For our first example, we assume that all operations are in user
context (ie. from system calls), so we can sleep. This means we can
-use a semaphore to protect the cache and all the objects within
+use a mutex to protect the cache and all the objects within
it. Here's the code:
</para>
@@ -725,7 +725,7 @@ it. Here's the code:
#include &lt;linux/list.h&gt;
#include &lt;linux/slab.h&gt;
#include &lt;linux/string.h&gt;
-#include &lt;asm/semaphore.h&gt;
+#include &lt;linux/mutex.h&gt;
#include &lt;asm/errno.h&gt;
struct object
@@ -737,7 +737,7 @@ struct object
};
/* Protects the cache, cache_num, and the objects within it */
-static DECLARE_MUTEX(cache_lock);
+static DEFINE_MUTEX(cache_lock);
static LIST_HEAD(cache);
static unsigned int cache_num = 0;
#define MAX_CACHE_SIZE 10
@@ -789,17 +789,17 @@ int cache_add(int id, const char *name)
obj-&gt;id = id;
obj-&gt;popularity = 0;
- down(&amp;cache_lock);
+ mutex_lock(&amp;cache_lock);
__cache_add(obj);
- up(&amp;cache_lock);
+ mutex_unlock(&amp;cache_lock);
return 0;
}
void cache_delete(int id)
{
- down(&amp;cache_lock);
+ mutex_lock(&amp;cache_lock);
__cache_delete(__cache_find(id));
- up(&amp;cache_lock);
+ mutex_unlock(&amp;cache_lock);
}
int cache_find(int id, char *name)
@@ -807,13 +807,13 @@ int cache_find(int id, char *name)
struct object *obj;
int ret = -ENOENT;
- down(&amp;cache_lock);
+ mutex_lock(&amp;cache_lock);
obj = __cache_find(id);
if (obj) {
ret = 0;
strcpy(name, obj-&gt;name);
}
- up(&amp;cache_lock);
+ mutex_unlock(&amp;cache_lock);
return ret;
}
</programlisting>
@@ -853,7 +853,7 @@ The change is shown below, in standard patch format: the
int popularity;
};
--static DECLARE_MUTEX(cache_lock);
+-static DEFINE_MUTEX(cache_lock);
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(cache);
static unsigned int cache_num = 0;
@@ -870,22 +870,22 @@ The change is shown below, in standard patch format: the
obj-&gt;id = id;
obj-&gt;popularity = 0;
-- down(&amp;cache_lock);
+- mutex_lock(&amp;cache_lock);
+ spin_lock_irqsave(&amp;cache_lock, flags);
__cache_add(obj);
-- up(&amp;cache_lock);
+- mutex_unlock(&amp;cache_lock);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
return 0;
}
void cache_delete(int id)
{
-- down(&amp;cache_lock);
+- mutex_lock(&amp;cache_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
__cache_delete(__cache_find(id));
-- up(&amp;cache_lock);
+- mutex_unlock(&amp;cache_lock);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
}
@@ -895,14 +895,14 @@ The change is shown below, in standard patch format: the
int ret = -ENOENT;
+ unsigned long flags;
-- down(&amp;cache_lock);
+- mutex_lock(&amp;cache_lock);
+ spin_lock_irqsave(&amp;cache_lock, flags);
obj = __cache_find(id);
if (obj) {
ret = 0;
strcpy(name, obj-&gt;name);
}
-- up(&amp;cache_lock);
+- mutex_unlock(&amp;cache_lock);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
return ret;
}
diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl
index 3d2f31b99dd..4acc73240a6 100644
--- a/Documentation/DocBook/s390-drivers.tmpl
+++ b/Documentation/DocBook/s390-drivers.tmpl
@@ -59,7 +59,7 @@
<title>Introduction</title>
<para>
This document describes the interfaces available for device drivers that
- drive s390 based channel attached devices. This includes interfaces for
+ drive s390 based channel attached I/O devices. This includes interfaces for
interaction with the hardware and interfaces for interacting with the
common driver core. Those interfaces are provided by the s390 common I/O
layer.
@@ -86,9 +86,10 @@
The ccw bus typically contains the majority of devices available to
a s390 system. Named after the channel command word (ccw), the basic
command structure used to address its devices, the ccw bus contains
- so-called channel attached devices. They are addressed via subchannels,
- visible on the css bus. A device driver, however, will never interact
- with the subchannel directly, but only via the device on the ccw bus,
+ so-called channel attached devices. They are addressed via I/O
+ subchannels, visible on the css bus. A device driver for
+ channel-attached devices, however, will never interact with the
+ subchannel directly, but only via the I/O device on the ccw bus,
the ccw device.
</para>
<sect1 id="channelIO">
@@ -116,7 +117,6 @@
!Iinclude/asm-s390/ccwdev.h
!Edrivers/s390/cio/device.c
!Edrivers/s390/cio/device_ops.c
-!Edrivers/s390/cio/airq.c
</sect1>
<sect1 id="cmf">
<title>The channel-measurement facility</title>
@@ -147,4 +147,15 @@
</sect1>
</chapter>
+ <chapter id="genericinterfaces">
+ <title>Generic interfaces</title>
+ <para>
+ Some interfaces are available to other drivers that do not necessarily
+ have anything to do with the busses described above, but still are
+ indirectly using basic infrastructure in the common I/O layer.
+ One example is the support for adapter interrupts.
+ </para>
+!Edrivers/s390/cio/airq.c
+ </chapter>
+
</book>
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
new file mode 100644
index 00000000000..989c2fcd811
--- /dev/null
+++ b/Documentation/Smack.txt
@@ -0,0 +1,493 @@
+
+
+ "Good for you, you've decided to clean the elevator!"
+ - The Elevator, from Dark Star
+
+Smack is the the Simplified Mandatory Access Control Kernel.
+Smack is a kernel based implementation of mandatory access
+control that includes simplicity in its primary design goals.
+
+Smack is not the only Mandatory Access Control scheme
+available for Linux. Those new to Mandatory Access Control
+are encouraged to compare Smack with the other mechanisms
+available to determine which is best suited to the problem
+at hand.
+
+Smack consists of three major components:
+ - The kernel
+ - A start-up script and a few modified applications
+ - Configuration data
+
+The kernel component of Smack is implemented as a Linux
+Security Modules (LSM) module. It requires netlabel and
+works best with file systems that support extended attributes,
+although xattr support is not strictly required.
+It is safe to run a Smack kernel under a "vanilla" distribution.
+Smack kernels use the CIPSO IP option. Some network
+configurations are intolerant of IP options and can impede
+access to systems that use them as Smack does.
+
+The startup script etc-init.d-smack should be installed
+in /etc/init.d/smack and should be invoked early in the
+start-up process. On Fedora rc5.d/S02smack is recommended.
+This script ensures that certain devices have the correct
+Smack attributes and loads the Smack configuration if
+any is defined. This script invokes two programs that
+ensure configuration data is properly formatted. These
+programs are /usr/sbin/smackload and /usr/sin/smackcipso.
+The system will run just fine without these programs,
+but it will be difficult to set access rules properly.
+
+A version of "ls" that provides a "-M" option to display
+Smack labels on long listing is available.
+
+A hacked version of sshd that allows network logins by users
+with specific Smack labels is available. This version does
+not work for scp. You must set the /etc/ssh/sshd_config
+line:
+ UsePrivilegeSeparation no
+
+The format of /etc/smack/usr is:
+
+ username smack
+
+In keeping with the intent of Smack, configuration data is
+minimal and not strictly required. The most important
+configuration step is mounting the smackfs pseudo filesystem.
+
+Add this line to /etc/fstab:
+
+ smackfs /smack smackfs smackfsdef=* 0 0
+
+and create the /smack directory for mounting.
+
+Smack uses extended attributes (xattrs) to store file labels.
+The command to set a Smack label on a file is:
+
+ # attr -S -s SMACK64 -V "value" path
+
+NOTE: Smack labels are limited to 23 characters. The attr command
+ does not enforce this restriction and can be used to set
+ invalid Smack labels on files.
+
+If you don't do anything special all users will get the floor ("_")
+label when they log in. If you do want to log in via the hacked ssh
+at other labels use the attr command to set the smack value on the
+home directory and it's contents.
+
+You can add access rules in /etc/smack/accesses. They take the form:
+
+ subjectlabel objectlabel access
+
+access is a combination of the letters rwxa which specify the
+kind of access permitted a subject with subjectlabel on an
+object with objectlabel. If there is no rule no access is allowed.
+
+A process can see the smack label it is running with by
+reading /proc/self/attr/current. A privileged process can
+set the process smack by writing there.
+
+Look for additional programs on http://schaufler-ca.com
+
+From the Smack Whitepaper:
+
+The Simplified Mandatory Access Control Kernel
+
+Casey Schaufler
+casey@schaufler-ca.com
+
+Mandatory Access Control
+
+Computer systems employ a variety of schemes to constrain how information is
+shared among the people and services using the machine. Some of these schemes
+allow the program or user to decide what other programs or users are allowed
+access to pieces of data. These schemes are called discretionary access
+control mechanisms because the access control is specified at the discretion
+of the user. Other schemes do not leave the decision regarding what a user or
+program can access up to users or programs. These schemes are called mandatory
+access control mechanisms because you don't have a choice regarding the users
+or programs that have access to pieces of data.
+
+Bell & LaPadula
+
+From the middle of the 1980's until the turn of the century Mandatory Access
+Control (MAC) was very closely associated with the Bell & LaPadula security
+model, a mathematical description of the United States Department of Defense
+policy for marking paper documents. MAC in this form enjoyed a following
+within the Capital Beltway and Scandinavian supercomputer centers but was
+often sited as failing to address general needs.
+
+Domain Type Enforcement
+
+Around the turn of the century Domain Type Enforcement (DTE) became popular.
+This scheme organizes users, programs, and data into domains that are
+protected from each other. This scheme has been widely deployed as a component
+of popular Linux distributions. The administrative overhead required to
+maintain this scheme and the detailed understanding of the whole system
+necessary to provide a secure domain mapping leads to the scheme being
+disabled or used in limited ways in the majority of cases.
+
+Smack
+
+Smack is a Mandatory Access Control mechanism designed to provide useful MAC
+while avoiding the pitfalls of its predecessors. The limitations of Bell &
+LaPadula are addressed by providing a scheme whereby access can be controlled
+according to the requirements of the system and its purpose rather than those
+imposed by an arcane government policy. The complexity of Domain Type
+Enforcement and avoided by defining access controls in terms of the access
+modes already in use.
+
+Smack Terminology
+
+The jargon used to talk about Smack will be familiar to those who have dealt
+with other MAC systems and shouldn't be too difficult for the uninitiated to
+pick up. There are four terms that are used in a specific way and that are
+especially important:
+
+ Subject: A subject is an active entity on the computer system.
+ On Smack a subject is a task, which is in turn the basic unit
+ of execution.
+
+ Object: An object is a passive entity on the computer system.
+ On Smack files of all types, IPC, and tasks can be objects.
+
+ Access: Any attempt by a subject to put information into or get
+ information from an object is an access.
+
+ Label: Data that identifies the Mandatory Access Control
+ characteristics of a subject or an object.
+
+These definitions are consistent with the traditional use in the security
+community. There are also some terms from Linux that are likely to crop up:
+
+ Capability: A task that possesses a capability has permission to
+ violate an aspect of the system security policy, as identified by
+ the specific capability. A task that possesses one or more
+ capabilities is a privileged task, whereas a task with no
+ capabilities is an unprivileged task.
+
+ Privilege: A task that is allowed to violate the system security
+ policy is said to have privilege. As of this writing a task can
+ have privilege either by possessing capabilities or by having an
+ effective user of root.
+
+Smack Basics
+
+Smack is an extension to a Linux system. It enforces additional restrictions
+on what subjects can access which objects, based on the labels attached to
+each of the subject and the object.
+
+Labels
+
+Smack labels are ASCII character strings, one to twenty-three characters in
+length. Single character labels using special characters, that being anything
+other than a letter or digit, are reserved for use by the Smack development
+team. Smack labels are unstructured, case sensitive, and the only operation
+ever performed on them is comparison for equality. Smack labels cannot
+contain unprintable characters or the "/" (slash) character.
+
+There are some predefined labels:
+
+ _ Pronounced "floor", a single underscore character.
+ ^ Pronounced "hat", a single circumflex character.
+ * Pronounced "star", a single asterisk character.
+ ? Pronounced "huh", a single question mark character.
+
+Every task on a Smack system is assigned a label. System tasks, such as
+init(8) and systems daemons, are run with the floor ("_") label. User tasks
+are assigned labels according to the specification found in the
+/etc/smack/user configuration file.
+
+Access Rules
+
+Smack uses the traditional access modes of Linux. These modes are read,
+execute, write, and occasionally append. There are a few cases where the
+access mode may not be obvious. These include:
+
+ Signals: A signal is a write operation from the subject task to
+ the object task.
+ Internet Domain IPC: Transmission of a packet is considered a
+ write operation from the source task to the destination task.
+
+Smack restricts access based on the label attached to a subject and the label
+attached to the object it is trying to access. The rules enforced are, in
+order:
+
+ 1. Any access requested by a task labeled "*" is denied.
+ 2. A read or execute access requested by a task labeled "^"
+ is permitted.
+ 3. A read or execute access requested on an object labeled "_"
+ is permitted.
+ 4. Any access requested on an object labeled "*" is permitted.
+ 5. Any access requested by a task on an object with the same
+ label is permitted.
+ 6. Any access requested that is explicitly defined in the loaded
+ rule set is permitted.
+ 7. Any other access is denied.
+
+Smack Access Rules
+
+With the isolation provided by Smack access separation is simple. There are
+many interesting cases where limited access by subjects to objects with
+different labels is desired. One example is the familiar spy model of
+sensitivity, where a scientist working on a highly classified project would be
+able to read documents of lower classifications and anything she writes will
+be "born" highly classified. To accommodate such schemes Smack includes a
+mechanism for specifying rules allowing access between labels.
+
+Access Rule Format
+
+The format of an access rule is:
+
+ subject-label object-label access
+
+Where subject-label is the Smack label of the task, object-label is the Smack
+label of the thing being accessed, and access is a string specifying the sort
+of access allowed. The Smack labels are limited to 23 characters. The access
+specification is searched for letters that describe access modes:
+
+ a: indicates that append access should be granted.
+ r: indicates that read access should be granted.
+ w: indicates that write access should be granted.
+ x: indicates that execute access should be granted.
+
+Uppercase values for the specification letters are allowed as well.
+Access mode specifications can be in any order. Examples of acceptable rules
+are:
+
+ TopSecret Secret rx
+ Secret Unclass R
+ Manager Game x
+ User HR w
+ New Old rRrRr
+ Closed Off -
+
+Examples of unacceptable rules are:
+
+ Top Secret Secret rx
+ Ace Ace r
+ Odd spells waxbeans
+
+Spaces are not allowed in labels. Since a subject always has access to files
+with the same label specifying a rule for that case is pointless. Only
+valid letters (rwxaRWXA) and the dash ('-') character are allowed in
+access specifications. The dash is a placeholder, so "a-r" is the same
+as "ar". A lone dash is used to specify that no access should be allowed.
+
+Applying Access Rules
+
+The developers of Linux rarely define new sorts of things, usually importing
+schemes and concepts from other systems. Most often, the other systems are
+variants of Unix. Unix has many endearing properties, but consistency of
+access control models is not one of them. Smack strives to treat accesses as
+uniformly as is sensible while keeping with the spirit of the underlying
+mechanism.
+
+File system objects including files, directories, named pipes, symbolic links,
+and devices require access permissions that closely match those used by mode
+bit access. To open a file for reading read access is required on the file. To
+search a directory requires execute access. Creating a file with write access
+requires both read and write access on the containing directory. Deleting a
+file requires read and write access to the file and to the containing
+directory. It is possible that a user may be able to see that a file exists
+but not any of its attributes by the circumstance of having read access to the
+containing directory but not to the differently labeled file. This is an
+artifact of the file name being data in the directory, not a part of the file.
+
+IPC objects, message queues, semaphore sets, and memory segments exist in flat
+namespaces and access requests are only required to match the object in
+question.
+
+Process objects reflect tasks on the system and the Smack label used to access
+them is the same Smack label that the task would use for its own access
+attempts. Sending a signal via the kill() system call is a write operation
+from the signaler to the recipient. Debugging a process requires both reading
+and writing. Creating a new task is an internal operation that results in two
+tasks with identical Smack labels and requires no access checks.
+
+Sockets are data structures attached to processes and sending a packet from
+one process to another requires that the sender have write access to the
+receiver. The receiver is not required to have read access to the sender.
+
+Setting Access Rules
+
+The configuration file /etc/smack/accesses contains the rules to be set at
+system startup. The contents are written to the special file /smack/load.
+Rules can be written to /smack/load at any time and take effect immediately.
+For any pair of subject and object labels there can be only one rule, with the
+most recently specified overriding any earlier specification.
+
+The program smackload is provided to ensure data is formatted
+properly when written to /smack/load. This program reads lines
+of the form
+
+ subjectlabel objectlabel mode.
+
+Task Attribute
+
+The Smack label of a process can be read from /proc/<pid>/attr/current. A
+process can read its own Smack label from /proc/self/attr/current. A
+privileged process can change its own Smack label by writing to
+/proc/self/attr/current but not the label of another process.
+
+File Attribute
+
+The Smack label of a filesystem object is stored as an extended attribute
+named SMACK64 on the file. This attribute is in the security namespace. It can
+only be changed by a process with privilege.
+
+Privilege
+
+A process with CAP_MAC_OVERRIDE is privileged.
+
+Smack Networking
+
+As mentioned before, Smack enforces access control on network protocol
+transmissions. Every packet sent by a Smack process is tagged with its Smack
+label. This is done by adding a CIPSO tag to the header of the IP packet. Each
+packet received is expected to have a CIPSO tag that identifies the label and
+if it lacks such a tag the network ambient label is assumed. Before the packet
+is delivered a check is made to determine that a subject with the label on the
+packet has write access to the receiving process and if that is not the case
+the packet is dropped.
+
+CIPSO Configuration
+
+It is normally unnecessary to specify the CIPSO configuration. The default
+values used by the system handle all internal cases. Smack will compose CIPSO
+label values to match the Smack labels being used without administrative
+intervention. Unlabeled packets that come into the system will be given the
+ambient label.
+
+Smack requires configuration in the case where packets from a system that is
+not smack that speaks CIPSO may be encountered. Usually this will be a Trusted
+Solaris system, but there are other, less widely deployed systems out there.
+CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level,
+and a category set with each packet. The DOI is intended to identify a group
+of systems that use compatible labeling schemes, and the DOI specified on the
+smack system must match that of the remote system or packets will be
+discarded. The DOI is 3 by default. The value can be read from /smack/doi and
+can be changed by writing to /smack/doi.
+
+The label and category set are mapped to a Smack label as defined in
+/etc/smack/cipso.
+
+A Smack/CIPSO mapping has the form:
+
+ smack level [category [category]*]
+
+Smack does not expect the level or category sets to be related in any
+particular way and does not assume or assign accesses based on them. Some
+examples of mappings:
+
+ TopSecret 7
+ TS:A,B 7 1 2
+ SecBDE 5 2 4 6
+ RAFTERS 7 12 26
+
+The ":" and "," characters are permitted in a Smack label but have no special
+meaning.
+
+The mapping of Smack labels to CIPSO values is defined by writing to
+/smack/cipso. Again, the format of data written to this special file
+is highly restrictive, so the program smackcipso is provided to
+ensure the writes are done properly. This program takes mappings
+on the standard input and sends them to /smack/cipso properly.
+
+In addition to explicit mappings Smack supports direct CIPSO mappings. One
+CIPSO level is used to indicate that the category set passed in the packet is
+in fact an encoding of the Smack label. The level used is 250 by default. The
+value can be read from /smack/direct and changed by writing to /smack/direct.
+
+Socket Attributes
+
+There are two attributes that are associated with sockets. These attributes
+can only be set by privileged tasks, but any task can read them for their own
+sockets.
+
+ SMACK64IPIN: The Smack label of the task object. A privileged
+ program that will enforce policy may set this to the star label.
+
+ SMACK64IPOUT: The Smack label transmitted with outgoing packets.
+ A privileged program may set this to match the label of another
+ task with which it hopes to communicate.
+
+Writing Applications for Smack
+
+There are three sorts of applications that will run on a Smack system. How an
+application interacts with Smack will determine what it will have to do to
+work properly under Smack.
+
+Smack Ignorant Applications
+
+By far the majority of applications have no reason whatever to care about the
+unique properties of Smack. Since invoking a program has no impact on the
+Smack label associated with the process the only concern likely to arise is
+whether the process has execute access to the program.
+
+Smack Relevant Applications
+
+Some programs can be improved by teaching them about Smack, but do not make
+any security decisions themselves. The utility ls(1) is one example of such a
+program.
+
+Smack Enforcing Applications
+
+These are special programs that not only know about Smack, but participate in
+the enforcement of system policy. In most cases these are the programs that
+set up user sessions. There are also network services that provide information
+to processes running with various labels.
+
+File System Interfaces
+
+Smack maintains labels on file system objects using extended attributes. The
+Smack label of a file, directory, or other file system object can be obtained
+using getxattr(2).
+
+ len = getxattr("/", "security.SMACK64", value, sizeof (value));
+
+will put the Smack label of the root directory into value. A privileged
+process can set the Smack label of a file system object with setxattr(2).
+
+ len = strlen("Rubble");
+ rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
+
+will set the Smack label of /foo to "Rubble" if the program has appropriate
+privilege.
+
+Socket Interfaces
+
+The socket attributes can be read using fgetxattr(2).
+
+A privileged process can set the Smack label of outgoing packets with
+fsetxattr(2).
+
+ len = strlen("Rubble");
+ rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
+
+will set the Smack label "Rubble" on packets going out from the socket if the
+program has appropriate privilege.
+
+ rc = fsetxattr(fd, "security.SMACK64IPIN, "*", strlen("*"), 0);
+
+will set the Smack label "*" as the object label against which incoming
+packets will be checked if the program has appropriate privilege.
+
+Administration
+
+Smack supports some mount options:
+
+ smackfsdef=label: specifies the label to give files that lack
+ the Smack label extended attribute.
+
+ smackfsroot=label: specifies the label to assign the root of the
+ file system if it lacks the Smack extended attribute.
+
+ smackfshat=label: specifies a label that must have read access to
+ all labels set on the filesystem. Not yet enforced.
+
+ smackfsfloor=label: specifies a label to which all labels set on the
+ filesystem must have read access. Not yet enforced.
+
+These mount options apply to all file system types.
+
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 681e2b36195..08a1ed1cb5d 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -220,20 +220,8 @@ decreasing the likelihood of your MIME-attached change being accepted.
Exception: If your mailer is mangling patches then someone may ask
you to re-send them using MIME.
-
-WARNING: Some mailers like Mozilla send your messages with
----- message header ----
-Content-Type: text/plain; charset=us-ascii; format=flowed
----- message header ----
-The problem is that "format=flowed" makes some of the mailers
-on receiving side to replace TABs with spaces and do similar
-changes. Thus the patches from you can look corrupted.
-
-To fix this just make your mozilla defaults/pref/mailnews.js file to look like:
-pref("mailnews.send_plaintext_flowed", false); // RFC 2646=======
-pref("mailnews.display.disable_format_flowed_support", true);
-
-
+See Documentation/email-clients.txt for hints about configuring
+your e-mail client so that it sends your patches untouched.
8) E-mail size.
diff --git a/Documentation/acpi/method-tracing.txt b/Documentation/acpi/method-tracing.txt
new file mode 100644
index 00000000000..f6efb1ea559
--- /dev/null
+++ b/Documentation/acpi/method-tracing.txt
@@ -0,0 +1,26 @@
+/sys/module/acpi/parameters/:
+
+trace_method_name
+ The AML method name that the user wants to trace
+
+trace_debug_layer
+ The temporary debug_layer used when tracing the method.
+ Using 0xffffffff by default if it is 0.
+
+trace_debug_level
+ The temporary debug_level used when tracing the method.
+ Using 0x00ffffff by default if it is 0.
+
+trace_state
+ The status of the tracing feature.
+
+ "enabled" means this feature is enabled
+ and the AML method is traced every time it's executed.
+
+ "1" means this feature is enabled and the AML method
+ will only be traced during the next execution.
+
+ "disabled" means this feature is disabled.
+ Users can enable/disable this debug tracing feature by
+ "echo string > /sys/module/acpi/parameters/trace_state".
+ "string" should be one of "enable", "disable" and "1".
diff --git a/Documentation/arm/Sharp-LH/IOBarrier b/Documentation/arm/Sharp-LH/IOBarrier
index c0d8853672d..2e953e228f4 100644
--- a/Documentation/arm/Sharp-LH/IOBarrier
+++ b/Documentation/arm/Sharp-LH/IOBarrier
@@ -32,7 +32,7 @@ BARRIER IO before the access to the SMC chip because the AEN latch
only needs occurs after the SMC IO write cycle. The routines that
implement this work-around make an additional concession which is to
disable interrupts during the IO sequence. Other hardware devices
-(the LogicPD CPLD) have registers in the same the physical memory
+(the LogicPD CPLD) have registers in the same physical memory
region as the SMC chip. An interrupt might allow an access to one of
those registers while SMC IO is being performed.
diff --git a/Documentation/debugging-modules.txt b/Documentation/debugging-modules.txt
index 24029f65fc9..172ad4aec49 100644
--- a/Documentation/debugging-modules.txt
+++ b/Documentation/debugging-modules.txt
@@ -16,3 +16,7 @@ echo 'echo "$@" >> /tmp/modprobe.log' >> /tmp/modprobe
echo 'exec /sbin/modprobe "$@"' >> /tmp/modprobe
chmod a+x /tmp/modprobe
echo /tmp/modprobe > /proc/sys/kernel/modprobe
+
+Note that the above applies only when the *kernel* is requesting
+that the module be loaded -- it won't have any effect if that module
+is being loaded explicitly using "modprobe" from userspace.
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
index 2a97320ee17..83009fdcbbc 100644
--- a/Documentation/driver-model/platform.txt
+++ b/Documentation/driver-model/platform.txt
@@ -122,15 +122,15 @@ None the less, there are some APIs to support such legacy drivers. Avoid
using these calls except with such hotplug-deficient drivers.
struct platform_device *platform_device_alloc(
- char *name, unsigned id);
+ const char *name, int id);
You can use platform_device_alloc() to dynamically allocate a device, which
you will then initialize with resources and platform_device_register().
A better solution is usually:
struct platform_device *platform_device_register_simple(
- char *name, unsigned id,
- struct resource *res, unsigned nres);
+ const char *name, int id,
+ struct resource *res, unsigned int nres);
You can use platform_device_register_simple() as a one-step call to allocate
and register a device.
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
index 63883a89212..74832837025 100644
--- a/Documentation/fb/deferred_io.txt
+++ b/Documentation/fb/deferred_io.txt
@@ -7,10 +7,10 @@ IO. The following example may be a useful explanation of how one such setup
works:
- userspace app like Xfbdev mmaps framebuffer
-- deferred IO and driver sets up nopage and page_mkwrite handlers
+- deferred IO and driver sets up fault and page_mkwrite handlers
- userspace app tries to write to mmaped vaddress
-- we get pagefault and reach nopage handler
-- nopage handler finds and returns physical page
+- we get pagefault and reach fault handler
+- fault handler finds and returns physical page
- we get page_mkwrite where we add this page to a list
- schedule a workqueue task to be run after a delay
- app continues writing to that page with no additional cost. this is
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 181bff00516..68ce1300a36 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -156,22 +156,6 @@ Who: Arjan van de Ven <arjan@linux.intel.com>
---------------------------
-What: USB driver API moves to EXPORT_SYMBOL_GPL
-When: February 2008
-Files: include/linux/usb.h, drivers/usb/core/driver.c
-Why: The USB subsystem has changed a lot over time, and it has been
- possible to create userspace USB drivers using usbfs/libusb/gadgetfs
- that operate as fast as the USB bus allows. Because of this, the USB
- subsystem will not be allowing closed source kernel drivers to
- register with it, after this grace period is over. If anyone needs
- any help in converting their closed source drivers over to use the
- userspace filesystems, please contact the
- linux-usb-devel@lists.sourceforge.net mailing list, and the developers
- there will be glad to help you out.
-Who: Greg Kroah-Hartman <gregkh@suse.de>
-
----------------------------
-
What: vm_ops.nopage
When: Soon, provided in-kernel callers have been converted
Why: This interface is replaced by vm_ops.fault, but it has been around
@@ -224,13 +208,6 @@ Who: Randy Dunlap <randy.dunlap@oracle.com>
---------------------------
-What: drivers depending on OSS_OBSOLETE
-When: options in 2.6.23, code in 2.6.25
-Why: obsolete OSS drivers
-Who: Adrian Bunk <bunk@stusta.de>
-
----------------------------
-
What: libata spindown skipping and warning
When: Dec 2008
Why: Some halt(8) implementations synchronize caches for and spin
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index d1b98257d00..44c97e6accb 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -377,7 +377,7 @@ more explicit to have a method whereby userspace sees this divergence.
Rather than have a group where some items behave differently than
others, configfs provides a method whereby one or many subgroups are
automatically created inside the parent at its creation. Thus,
-mkdir("parent) results in "parent", "parent/subgroup1", up through
+mkdir("parent") results in "parent", "parent/subgroup1", up through
"parent/subgroupN". Items of type 1 can now be created in
"parent/subgroup1", and items of type N can be created in
"parent/subgroupN".
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index dac45c92d87..0f33c77bc14 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -1,6 +1,6 @@
Changes since 2.5.0:
----
+---
[recommended]
New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(),
@@ -10,7 +10,7 @@ Use them.
(sb_find_get_block() replaces 2.4's get_hash_table())
----
+---
[recommended]
New methods: ->alloc_inode() and ->destroy_inode().
@@ -28,7 +28,7 @@ Declare
Use FOO_I(inode) instead of &inode->u.foo_inode_i;
-Add foo_alloc_inode() and foo_destory_inode() - the former should allocate
+Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate
foo_inode_info and return the address of ->vfs_inode, the latter should free
FOO_I(inode) (see in-tree filesystems for examples).
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 194c8f35132..5681e2fa149 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -216,6 +216,7 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
priority priority level
nice nice level
num_threads number of threads
+ it_real_value (obsolete, always 0)
start_time time the process started after system boot
vsize virtual memory size
rss resident set memory size
@@ -1028,6 +1029,14 @@ nr_inodes
Denotes the number of inodes the system has allocated. This number will
grow and shrink dynamically.
+nr_open
+-------
+
+Denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
nr_free_inodes
--------------
@@ -1314,13 +1323,28 @@ for writeout by the pdflush daemons. It is expressed in 100'ths of a second.
Data which has been dirty in-memory for longer than this interval will be
written out next time a pdflush daemon wakes up.
+highmem_is_dirtyable
+--------------------
+
+Only present if CONFIG_HIGHMEM is set.
+
+This defaults to 0 (false), meaning that the ratios set above are calculated
+as a percentage of lowmem only. This protects against excessive scanning
+in page reclaim, swapping and general VM distress.
+
+Setting this to 1 can be useful on 32 bit machines where you want to make
+random changes within an MMAPed file that is larger than your available
+lowmem without causing large quantities of random IO. Is is safe if the
+behavior of all programs running on the machine is known and memory will
+not be otherwise stressed.
+
legacy_va_layout
----------------
If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel
will use the legacy (2.4) layout for all processes.
-lower_zone_protection
+lowmem_reserve_ratio
---------------------
For some specialised workloads on highmem machines it is dangerous for
@@ -1340,25 +1364,71 @@ captured into pinned user memory.
mechanism will also defend that region from allocations which could use
highmem or lowmem).
-The `lower_zone_protection' tunable determines how aggressive the kernel is
-in defending these lower zones. The default value is zero - no
-protection at all.
+The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
+in defending these lower zones.
If you have a machine which uses highmem or ISA DMA and your
applications are using mlock(), or if you are running with no swap then
-you probably should increase the lower_zone_protection setting.
-
-The units of this tunable are fairly vague. It is approximately equal
-to "megabytes," so setting lower_zone_protection=100 will protect around 100
-megabytes of the lowmem zone from user allocations. It will also make
-those 100 megabytes unavailable for use by applications and by
-pagecache, so there is a cost.
-
-The effects of this tunable may be observed by monitoring
-/proc/meminfo:LowFree. Write a single huge file and observe the point
-at which LowFree ceases to fall.
-
-A reasonable value for lower_zone_protection is 100.
+you probably should change the lowmem_reserve_ratio setting.
+
+The lowmem_reserve_ratio is an array. You can see them by reading this file.
+-
+% cat /proc/sys/vm/lowmem_reserve_ratio
+256 256 32
+-
+Note: # of this elements is one fewer than number of zones. Because the highest
+ zone's value is not necessary for following calculation.
+
+But, these values are not used directly. The kernel calculates # of protection
+pages for each zones from them. These are shown as array of protection pages
+in /proc/zoneinfo like followings. (This is an example of x86-64 box).
+Each zone has an array of protection pages like this.
+
+-
+Node 0, zone DMA
+ pages free 1355
+ min 3
+ low 3
+ high 4
+ :
+ :
+ numa_other 0
+ protection: (0, 2004, 2004, 2004)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ pagesets
+ cpu: 0 pcp: 0
+ :
+-
+These protections are added to score to judge whether this zone should be used
+for page allocation or should be reclaimed.
+
+In this example, if normal pages (index=2) are required to this DMA zone and
+pages_high is used for watermark, the kernel judges this zone should not be
+used because pages_free(1355) is smaller than watermark + protection[2]
+(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
+normal page requirement. If requirement is DMA zone(index=0), protection[0]
+(=0) is used.
+
+zone[i]'s protection[j] is calculated by following exprssion.
+
+(i < j):
+ zone[i]->protection[j]
+ = (total sums of present_pages from zone[i+1] to zone[j] on the node)
+ / lowmem_reserve_ratio[i];
+(i = j):
+ (should not be protected. = 0;
+(i > j):
+ (not necessary, but looks 0)
+
+The default values of lowmem_reserve_ratio[i] are
+ 256 (if zone[i] means DMA or DMA32 zone)
+ 32 (others).
+As above expression, they are reciprocal number of ratio.
+256 means 1/256. # of protection pages becomes about "0.39%" of total present
+pages of higher zones on the node.
+
+If you would like to protect more pages, smaller values are effective.
+The minimum value is 1 (1/1 -> 100%).
page-cluster
------------
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 339c6a4f220..7be232b44ee 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -118,7 +118,7 @@ All this differs from the old initrd in several ways:
with the new root (cd /newmount; mount --move . /; chroot .), attach
stdin/stdout/stderr to the new /dev/console, and exec the new init.
- Since this is a remarkably persnickity process (and involves deleting
+ Since this is a remarkably persnickety process (and involves deleting
commands before you can run them), the klibc package introduced a helper
program (utils/run_init.c) to do all this for you. Most other packages
(such as busybox) have named this command "switch_root".
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
index 18d23f9a18c..094f2d2f38b 100644
--- a/Documentation/filesystems/relay.txt
+++ b/Documentation/filesystems/relay.txt
@@ -140,7 +140,7 @@ close() decrements the channel buffer's refcount. When the refcount
In order for a user application to make use of relay files, the
host filesystem must be mounted. For example,
- mount -t debugfs debugfs /debug
+ mount -t debugfs debugfs /sys/kernel/debug
NOTE: the host filesystem doesn't need to be mounted for kernel
clients to create or use channels - it only needs to be
diff --git a/Documentation/fujitsu/frv/README.txt b/Documentation/frv/README.txt
index a984faa968e..a984faa968e 100644
--- a/Documentation/fujitsu/frv/README.txt
+++ b/Documentation/frv/README.txt
diff --git a/Documentation/fujitsu/frv/atomic-ops.txt b/Documentation/frv/atomic-ops.txt
index 96638e9b9fe..96638e9b9fe 100644
--- a/Documentation/fujitsu/frv/atomic-ops.txt
+++ b/Documentation/frv/atomic-ops.txt
diff --git a/Documentation/fujitsu/frv/booting.txt b/Documentation/frv/booting.txt
index 4e229056ef2..ace200b7c21 100644
--- a/Documentation/fujitsu/frv/booting.txt
+++ b/Documentation/frv/booting.txt
@@ -177,5 +177,5 @@ separated by spaces:
(*) vdc=...
This option configures the MB93493 companion chip visual display
- driver. Please see Documentation/fujitsu/mb93493/vdc.txt for more
+ driver. Please see Documentation/frv/mb93493/vdc.txt for more
information.
diff --git a/Documentation/fujitsu/frv/clock.txt b/Documentation/frv/clock.txt
index c72d350e177..c72d350e177 100644
--- a/Documentation/fujitsu/frv/clock.txt
+++ b/Documentation/frv/clock.txt
diff --git a/Documentation/fujitsu/frv/configuring.txt b/Documentation/frv/configuring.txt
index 36e76a2336f..36e76a2336f 100644
--- a/Documentation/fujitsu/frv/configuring.txt
+++ b/Documentation/frv/configuring.txt
diff --git a/Documentation/fujitsu/frv/features.txt b/Documentation/frv/features.txt
index fa20c0e7283..fa20c0e7283 100644
--- a/Documentation/fujitsu/frv/features.txt
+++ b/Documentation/frv/features.txt
diff --git a/Documentation/fujitsu/frv/gdbinit b/Documentation/frv/gdbinit
index 51517b6f307..51517b6f307 100644
--- a/Documentation/fujitsu/frv/gdbinit
+++ b/Documentation/frv/gdbinit
diff --git a/Documentation/fujitsu/frv/gdbstub.txt b/Documentation/frv/gdbstub.txt
index b92bfd902a4..b92bfd902a4 100644
--- a/Documentation/fujitsu/frv/gdbstub.txt
+++ b/Documentation/frv/gdbstub.txt
diff --git a/Documentation/fujitsu/frv/kernel-ABI.txt b/Documentation/frv/kernel-ABI.txt
index aaa1cec86f0..aaa1cec86f0 100644
--- a/Documentation/fujitsu/frv/kernel-ABI.txt
+++ b/Documentation/frv/kernel-ABI.txt
diff --git a/Documentation/fujitsu/frv/mmu-layout.txt b/Documentation/frv/mmu-layout.txt
index db10250df6b..db10250df6b 100644
--- a/Documentation/fujitsu/frv/mmu-layout.txt
+++ b/Documentation/frv/mmu-layout.txt
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 6bc2ba215df..8da724e2a0f 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -32,7 +32,7 @@ The exact capabilities of GPIOs vary between systems. Common options:
- Input values are likewise readable (1, 0). Some chips support readback
of pins configured as "output", which is very useful in such "wire-OR"
cases (to support bidirectional signaling). GPIO controllers may have
- input de-glitch logic, sometimes with software controls.
+ input de-glitch/debounce logic, sometimes with software controls.
- Inputs can often be used as IRQ signals, often edge triggered but
sometimes level triggered. Such IRQs may be configurable as system
@@ -60,10 +60,13 @@ used on a board that's wired differently. Only least-common-denominator
functionality can be very portable. Other features are platform-specific,
and that can be critical for glue logic.
-Plus, this doesn't define an implementation framework, just an interface.
+Plus, this doesn't require any implementation framework, just an interface.
One platform might implement it as simple inline functions accessing chip
registers; another might implement it by delegating through abstractions
-used for several very different kinds of GPIO controller.
+used for several very different kinds of GPIO controller. (There is some
+optional code supporting such an implementation strategy, described later
+in this document, but drivers acting as clients to the GPIO interface must
+not care how it's implemented.)
That said, if the convention is supported on their platform, drivers should
use it when possible. Platforms should declare GENERIC_GPIO support in
@@ -121,6 +124,11 @@ before tasking is enabled, as part of early board setup.
For output GPIOs, the value provided becomes the initial output value.
This helps avoid signal glitching during system startup.
+For compatibility with legacy interfaces to GPIOs, setting the direction
+of a GPIO implicitly requests that GPIO (see below) if it has not been
+requested already. That compatibility may be removed in the future;
+explicitly requesting GPIOs is strongly preferred.
+
Setting the direction can fail if the GPIO number is invalid, or when
that particular GPIO can't be used in that mode. It's generally a bad
idea to rely on boot firmware to have set the direction correctly, since
@@ -133,6 +141,7 @@ Spinlock-Safe GPIO access
-------------------------
Most GPIO controllers can be accessed with memory read/write instructions.
That doesn't need to sleep, and can safely be done from inside IRQ handlers.
+(That includes hardirq contexts on RT kernels.)
Use these calls to access such GPIOs:
@@ -145,7 +154,7 @@ Use these calls to access such GPIOs:
The values are boolean, zero for low, nonzero for high. When reading the
value of an output pin, the value returned should be what's seen on the
pin ... that won't always match the specified output value, because of
-issues including wire-OR and output latencies.
+issues including open-drain signaling and output latencies.
The get/set calls have no error returns because "invalid GPIO" should have
been reported earlier from gpio_direction_*(). However, note that not all
@@ -170,7 +179,8 @@ get to the head of a queue to transmit a command and get its response.
This requires sleeping, which can't be done from inside IRQ handlers.
Platforms that support this type of GPIO distinguish them from other GPIOs
-by returning nonzero from this call:
+by returning nonzero from this call (which requires a valid GPIO number,
+either explicitly or implicitly requested):
int gpio_cansleep(unsigned gpio);
@@ -209,8 +219,11 @@ before tasking is enabled, as part of early board setup.
These calls serve two basic purposes. One is marking the signals which
are actually in use as GPIOs, for better diagnostics; systems may have
several hundred potential GPIOs, but often only a dozen are used on any
-given board. Another is to catch conflicts between drivers, reporting
-errors when drivers wrongly think they have exclusive use of that signal.
+given board. Another is to catch conflicts, identifying errors when
+(a) two or more drivers wrongly think they have exclusive use of that
+signal, or (b) something wrongly believes it's safe to remove drivers
+needed to manage a signal that's in active use. That is, requesting a
+GPIO can serve as a kind of lock.
These two calls are optional because not not all current Linux platforms
offer such functionality in their GPIO support; a valid implementation
@@ -223,6 +236,9 @@ Note that requesting a GPIO does NOT cause it to be configured in any
way; it just marks that GPIO as in use. Separate code must handle any
pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
+Also note that it's your responsibility to have stopped using a GPIO
+before you free it.
+
GPIOs mapped to IRQs
--------------------
@@ -238,7 +254,7 @@ map between them using calls like:
Those return either the corresponding number in the other namespace, or
else a negative errno code if the mapping can't be done. (For example,
-some GPIOs can't used as IRQs.) It is an unchecked error to use a GPIO
+some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO
number that wasn't set up as an input using gpio_direction_input(), or
to use an IRQ number that didn't originally come from gpio_to_irq().
@@ -299,17 +315,110 @@ Related to multiplexing is configuration and enabling of the pullups or
pulldowns integrated on some platforms. Not all platforms support them,
or support them in the same way; and any given board might use external
pullups (or pulldowns) so that the on-chip ones should not be used.
+(When a circuit needs 5 kOhm, on-chip 100 kOhm resistors won't do.)
There are other system-specific mechanisms that are not specified here,
like the aforementioned options for input de-glitching and wire-OR output.
Hardware may support reading or writing GPIOs in gangs, but that's usually
configuration dependent: for GPIOs sharing the same bank. (GPIOs are
commonly grouped in banks of 16 or 32, with a given SOC having several such
-banks.) Some systems can trigger IRQs from output GPIOs. Code relying on
-such mechanisms will necessarily be nonportable.
+banks.) Some systems can trigger IRQs from output GPIOs, or read values
+from pins not managed as GPIOs. Code relying on such mechanisms will
+necessarily be nonportable.
-Dynamic definition of GPIOs is not currently supported; for example, as
+Dynamic definition of GPIOs is not currently standard; for example, as
a side effect of configuring an add-on board with some GPIO expanders.
These calls are purely for kernel space, but a userspace API could be built
-on top of it.
+on top of them.
+
+
+GPIO implementor's framework (OPTIONAL)
+=======================================
+As noted earlier, there is an optional implementation framework making it
+easier for platforms to support different kinds of GPIO controller using
+the same programming interface.
+
+As a debugging aid, if debugfs is available a /sys/kernel/debug/gpio file
+will be found there. That will list all the controllers registered through
+this framework, and the state of the GPIOs currently in use.
+
+
+Controller Drivers: gpio_chip
+-----------------------------
+In this framework each GPIO controller is packaged as a "struct gpio_chip"
+with information common to each controller of that type:
+
+ - methods to establish GPIO direction
+ - methods used to access GPIO values
+ - flag saying whether calls to its methods may sleep
+ - optional debugfs dump method (showing extra state like pullup config)
+ - label for diagnostics
+
+There is also per-instance data, which may come from device.platform_data:
+the number of its first GPIO, and how many GPIOs it exposes.
+
+The code implementing a gpio_chip should support multiple instances of the
+controller, possibly using the driver model. That code will configure each
+gpio_chip and issue gpiochip_add(). Removing a GPIO controller should be
+rare; use gpiochip_remove() when it is unavoidable.
+
+Most often a gpio_chip is part of an instance-specific structure with state
+not exposed by the GPIO interfaces, such as addressing, power management,
+and more. Chips such as codecs will have complex non-GPIO state,
+
+Any debugfs dump method should normally ignore signals which haven't been
+requested as GPIOs. They can use gpiochip_is_requested(), which returns
+either NULL or the label associated with that GPIO when it was requested.
+
+
+Platform Support
+----------------
+To support this framework, a platform's Kconfig will "select HAVE_GPIO_LIB"
+and arrange that its <asm/gpio.h> includes <asm-generic/gpio.h> and defines
+three functions: gpio_get_value(), gpio_set_value(), and gpio_cansleep().
+They may also want to provide a custom value for ARCH_NR_GPIOS.
+
+Trivial implementations of those functions can directly use framework
+code, which always dispatches through the gpio_chip:
+
+ #define gpio_get_value __gpio_get_value
+ #define gpio_set_value __gpio_set_value
+ #define gpio_cansleep __gpio_cansleep
+
+Fancier implementations could instead define those as inline functions with
+logic optimizing access to specific SOC-based GPIOs. For example, if the
+referenced GPIO is the constant "12", getting or setting its value could
+cost as little as two or three instructions, never sleeping. When such an
+optimization is not possible those calls must delegate to the framework
+code, costing at least a few dozen instructions. For bitbanged I/O, such
+instruction savings can be significant.
+
+For SOCs, platform-specific code defines and registers gpio_chip instances
+for each bank of on-chip GPIOs. Those GPIOs should be numbered/labeled to
+match chip vendor documentation, and directly match board schematics. They
+may well start at zero and go up to a platform-specific limit. Such GPIOs
+are normally integrated into platform initialization to make them always be
+available, from arch_initcall() or earlier; they can often serve as IRQs.
+
+
+Board Support
+-------------
+For external GPIO controllers -- such as I2C or SPI expanders, ASICs, multi
+function devices, FPGAs or CPLDs -- most often board-specific code handles
+registering controller devices and ensures that their drivers know what GPIO
+numbers to use with gpiochip_add(). Their numbers often start right after
+platform-specific GPIOs.
+
+For example, board setup code could create structures identifying the range
+of GPIOs that chip will expose, and passes them to each GPIO expander chip
+using platform_data. Then the chip driver's probe() routine could pass that
+data to gpiochip_add().
+
+Initialization order can be important. For example, when a device relies on
+an I2C-based GPIO, its probe() routine should only be called after that GPIO
+becomes available. That may mean the device should not be registered until
+calls for that GPIO can work. One way to address such dependencies is for
+such gpio_chip controllers to provide setup() and teardown() callbacks to
+board specific code; those board specific callbacks would register devices
+once all the necessary resources are available.
diff --git a/Documentation/i2c/chips/pca9539 b/Documentation/i2c/chips/pca9539
index c4fce6a1353..1d81c530c4a 100644
--- a/Documentation/i2c/chips/pca9539
+++ b/Documentation/i2c/chips/pca9539
@@ -1,6 +1,9 @@
Kernel driver pca9539
=====================
+NOTE: this driver is deprecated and will be dropped soon, use
+drivers/gpio/pca9539.c instead.
+
Supported chips:
* Philips PCA9539
Prefix: 'pca9539'
diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c
index 773a814d409..d23610fb2ff 100644
--- a/Documentation/ia64/aliasing-test.c
+++ b/Documentation/ia64/aliasing-test.c
@@ -16,6 +16,7 @@
#include <fcntl.h>
#include <fnmatch.h>
#include <string.h>
+#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -65,7 +66,7 @@ int scan_tree(char *path, char *file, off_t offset, size_t length, int touch)
{
struct dirent **namelist;
char *name, *path2;
- int i, n, r, rc, result = 0;
+ int i, n, r, rc = 0, result = 0;
struct stat buf;
n = scandir(path, &namelist, 0, alphasort);
@@ -113,7 +114,7 @@ skip:
free(namelist[i]);
}
free(namelist);
- return rc;
+ return result;
}
char buf[1024];
@@ -149,7 +150,7 @@ int scan_rom(char *path, char *file)
{
struct dirent **namelist;
char *name, *path2;
- int i, n, r, rc, result = 0;
+ int i, n, r, rc = 0, result = 0;
struct stat buf;
n = scandir(path, &namelist, 0, alphasort);
@@ -180,7 +181,7 @@ int scan_rom(char *path, char *file)
* important thing is that no MCA happened.
*/
if (rc > 0)
- fprintf(stderr, "PASS: %s read %ld bytes\n", path2, rc);
+ fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc);
else {
fprintf(stderr, "PASS: %s not readable\n", path2);
return rc;
@@ -201,10 +202,10 @@ skip:
free(namelist[i]);
}
free(namelist);
- return rc;
+ return result;
}
-int main()
+int main(void)
{
int rc;
@@ -256,4 +257,6 @@ int main()
scan_tree("/proc/bus/pci", "??.?", 0xA0000, 0x20000, 0);
scan_tree("/proc/bus/pci", "??.?", 0xC0000, 0x40000, 1);
scan_tree("/proc/bus/pci", "??.?", 0, 1024*1024, 0);
+
+ return rc;
}
diff --git a/Documentation/ide/ChangeLog.ide-tape.1995-2002 b/Documentation/ide/ChangeLog.ide-tape.1995-2002
new file mode 100644
index 00000000000..877fac8770b
--- /dev/null
+++ b/Documentation/ide/ChangeLog.ide-tape.1995-2002
@@ -0,0 +1,257 @@
+/*
+ * Ver 0.1 Nov 1 95 Pre-working code :-)
+ * Ver 0.2 Nov 23 95 A short backup (few megabytes) and restore procedure
+ * was successful ! (Using tar cvf ... on the block
+ * device interface).
+ * A longer backup resulted in major swapping, bad
+ * overall Linux performance and eventually failed as
+ * we received non serial read-ahead requests from the
+ * buffer cache.
+ * Ver 0.3 Nov 28 95 Long backups are now possible, thanks to the
+ * character device interface. Linux's responsiveness
+ * and performance doesn't seem to be much affected
+ * from the background backup procedure.
+ * Some general mtio.h magnetic tape operations are
+ * now supported by our character device. As a result,
+ * popular tape utilities are starting to work with
+ * ide tapes :-)
+ * The following configurations were tested:
+ * 1. An IDE ATAPI TAPE shares the same interface
+ * and irq with an IDE ATAPI CDROM.
+ * 2. An IDE ATAPI TAPE shares the same interface
+ * and irq with a normal IDE disk.
+ * Both configurations seemed to work just fine !
+ * However, to be on the safe side, it is meanwhile
+ * recommended to give the IDE TAPE its own interface
+ * and irq.
+ * The one thing which needs to be done here is to
+ * add a "request postpone" feature to ide.c,
+ * so that we won't have to wait for the tape to finish
+ * performing a long media access (DSC) request (such
+ * as a rewind) before we can access the other device
+ * on the same interface. This effect doesn't disturb
+ * normal operation most of the time because read/write
+ * requests are relatively fast, and once we are
+ * performing one tape r/w request, a lot of requests
+ * from the other device can be queued and ide.c will
+ * service all of them after this single tape request.
+ * Ver 1.0 Dec 11 95 Integrated into Linux 1.3.46 development tree.
+ * On each read / write request, we now ask the drive
+ * if we can transfer a constant number of bytes
+ * (a parameter of the drive) only to its buffers,
+ * without causing actual media access. If we can't,
+ * we just wait until we can by polling the DSC bit.
+ * This ensures that while we are not transferring
+ * more bytes than the constant referred to above, the
+ * interrupt latency will not become too high and
+ * we won't cause an interrupt timeout, as happened
+ * occasionally in the previous version.
+ * While polling for DSC, the current request is
+ * postponed and ide.c is free to handle requests from
+ * the other device. This is handled transparently to
+ * ide.c. The hwgroup locking method which was used
+ * in the previous version was removed.
+ * Use of new general features which are provided by
+ * ide.c for use with atapi devices.
+ * (Programming done by Mark Lord)
+ * Few potential bug fixes (Again, suggested by Mark)
+ * Single character device data transfers are now
+ * not limited in size, as they were before.
+ * We are asking the tape about its recommended
+ * transfer unit and send a larger data transfer
+ * as several transfers of the above size.
+ * For best results, use an integral number of this
+ * basic unit (which is shown during driver
+ * initialization). I will soon add an ioctl to get
+ * this important parameter.
+ * Our data transfer buffer is allocated on startup,
+ * rather than before each data transfer. This should
+ * ensure that we will indeed have a data buffer.
+ * Ver 1.1 Dec 14 95 Fixed random problems which occurred when the tape
+ * shared an interface with another device.
+ * (poll_for_dsc was a complete mess).
+ * Removed some old (non-active) code which had
+ * to do with supporting buffer cache originated
+ * requests.
+ * The block device interface can now be opened, so
+ * that general ide driver features like the unmask
+ * interrupts flag can be selected with an ioctl.
+ * This is the only use of the block device interface.
+ * New fast pipelined operation mode (currently only on
+ * writes). When using the pipelined mode, the
+ * throughput can potentially reach the maximum
+ * tape supported throughput, regardless of the
+ * user backup program. On my tape drive, it sometimes
+ * boosted performance by a factor of 2. Pipelined
+ * mode is enabled by default, but since it has a few
+ * downfalls as well, you may want to disable it.
+ * A short explanation of the pipelined operation mode
+ * is available below.
+ * Ver 1.2 Jan 1 96 Eliminated pipelined mode race condition.
+ * Added pipeline read mode. As a result, restores
+ * are now as fast as backups.
+ * Optimized shared interface behavior. The new behavior
+ * typically results in better IDE bus efficiency and
+ * higher tape throughput.
+ * Pre-calculation of the expected read/write request
+ * service time, based on the tape's parameters. In
+ * the pipelined operation mode, this allows us to
+ * adjust our polling frequency to a much lower value,
+ * and thus to dramatically reduce our load on Linux,
+ * without any decrease in performance.
+ * Implemented additional mtio.h operations.
+ * The recommended user block size is returned by
+ * the MTIOCGET ioctl.
+ * Additional minor changes.
+ * Ver 1.3 Feb 9 96 Fixed pipelined read mode bug which prevented the
+ * use of some block sizes during a restore procedure.
+ * The character device interface will now present a
+ * continuous view of the media - any mix of block sizes
+ * during a backup/restore procedure is supported. The
+ * driver will buffer the requests internally and
+ * convert them to the tape's recommended transfer
+ * unit, making performance almost independent of the
+ * chosen user block size.
+ * Some improvements in error recovery.
+ * By cooperating with ide-dma.c, bus mastering DMA can
+ * now sometimes be used with IDE tape drives as well.
+ * Bus mastering DMA has the potential to dramatically
+ * reduce the CPU's overhead when accessing the device,
+ * and can be enabled by using hdparm -d1 on the tape's
+ * block device interface. For more info, read the
+ * comments in ide-dma.c.
+ * Ver 1.4 Mar 13 96 Fixed serialize support.
+ * Ver 1.5 Apr 12 96 Fixed shared interface operation, broken in 1.3.85.
+ * Fixed pipelined read mode inefficiency.
+ * Fixed nasty null dereferencing bug.
+ * Ver 1.6 Aug 16 96 Fixed FPU usage in the driver.
+ * Fixed end of media bug.
+ * Ver 1.7 Sep 10 96 Minor changes for the CONNER CTT8000-A model.
+ * Ver 1.8 Sep 26 96 Attempt to find a better balance between good
+ * interactive response and high system throughput.
+ * Ver 1.9 Nov 5 96 Automatically cross encountered filemarks rather
+ * than requiring an explicit FSF command.
+ * Abort pending requests at end of media.
+ * MTTELL was sometimes returning incorrect results.
+ * Return the real block size in the MTIOCGET ioctl.
+ * Some error recovery bug fixes.
+ * Ver 1.10 Nov 5 96 Major reorganization.
+ * Reduced CPU overhead a bit by eliminating internal
+ * bounce buffers.
+ * Added module support.
+ * Added multiple tape drives support.
+ * Added partition support.
+ * Rewrote DSC handling.
+ * Some portability fixes.
+ * Removed ide-tape.h.
+ * Additional minor changes.
+ * Ver 1.11 Dec 2 96 Bug fix in previous DSC timeout handling.
+ * Use ide_stall_queue() for DSC overlap.
+ * Use the maximum speed rather than the current speed
+ * to compute the request service time.
+ * Ver 1.12 Dec 7 97 Fix random memory overwriting and/or last block data
+ * corruption, which could occur if the total number
+ * of bytes written to the tape was not an integral
+ * number of tape blocks.
+ * Add support for INTERRUPT DRQ devices.
+ * Ver 1.13 Jan 2 98 Add "speed == 0" work-around for HP COLORADO 5GB
+ * Ver 1.14 Dec 30 98 Partial fixes for the Sony/AIWA tape drives.
+ * Replace cli()/sti() with hwgroup spinlocks.
+ * Ver 1.15 Mar 25 99 Fix SMP race condition by replacing hwgroup
+ * spinlock with private per-tape spinlock.
+ * Ver 1.16 Sep 1 99 Add OnStream tape support.
+ * Abort read pipeline on EOD.
+ * Wait for the tape to become ready in case it returns
+ * "in the process of becoming ready" on open().
+ * Fix zero padding of the last written block in
+ * case the tape block size is larger than PAGE_SIZE.
+ * Decrease the default disconnection time to tn.
+ * Ver 1.16e Oct 3 99 Minor fixes.
+ * Ver 1.16e1 Oct 13 99 Patches by Arnold Niessen,
+ * niessen@iae.nl / arnold.niessen@philips.com
+ * GO-1) Undefined code in idetape_read_position
+ * according to Gadi's email
+ * AJN-1) Minor fix asc == 11 should be asc == 0x11
+ * in idetape_issue_packet_command (did effect
+ * debugging output only)
+ * AJN-2) Added more debugging output, and
+ * added ide-tape: where missing. I would also
+ * like to add tape->name where possible
+ * AJN-3) Added different debug_level's
+ * via /proc/ide/hdc/settings
+ * "debug_level" determines amount of debugging output;
+ * can be changed using /proc/ide/hdx/settings
+ * 0 : almost no debugging output
+ * 1 : 0+output errors only
+ * 2 : 1+output all sensekey/asc
+ * 3 : 2+follow all chrdev related procedures
+ * 4 : 3+follow all procedures
+ * 5 : 4+include pc_stack rq_stack info
+ * 6 : 5+USE_COUNT updates
+ * AJN-4) Fixed timeout for retension in idetape_queue_pc_tail
+ * from 5 to 10 minutes
+ * AJN-5) Changed maximum number of blocks to skip when
+ * reading tapes with multiple consecutive write
+ * errors from 100 to 1000 in idetape_get_logical_blk
+ * Proposed changes to code:
+ * 1) output "logical_blk_num" via /proc
+ * 2) output "current_operation" via /proc
+ * 3) Either solve or document the fact that `mt rewind' is
+ * required after reading from /dev/nhtx to be
+ * able to rmmod the idetape module;
+ * Also, sometimes an application finishes but the
+ * device remains `busy' for some time. Same cause ?
+ * Proposed changes to release-notes:
+ * 4) write a simple `quickstart' section in the
+ * release notes; I volunteer if you don't want to
+ * 5) include a pointer to video4linux in the doc
+ * to stimulate video applications
+ * 6) release notes lines 331 and 362: explain what happens
+ * if the application data rate is higher than 1100 KB/s;
+ * similar approach to lower-than-500 kB/s ?
+ * 7) 6.6 Comparison; wouldn't it be better to allow different
+ * strategies for read and write ?
+ * Wouldn't it be better to control the tape buffer
+ * contents instead of the bandwidth ?
+ * 8) line 536: replace will by would (if I understand
+ * this section correctly, a hypothetical and unwanted situation
+ * is being described)
+ * Ver 1.16f Dec 15 99 Change place of the secondary OnStream header frames.
+ * Ver 1.17 Nov 2000 / Jan 2001 Marcel Mol, marcel@mesa.nl
+ * - Add idetape_onstream_mode_sense_tape_parameter_page
+ * function to get tape capacity in frames: tape->capacity.
+ * - Add support for DI-50 drives( or any DI- drive).
+ * - 'workaround' for read error/blank block around block 3000.
+ * - Implement Early warning for end of media for Onstream.
+ * - Cosmetic code changes for readability.
+ * - Idetape_position_tape should not use SKIP bit during
+ * Onstream read recovery.
+ * - Add capacity, logical_blk_num and first/last_frame_position
+ * to /proc/ide/hd?/settings.
+ * - Module use count was gone in the Linux 2.4 driver.
+ * Ver 1.17a Apr 2001 Willem Riede osst@riede.org
+ * - Get drive's actual block size from mode sense block descriptor
+ * - Limit size of pipeline
+ * Ver 1.17b Oct 2002 Alan Stern <stern@rowland.harvard.edu>
+ * Changed IDETAPE_MIN_PIPELINE_STAGES to 1 and actually used
+ * it in the code!
+ * Actually removed aborted stages in idetape_abort_pipeline
+ * instead of just changing the command code.
+ * Made the transfer byte count for Request Sense equal to the
+ * actual length of the data transfer.
+ * Changed handling of partial data transfers: they do not
+ * cause DMA errors.
+ * Moved initiation of DMA transfers to the correct place.
+ * Removed reference to unallocated memory.
+ * Made __idetape_discard_read_pipeline return the number of
+ * sectors skipped, not the number of stages.
+ * Replaced errant kfree() calls with __idetape_kfree_stage().
+ * Fixed off-by-one error in testing the pipeline length.
+ * Fixed handling of filemarks in the read pipeline.
+ * Small code optimization for MTBSF and MTBSFM ioctls.
+ * Don't try to unlock the door during device close if is
+ * already unlocked!
+ * Cosmetic fixes to miscellaneous debugging output messages.
+ * Set the minimum /proc/ide/hd?/settings values for "pipeline",
+ * "pipeline_min", and "pipeline_max" to 1.
+ */
diff --git a/Documentation/ide/ide-tape.txt b/Documentation/ide/ide-tape.txt
new file mode 100644
index 00000000000..658f271a373
--- /dev/null
+++ b/Documentation/ide/ide-tape.txt
@@ -0,0 +1,146 @@
+/*
+ * IDE ATAPI streaming tape driver.
+ *
+ * This driver is a part of the Linux ide driver.
+ *
+ * The driver, in co-operation with ide.c, basically traverses the
+ * request-list for the block device interface. The character device
+ * interface, on the other hand, creates new requests, adds them
+ * to the request-list of the block device, and waits for their completion.
+ *
+ * Pipelined operation mode is now supported on both reads and writes.
+ *
+ * The block device major and minor numbers are determined from the
+ * tape's relative position in the ide interfaces, as explained in ide.c.
+ *
+ * The character device interface consists of the following devices:
+ *
+ * ht0 major 37, minor 0 first IDE tape, rewind on close.
+ * ht1 major 37, minor 1 second IDE tape, rewind on close.
+ * ...
+ * nht0 major 37, minor 128 first IDE tape, no rewind on close.
+ * nht1 major 37, minor 129 second IDE tape, no rewind on close.
+ * ...
+ *
+ * The general magnetic tape commands compatible interface, as defined by
+ * include/linux/mtio.h, is accessible through the character device.
+ *
+ * General ide driver configuration options, such as the interrupt-unmask
+ * flag, can be configured by issuing an ioctl to the block device interface,
+ * as any other ide device.
+ *
+ * Our own ide-tape ioctl's can be issued to either the block device or
+ * the character device interface.
+ *
+ * Maximal throughput with minimal bus load will usually be achieved in the
+ * following scenario:
+ *
+ * 1. ide-tape is operating in the pipelined operation mode.
+ * 2. No buffering is performed by the user backup program.
+ *
+ * Testing was done with a 2 GB CONNER CTMA 4000 IDE ATAPI Streaming Tape Drive.
+ *
+ * Here are some words from the first releases of hd.c, which are quoted
+ * in ide.c and apply here as well:
+ *
+ * | Special care is recommended. Have Fun!
+ *
+ *
+ * An overview of the pipelined operation mode.
+ *
+ * In the pipelined write mode, we will usually just add requests to our
+ * pipeline and return immediately, before we even start to service them. The
+ * user program will then have enough time to prepare the next request while
+ * we are still busy servicing previous requests. In the pipelined read mode,
+ * the situation is similar - we add read-ahead requests into the pipeline,
+ * before the user even requested them.
+ *
+ * The pipeline can be viewed as a "safety net" which will be activated when
+ * the system load is high and prevents the user backup program from keeping up
+ * with the current tape speed. At this point, the pipeline will get
+ * shorter and shorter but the tape will still be streaming at the same speed.
+ * Assuming we have enough pipeline stages, the system load will hopefully
+ * decrease before the pipeline is completely empty, and the backup program
+ * will be able to "catch up" and refill the pipeline again.
+ *
+ * When using the pipelined mode, it would be best to disable any type of
+ * buffering done by the user program, as ide-tape already provides all the
+ * benefits in the kernel, where it can be done in a more efficient way.
+ * As we will usually not block the user program on a request, the most
+ * efficient user code will then be a simple read-write-read-... cycle.
+ * Any additional logic will usually just slow down the backup process.
+ *
+ * Using the pipelined mode, I get a constant over 400 KBps throughput,
+ * which seems to be the maximum throughput supported by my tape.
+ *
+ * However, there are some downfalls:
+ *
+ * 1. We use memory (for data buffers) in proportional to the number
+ * of pipeline stages (each stage is about 26 KB with my tape).
+ * 2. In the pipelined write mode, we cheat and postpone error codes
+ * to the user task. In read mode, the actual tape position
+ * will be a bit further than the last requested block.
+ *
+ * Concerning (1):
+ *
+ * 1. We allocate stages dynamically only when we need them. When
+ * we don't need them, we don't consume additional memory. In
+ * case we can't allocate stages, we just manage without them
+ * (at the expense of decreased throughput) so when Linux is
+ * tight in memory, we will not pose additional difficulties.
+ *
+ * 2. The maximum number of stages (which is, in fact, the maximum
+ * amount of memory) which we allocate is limited by the compile
+ * time parameter IDETAPE_MAX_PIPELINE_STAGES.
+ *
+ * 3. The maximum number of stages is a controlled parameter - We
+ * don't start from the user defined maximum number of stages
+ * but from the lower IDETAPE_MIN_PIPELINE_STAGES (again, we
+ * will not even allocate this amount of stages if the user
+ * program can't handle the speed). We then implement a feedback
+ * loop which checks if the pipeline is empty, and if it is, we
+ * increase the maximum number of stages as necessary until we
+ * reach the optimum value which just manages to keep the tape
+ * busy with minimum allocated memory or until we reach
+ * IDETAPE_MAX_PIPELINE_STAGES.
+ *
+ * Concerning (2):
+ *
+ * In pipelined write mode, ide-tape can not return accurate error codes
+ * to the user program since we usually just add the request to the
+ * pipeline without waiting for it to be serviced. In case an error
+ * occurs, I will report it on the next user request.
+ *
+ * In the pipelined read mode, subsequent read requests or forward
+ * filemark spacing will perform correctly, as we preserve all blocks
+ * and filemarks which we encountered during our excess read-ahead.
+ *
+ * For accurate tape positioning and error reporting, disabling
+ * pipelined mode might be the best option.
+ *
+ * You can enable/disable/tune the pipelined operation mode by adjusting
+ * the compile time parameters below.
+ *
+ *
+ * Possible improvements.
+ *
+ * 1. Support for the ATAPI overlap protocol.
+ *
+ * In order to maximize bus throughput, we currently use the DSC
+ * overlap method which enables ide.c to service requests from the
+ * other device while the tape is busy executing a command. The
+ * DSC overlap method involves polling the tape's status register
+ * for the DSC bit, and servicing the other device while the tape
+ * isn't ready.
+ *
+ * In the current QIC development standard (December 1995),
+ * it is recommended that new tape drives will *in addition*
+ * implement the ATAPI overlap protocol, which is used for the
+ * same purpose - efficient use of the IDE bus, but is interrupt
+ * driven and thus has much less CPU overhead.
+ *
+ * ATAPI overlap is likely to be supported in most new ATAPI
+ * devices, including new ATAPI cdroms, and thus provides us
+ * a method by which we can achieve higher throughput when
+ * sharing a (fast) ATA-2 disk with any (slow) new ATAPI device.
+ */
diff --git a/Documentation/initrd.txt b/Documentation/initrd.txt
index 74f68b35f7c..1ba84f3584e 100644
--- a/Documentation/initrd.txt
+++ b/Documentation/initrd.txt
@@ -85,7 +85,7 @@ involve special block devices or loopbacks; you merely create a directory on
disk with the desired initrd content, cd to that directory, and run (as an
example):
-find . | cpio --quiet -c -o | gzip -9 -n > /boot/imagefile.img
+find . | cpio --quiet -H newc -o | gzip -9 -n > /boot/imagefile.img
Examining the contents of an existing image file is just as simple:
diff --git a/Documentation/ja_JP/stable_kernel_rules.txt b/Documentation/ja_JP/stable_kernel_rules.txt
new file mode 100644
index 00000000000..17d87519e46
--- /dev/null
+++ b/Documentation/ja_JP/stable_kernel_rules.txt
@@ -0,0 +1,79 @@
+NOTE:
+This is Japanese translated version of "Documentation/stable_kernel_rules.txt".
+This one is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
+and JF Project team <www.linux.or.jp/JF>.
+If you find difference with original file or problem in translation,
+please contact maintainer of this file or JF project.
+
+Please also note that purpose of this file is easier to read for non
+English natives and do no intended to fork. So, if you have any
+comment or update of this file, please try to update Original(English)
+file at first.
+
+==================================
+これは、
+linux-2.6.24/Documentation/stable_kernel_rules.txt
+の和訳です。
+
+翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
+翻訳日: 2007/12/30
+翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com>
+校正者: 武井伸光さん、<takei at webmasters dot gr dot jp>
+ かねこさん (Seiji Kaneko) <skaneko at a2 dot mbn dot or dot jp>
+ 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp>
+ 野口さん (Kenji Noguchi) <tokyo246 at gmail dot com>
+ 神宮信太郎さん <jin at libjingu dot jp>
+==================================
+
+ずっと知りたかった Linux 2.6 -stable リリースの全て
+
+"-stable" ツリーにどのような種類のパッチが受け入れられるか、どのような
+ものが受け入れられないか、についての規則-
+
+ - 明らかに正しく、テストされているものでなければならない。
+ - 文脈(変更行の前後)を含めて 100 行より大きくてはいけない。
+ - ただ一個のことだけを修正しているべき。
+ - 皆を悩ませている本物のバグを修正しなければならない。("これはバグで
+ あるかもしれないが..." のようなものではない)
+ - ビルドエラー(CONFIG_BROKENになっているものを除く), oops, ハング、デー
+ タ破壊、現実のセキュリティ問題、その他 "ああ、これはダメだね"という
+ ようなものを修正しなければならない。短く言えば、重大な問題。
+ - どのように競合状態が発生するかの説明も一緒に書かれていない限り、
+ "理論的には競合状態になる"ようなものは不可。
+ - いかなる些細な修正も含めることはできない。(スペルの修正、空白のクリー
+ ンアップなど)
+ - 対応するサブシステムメンテナが受け入れたものでなければならない。
+ - Documentation/SubmittingPatches の規則に従ったものでなければならない。
+
+-stable ツリーにパッチを送付する手続き-
+
+ - 上記の規則に従っているかを確認した後に、stable@kernel.org にパッチ
+ を送る。
+ - 送信者はパッチがキューに受け付けられた際には ACK を、却下された場合
+ には NAK を受け取る。この反応は開発者たちのスケジュールによって、数
+ 日かかる場合がある。
+ - もし受け取られたら、パッチは他の開発者たちのレビューのために
+ -stable キューに追加される。
+ - セキュリティパッチはこのエイリアス (stable@kernel.org) に送られるべ
+ きではなく、代わりに security@kernel.org のアドレスに送られる。
+
+レビューサイクル-
+
+ - -stable メンテナがレビューサイクルを決めるとき、パッチはレビュー委
+ 員会とパッチが影響する領域のメンテナ(提供者がその領域のメンテナで無
+ い限り)に送られ、linux-kernel メーリングリストにCCされる。
+ - レビュー委員会は 48時間の間に ACK か NAK を出す。
+ - もしパッチが委員会のメンバから却下れるか、メンテナ達やメンバが気付
+ かなかった問題が持ちあがり、linux-kernel メンバがパッチに異議を唱え
+ た場合には、パッチはキューから削除される。
+ - レビューサイクルの最後に、ACK を受けたパッチは最新の -stable リリー
+ スに追加され、その後に新しい -stable リリースが行われる。
+ - セキュリティパッチは、通常のレビューサイクルを通らず、セキュリティ
+ カーネルチームから直接 -stable ツリーに受け付けられる。
+ この手続きの詳細については kernel security チームに問い合わせること。
+
+レビュー委員会-
+
+ - この委員会は、このタスクについて活動する多くのボランティアと、少数の
+ 非ボランティアのカーネル開発者達で構成されている。
+
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 92c40d17435..8ea41b6e6a8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -147,8 +147,10 @@ and is between 256 and 4096 characters. It is defined in the file
default: 0
acpi_sleep= [HW,ACPI] Sleep options
- Format: { s3_bios, s3_mode }
- See Documentation/power/video.txt
+ Format: { s3_bios, s3_mode, s3_beep }
+ See Documentation/power/video.txt for s3_bios and s3_mode.
+ s3_beep is for debugging; it makes the PC's speaker beep
+ as soon as the kernel's real-mode entry point is called.
acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
Format: { level | edge | high | low }
@@ -168,6 +170,11 @@ and is between 256 and 4096 characters. It is defined in the file
acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
Format: <irq>,<irq>...
+ acpi_new_pts_ordering [HW,ACPI]
+ Enforce the ACPI 2.0 ordering of the _PTS control
+ method wrt putting devices into low power states
+ default: pre ACPI 2.0 ordering of _PTS
+
acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
@@ -544,7 +551,7 @@ and is between 256 and 4096 characters. It is defined in the file
1 will print _a lot_ more information - normally
only useful to kernel developers.
- decnet= [HW,NET]
+ decnet.addr= [HW,NET]
Format: <area>[,<node>]
See also Documentation/networking/decnet.txt.
@@ -775,6 +782,9 @@ and is between 256 and 4096 characters. It is defined in the file
loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same
as idle=poll.
+ ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
+ Claim all unknown PCI IDE storage controllers.
+
ignore_loglevel [KNL]
Ignore loglevel setting - this will print /all/
kernel messages to the console. Useful for debugging.
@@ -1556,14 +1566,17 @@ and is between 256 and 4096 characters. It is defined in the file
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
See Documentation/ramdisk.txt.
- rcu.blimit= [KNL,BOOT] Set maximum number of finished
- RCU callbacks to process in one batch.
+ rcupdate.blimit= [KNL,BOOT]
+ Set maximum number of finished RCU callbacks to process
+ in one batch.
- rcu.qhimark= [KNL,BOOT] Set threshold of queued
+ rcupdate.qhimark= [KNL,BOOT]
+ Set threshold of queued
RCU callbacks over which batch limiting is disabled.
- rcu.qlowmark= [KNL,BOOT] Set threshold of queued
- RCU callbacks below which batch limiting is re-enabled.
+ rcupdate.qlowmark= [KNL,BOOT]
+ Set threshold of queued RCU callbacks below which
+ batch limiting is re-enabled.
rdinit= [KNL]
Format: <full_path>
@@ -1883,9 +1896,6 @@ and is between 256 and 4096 characters. It is defined in the file
st= [HW,SCSI] SCSI tape parameters (buffers, etc.)
See Documentation/scsi/st.txt.
- st0x= [HW,SCSI]
- See header of drivers/scsi/seagate.c.
-
sti= [PARISC,HW]
Format: <num>
Set the STI (builtin display/keyboard on the HP-PARISC
@@ -1970,9 +1980,6 @@ and is between 256 and 4096 characters. It is defined in the file
tipar.delay= [HW,PPT]
Set inter-bit delay in microseconds (default 10).
- tmc8xx= [HW,SCSI]
- See header of drivers/scsi/seagate.c.
-
tmscsim= [HW,SCSI]
See comment before function dc390_setup() in
drivers/scsi/tmscsim.c.
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 53a63890aea..30c101761d0 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -96,7 +96,9 @@ or in registers (e.g., for x86_64 or for an i386 fastcall function).
The jprobe will work in either case, so long as the handler's
prototype matches that of the probed function.
-1.3 How Does a Return Probe Work?
+1.3 Return Probes
+
+1.3.1 How Does a Return Probe Work?
When you call register_kretprobe(), Kprobes establishes a kprobe at
the entry to the function. When the probed function is called and this
@@ -107,9 +109,9 @@ At boot time, Kprobes registers a kprobe at the trampoline.
When the probed function executes its return instruction, control
passes to the trampoline and that probe is hit. Kprobes' trampoline
-handler calls the user-specified handler associated with the kretprobe,
-then sets the saved instruction pointer to the saved return address,
-and that's where execution resumes upon return from the trap.
+handler calls the user-specified return handler associated with the
+kretprobe, then sets the saved instruction pointer to the saved return
+address, and that's where execution resumes upon return from the trap.
While the probed function is executing, its return address is
stored in an object of type kretprobe_instance. Before calling
@@ -131,6 +133,30 @@ zero when the return probe is registered, and is incremented every
time the probed function is entered but there is no kretprobe_instance
object available for establishing the return probe.
+1.3.2 Kretprobe entry-handler
+
+Kretprobes also provides an optional user-specified handler which runs
+on function entry. This handler is specified by setting the entry_handler
+field of the kretprobe struct. Whenever the kprobe placed by kretprobe at the
+function entry is hit, the user-defined entry_handler, if any, is invoked.
+If the entry_handler returns 0 (success) then a corresponding return handler
+is guaranteed to be called upon function return. If the entry_handler
+returns a non-zero error then Kprobes leaves the return address as is, and
+the kretprobe has no further effect for that particular function instance.
+
+Multiple entry and return handler invocations are matched using the unique
+kretprobe_instance object associated with them. Additionally, a user
+may also specify per return-instance private data to be part of each
+kretprobe_instance object. This is especially useful when sharing private
+data between corresponding user entry and return handlers. The size of each
+private data object can be specified at kretprobe registration time by
+setting the data_size field of the kretprobe struct. This data can be
+accessed through the data field of each kretprobe_instance object.
+
+In case probed function is entered but there is no kretprobe_instance
+object available, then in addition to incrementing the nmissed count,
+the user entry_handler invocation is also skipped.
+
2. Architectures Supported
Kprobes, jprobes, and return probes are implemented on the following
@@ -274,6 +300,8 @@ of interest:
- ret_addr: the return address
- rp: points to the corresponding kretprobe object
- task: points to the corresponding task struct
+- data: points to per return-instance private data; see "Kretprobe
+ entry-handler" for details.
The regs_return_value(regs) macro provides a simple abstraction to
extract the return value from the appropriate register as defined by
@@ -556,23 +584,52 @@ report failed calls to sys_open().
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/ktime.h>
+
+/* per-instance private data */
+struct my_data {
+ ktime_t entry_stamp;
+};
static const char *probed_func = "sys_open";
-/* Return-probe handler: If the probed function fails, log the return value. */
-static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+/* Timestamp function entry. */
+static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ struct my_data *data;
+
+ if(!current->mm)
+ return 1; /* skip kernel threads */
+
+ data = (struct my_data *)ri->data;
+ data->entry_stamp = ktime_get();
+ return 0;
+}
+
+/* If the probed function failed, log the return value and duration.
+ * Duration may turn out to be zero consistently, depending upon the
+ * granularity of time accounting on the platform. */
+static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
int retval = regs_return_value(regs);
+ struct my_data *data = (struct my_data *)ri->data;
+ s64 delta;
+ ktime_t now;
+
if (retval < 0) {
- printk("%s returns %d\n", probed_func, retval);
+ now = ktime_get();
+ delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
+ printk("%s: return val = %d (duration = %lld ns)\n",
+ probed_func, retval, delta);
}
return 0;
}
static struct kretprobe my_kretprobe = {
- .handler = ret_handler,
- /* Probe up to 20 instances concurrently. */
- .maxactive = 20
+ .handler = return_handler,
+ .entry_handler = entry_handler,
+ .data_size = sizeof(struct my_data),
+ .maxactive = 20, /* probe up to 20 instances concurrently */
};
static int __init kretprobe_init(void)
@@ -584,7 +641,7 @@ static int __init kretprobe_init(void)
printk("register_kretprobe failed, returned %d\n", ret);
return -1;
}
- printk("Planted return probe at %p\n", my_kretprobe.kp.addr);
+ printk("Kretprobe active on %s\n", my_kretprobe.kp.symbol_name);
return 0;
}
@@ -594,7 +651,7 @@ static void __exit kretprobe_exit(void)
printk("kretprobe unregistered\n");
/* nmissed > 0 suggests that maxactive was set too low. */
printk("Missed probing %d instances of %s\n",
- my_kretprobe.nmissed, probed_func);
+ my_kretprobe.nmissed, probed_func);
}
module_init(kretprobe_init)
diff --git a/Documentation/kref.txt b/Documentation/kref.txt
index f38b59d00c6..130b6e87aa7 100644
--- a/Documentation/kref.txt
+++ b/Documentation/kref.txt
@@ -141,10 +141,10 @@ The last rule (rule 3) is the nastiest one to handle. Say, for
instance, you have a list of items that are each kref-ed, and you wish
to get the first one. You can't just pull the first item off the list
and kref_get() it. That violates rule 3 because you are not already
-holding a valid pointer. You must add locks or semaphores. For
-instance:
+holding a valid pointer. You must add a mutex (or some other lock).
+For instance:
-static DECLARE_MUTEX(sem);
+static DEFINE_MUTEX(mutex);
static LIST_HEAD(q);
struct my_data
{
@@ -155,12 +155,12 @@ struct my_data
static struct my_data *get_entry()
{
struct my_data *entry = NULL;
- down(&sem);
+ mutex_lock(&mutex);
if (!list_empty(&q)) {
entry = container_of(q.next, struct my_q_entry, link);
kref_get(&entry->refcount);
}
- up(&sem);
+ mutex_unlock(&mutex);
return entry;
}
@@ -174,9 +174,9 @@ static void release_entry(struct kref *ref)
static void put_entry(struct my_data *entry)
{
- down(&sem);
+ mutex_lock(&mutex);
kref_put(&entry->refcount, release_entry);
- up(&sem);
+ mutex_unlock(&mutex);
}
The kref_put() return value is useful if you do not want to hold the
@@ -191,13 +191,13 @@ static void release_entry(struct kref *ref)
static void put_entry(struct my_data *entry)
{
- down(&sem);
+ mutex_lock(&mutex);
if (kref_put(&entry->refcount, release_entry)) {
list_del(&entry->link);
- up(&sem);
+ mutex_unlock(&mutex);
kfree(entry);
} else
- up(&sem);
+ mutex_unlock(&mutex);
}
This is really more useful if you have to call other routines as part
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 6c8a2386cd5..0f23d67f958 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -34,6 +34,8 @@
#include <zlib.h>
#include <assert.h>
#include <sched.h>
+#include <limits.h>
+#include <stddef.h>
#include "linux/lguest_launcher.h"
#include "linux/virtio_config.h"
#include "linux/virtio_net.h"
@@ -99,13 +101,11 @@ struct device_list
/* The descriptor page for the devices. */
u8 *descpage;
- /* The tail of the last descriptor. */
- unsigned int desc_used;
-
/* A single linked list of devices. */
struct device *dev;
- /* ... And an end pointer so we can easily append new devices */
- struct device **lastdev;
+ /* And a pointer to the last device for easy append and also for
+ * configuration appending. */
+ struct device *lastdev;
};
/* The list of Guest devices, based on command line arguments. */
@@ -191,7 +191,14 @@ static void *_convert(struct iovec *iov, size_t size, size_t align,
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
-#define le64_to_cpu(v32) (v64)
+#define le64_to_cpu(v64) (v64)
+
+/* The device virtqueue descriptors are followed by feature bitmasks. */
+static u8 *get_feature_bits(struct device *dev)
+{
+ return (u8 *)(dev->desc + 1)
+ + dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+}
/*L:100 The Launcher code itself takes us out into userspace, that scary place
* where pointers run wild and free! Unfortunately, like most userspace
@@ -914,21 +921,58 @@ static void enable_fd(int fd, struct virtqueue *vq)
write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
}
+/* Resetting a device is fairly easy. */
+static void reset_device(struct device *dev)
+{
+ struct virtqueue *vq;
+
+ verbose("Resetting device %s\n", dev->name);
+ /* Clear the status. */
+ dev->desc->status = 0;
+
+ /* Clear any features they've acked. */
+ memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
+ dev->desc->feature_len);
+
+ /* Zero out the virtqueues. */
+ for (vq = dev->vq; vq; vq = vq->next) {
+ memset(vq->vring.desc, 0,
+ vring_size(vq->config.num, getpagesize()));
+ vq->last_avail_idx = 0;
+ }
+}
+
/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
static void handle_output(int fd, unsigned long addr)
{
struct device *i;
struct virtqueue *vq;
- /* Check each virtqueue. */
+ /* Check each device and virtqueue. */
for (i = devices.dev; i; i = i->next) {
+ /* Notifications to device descriptors reset the device. */
+ if (from_guest_phys(addr) == i->desc) {
+ reset_device(i);
+ return;
+ }
+
+ /* Notifications to virtqueues mean output has occurred. */
for (vq = i->vq; vq; vq = vq->next) {
- if (vq->config.pfn == addr/getpagesize()
- && vq->handle_output) {
- verbose("Output to %s\n", vq->dev->name);
- vq->handle_output(fd, vq);
+ if (vq->config.pfn != addr/getpagesize())
+ continue;
+
+ /* Guest should acknowledge (and set features!) before
+ * using the device. */
+ if (i->desc->status == 0) {
+ warnx("%s gave early output", i->name);
return;
}
+
+ if (strcmp(vq->dev->name, "console") != 0)
+ verbose("Output to %s\n", vq->dev->name);
+ if (vq->handle_output)
+ vq->handle_output(fd, vq);
+ return;
}
}
@@ -986,54 +1030,44 @@ static void handle_input(int fd)
*
* All devices need a descriptor so the Guest knows it exists, and a "struct
* device" so the Launcher can keep track of it. We have common helper
- * routines to allocate them.
- *
- * This routine allocates a new "struct lguest_device_desc" from descriptor
- * table just above the Guest's normal memory. It returns a pointer to that
- * descriptor. */
-static struct lguest_device_desc *new_dev_desc(u16 type)
-{
- struct lguest_device_desc *d;
+ * routines to allocate and manage them. */
- /* We only have one page for all the descriptors. */
- if (devices.desc_used + sizeof(*d) > getpagesize())
- errx(1, "Too many devices");
-
- /* We don't need to set config_len or status: page is 0 already. */
- d = (void *)devices.descpage + devices.desc_used;
- d->type = type;
- devices.desc_used += sizeof(*d);
-
- return d;
+/* The layout of the device page is a "struct lguest_device_desc" followed by a
+ * number of virtqueue descriptors, then two sets of feature bits, then an
+ * array of configuration bytes. This routine returns the configuration
+ * pointer. */
+static u8 *device_config(const struct device *dev)
+{
+ return (void *)(dev->desc + 1)
+ + dev->desc->num_vq * sizeof(struct lguest_vqconfig)
+ + dev->desc->feature_len * 2;
}
-/* Each device descriptor is followed by some configuration information.
- * Each configuration field looks like: u8 type, u8 len, [... len bytes...].
- *
- * This routine adds a new field to an existing device's descriptor. It only
- * works for the last device, but that's OK because that's how we use it. */
-static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
+/* This routine allocates a new "struct lguest_device_desc" from descriptor
+ * table page just above the Guest's normal memory. It returns a pointer to
+ * that descriptor. */
+static struct lguest_device_desc *new_dev_desc(u16 type)
{
- /* This is the last descriptor, right? */
- assert(devices.descpage + devices.desc_used
- == (u8 *)(dev->desc + 1) + dev->desc->config_len);
+ struct lguest_device_desc d = { .type = type };
+ void *p;
- /* We only have one page of device descriptions. */
- if (devices.desc_used + 2 + len > getpagesize())
- errx(1, "Too many devices");
+ /* Figure out where the next device config is, based on the last one. */
+ if (devices.lastdev)
+ p = device_config(devices.lastdev)
+ + devices.lastdev->desc->config_len;
+ else
+ p = devices.descpage;
- /* Copy in the new config header: type then length. */
- devices.descpage[devices.desc_used++] = type;
- devices.descpage[devices.desc_used++] = len;
- memcpy(devices.descpage + devices.desc_used, c, len);
- devices.desc_used += len;
+ /* We only have one page for all the descriptors. */
+ if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
+ errx(1, "Too many devices");
- /* Update the device descriptor length: two byte head then data. */
- dev->desc->config_len += 2 + len;
+ /* p might not be aligned, so we memcpy in. */
+ return memcpy(p, &d, sizeof(d));
}
-/* This routine adds a virtqueue to a device. We specify how many descriptors
- * the virtqueue is to have. */
+/* Each device descriptor is followed by the description of its virtqueues. We
+ * specify how many descriptors the virtqueue is to have. */
static void add_virtqueue(struct device *dev, unsigned int num_descs,
void (*handle_output)(int fd, struct virtqueue *me))
{
@@ -1059,9 +1093,15 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
/* Initialize the vring. */
vring_init(&vq->vring, num_descs, p, getpagesize());
- /* Add the configuration information to this device's descriptor. */
- add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
- sizeof(vq->config), &vq->config);
+ /* Append virtqueue to this device's descriptor. We use
+ * device_config() to get the end of the device's current virtqueues;
+ * we check that we haven't added any config or feature information
+ * yet, otherwise we'd be overwriting them. */
+ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
+ memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+ dev->desc->num_vq++;
+
+ verbose("Virtqueue page %#lx\n", to_guest_phys(p));
/* Add to tail of list, so dev->vq is first vq, dev->vq->next is
* second. */
@@ -1072,11 +1112,41 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
* virtqueue. */
vq->handle_output = handle_output;
- /* Set the "Don't Notify Me" flag if we don't have a handler */
+ /* As an optimization, set the advisory "Don't Notify Me" flag if we
+ * don't have a handler */
if (!handle_output)
vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
}
+/* The first half of the feature bitmask is for us to advertise features. The
+ * second half if for the Guest to accept features. */
+static void add_feature(struct device *dev, unsigned bit)
+{
+ u8 *features = get_feature_bits(dev);
+
+ /* We can't extend the feature bits once we've added config bytes */
+ if (dev->desc->feature_len <= bit / CHAR_BIT) {
+ assert(dev->desc->config_len == 0);
+ dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+ }
+
+ features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
+}
+
+/* This routine sets the configuration fields for an existing device's
+ * descriptor. It only works for the last device, but that's OK because that's
+ * how we use it. */
+static void set_config(struct device *dev, unsigned len, const void *conf)
+{
+ /* Check we haven't overflowed our single page. */
+ if (device_config(dev) + len > devices.descpage + getpagesize())
+ errx(1, "Too many devices");
+
+ /* Copy in the config information, and store the length. */
+ memcpy(device_config(dev), conf, len);
+ dev->desc->config_len = len;
+}
+
/* This routine does all the creation and setup of a new device, including
* calling new_dev_desc() to allocate the descriptor and device memory. */
static struct device *new_device(const char *name, u16 type, int fd,
@@ -1084,14 +1154,6 @@ static struct device *new_device(const char *name, u16 type, int fd,
{
struct device *dev = malloc(sizeof(*dev));
- /* Append to device list. Prepending to a single-linked list is
- * easier, but the user expects the devices to be arranged on the bus
- * in command-line order. The first network device on the command line
- * is eth0, the first block device /dev/vda, etc. */
- *devices.lastdev = dev;
- dev->next = NULL;
- devices.lastdev = &dev->next;
-
/* Now we populate the fields one at a time. */
dev->fd = fd;
/* If we have an input handler for this file descriptor, then we add it
@@ -1102,6 +1164,17 @@ static struct device *new_device(const char *name, u16 type, int fd,
dev->handle_input = handle_input;
dev->name = name;
dev->vq = NULL;
+
+ /* Append to device list. Prepending to a single-linked list is
+ * easier, but the user expects the devices to be arranged on the bus
+ * in command-line order. The first network device on the command line
+ * is eth0, the first block device /dev/vda, etc. */
+ if (devices.lastdev)
+ devices.lastdev->next = dev;
+ else
+ devices.dev = dev;
+ devices.lastdev = dev;
+
return dev;
}
@@ -1226,7 +1299,7 @@ static void setup_tun_net(const char *arg)
int netfd, ipfd;
u32 ip;
const char *br_name = NULL;
- u8 hwaddr[6];
+ struct virtio_net_config conf;
/* We open the /dev/net/tun device and tell it we want a tap device. A
* tap device is like a tun device, only somehow different. To tell
@@ -1265,12 +1338,13 @@ static void setup_tun_net(const char *arg)
ip = str2ip(arg);
/* Set up the tun device, and get the mac address for the interface. */
- configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
+ configure_device(ipfd, ifr.ifr_name, ip, conf.mac);
/* Tell Guest what MAC address to use. */
- add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
+ add_feature(dev, VIRTIO_NET_F_MAC);
+ set_config(dev, sizeof(conf), &conf);
- /* We don't seed the socket any more; setup is done. */
+ /* We don't need the socket any more; setup is done. */
close(ipfd);
verbose("device %u: tun net %u.%u.%u.%u\n",
@@ -1458,8 +1532,7 @@ static void setup_block_file(const char *filename)
struct device *dev;
struct vblk_info *vblk;
void *stack;
- u64 cap;
- unsigned int val;
+ struct virtio_blk_config conf;
/* This is the pipe the I/O thread will use to tell us I/O is done. */
pipe(p);
@@ -1477,14 +1550,18 @@ static void setup_block_file(const char *filename)
vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+ /* We support barriers. */
+ add_feature(dev, VIRTIO_BLK_F_BARRIER);
+
/* Tell Guest how many sectors this device has. */
- cap = cpu_to_le64(vblk->len / 512);
- add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
+ conf.capacity = cpu_to_le64(vblk->len / 512);
/* Tell Guest not to put in too many descriptors at once: two are used
* for the in and out elements. */
- val = cpu_to_le32(VIRTQUEUE_NUM - 2);
- add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
+ add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
+ conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
+
+ set_config(dev, sizeof(conf), &conf);
/* The I/O thread writes to this end of the pipe when done. */
vblk->done_fd = p[1];
@@ -1505,7 +1582,7 @@ static void setup_block_file(const char *filename)
close(vblk->workpipe[0]);
verbose("device %u: virtblock %llu sectors\n",
- devices.device_num, cap);
+ devices.device_num, le64_to_cpu(conf.capacity));
}
/* That's the end of device setup. :*/
@@ -1610,12 +1687,12 @@ int main(int argc, char *argv[])
/* First we initialize the device list. Since console and network
* device receive input from a file descriptor, we keep an fdset
* (infds) and the maximum fd number (max_infd) with the head of the
- * list. We also keep a pointer to the last device, for easy appending
- * to the list. Finally, we keep the next interrupt number to hand out
- * (1: remember that 0 is used by the timer). */
+ * list. We also keep a pointer to the last device. Finally, we keep
+ * the next interrupt number to hand out (1: remember that 0 is used by
+ * the timer). */
FD_ZERO(&devices.infds);
devices.max_infd = -1;
- devices.lastdev = &devices.dev;
+ devices.lastdev = NULL;
devices.next_irq = 1;
cpu_id = 0;
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 5818628207b..396cdd982c2 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -416,6 +416,16 @@ also have
sectors in total that could need to be processed. The two
numbers are separated by a '/' thus effectively showing one
value, a fraction of the process that is complete.
+ A 'select' on this attribute will return when resync completes,
+ when it reaches the current sync_max (below) and possibly at
+ other times.
+
+ sync_max
+ This is a number of sectors at which point a resync/recovery
+ process will pause. When a resync is active, the value can
+ only ever be increased, never decreased. The value of 'max'
+ effectively disables the limit.
+
sync_speed
This shows the current actual speed, in K/sec, of the current
diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt
index badb7480ea6..d8968958d83 100644
--- a/Documentation/networking/decnet.txt
+++ b/Documentation/networking/decnet.txt
@@ -60,7 +60,7 @@ operation of the local communications in any other way though.
The kernel command line takes options looking like the following:
- decnet=1,2
+ decnet.addr=1,2
the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels
and early 2.3.xx kernels, you must use a comma when specifying the
diff --git a/Documentation/pci.txt b/Documentation/pci.txt
index 7754f5aea4e..72b20c63959 100644
--- a/Documentation/pci.txt
+++ b/Documentation/pci.txt
@@ -274,8 +274,6 @@ the PCI device by calling pci_enable_device(). This will:
o allocate an IRQ (if BIOS did not).
NOTE: pci_enable_device() can fail! Check the return value.
-NOTE2: Also see pci_enable_device_bars() below. Drivers can
- attempt to enable only a subset of BARs they need.
[ OS BUG: we don't check resource allocations before enabling those
resources. The sequence would make more sense if we called
@@ -605,40 +603,7 @@ device lists. This is still possible but discouraged.
-10. pci_enable_device_bars() and Legacy I/O Port space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Large servers may not be able to provide I/O port resources to all PCI
-devices. I/O Port space is only 64KB on Intel Architecture[1] and is
-likely also fragmented since the I/O base register of PCI-to-PCI
-bridge will usually be aligned to a 4KB boundary[2]. On such systems,
-pci_enable_device() and pci_request_region() will fail when
-attempting to enable I/O Port regions that don't have I/O Port
-resources assigned.
-
-Fortunately, many PCI devices which request I/O Port resources also
-provide access to the same registers via MMIO BARs. These devices can
-be handled without using I/O port space and the drivers typically
-offer a CONFIG_ option to only use MMIO regions
-(e.g. CONFIG_TULIP_MMIO). PCI devices typically provide I/O port
-interface for legacy OSes and will work when I/O port resources are not
-assigned. The "PCI Local Bus Specification Revision 3.0" discusses
-this on p.44, "IMPLEMENTATION NOTE".
-
-If your PCI device driver doesn't need I/O port resources assigned to
-I/O Port BARs, you should use pci_enable_device_bars() instead of
-pci_enable_device() in order not to enable I/O port regions for the
-corresponding devices. In addition, you should use
-pci_request_selected_regions() and pci_release_selected_regions()
-instead of pci_request_regions()/pci_release_regions() in order not to
-request/release I/O port regions for the corresponding devices.
-
-[1] Some systems support 64KB I/O port space per PCI segment.
-[2] Some PCI-to-PCI bridges support optional 1KB aligned I/O base.
-
-
-
-11. MMIO Space and "Write Posting"
+10. MMIO Space and "Write Posting"
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converting a driver from using I/O Port space to using MMIO space
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 4739c5c3fac..96f155e6875 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -33,8 +33,8 @@ This file details changes in 2.6 which affect PCMCIA card driver authors:
and can be used (e.g. for SET_NETDEV_DEV) by using
handle_to_dev(client_handle_t * handle).
-* Convert internal I/O port addresses to unsigned long (as of 2.6.11)
- ioaddr_t should be replaced by kio_addr_t in PCMCIA card drivers.
+* Convert internal I/O port addresses to unsigned int (as of 2.6.11)
+ ioaddr_t should be replaced by unsigned int in PCMCIA card drivers.
* irq_mask and irq_list parameters (as of 2.6.11)
The irq_mask and irq_list parameters should no longer be used in
diff --git a/Documentation/pm_qos_interface.txt b/Documentation/pm_qos_interface.txt
new file mode 100644
index 00000000000..49adb1a3351
--- /dev/null
+++ b/Documentation/pm_qos_interface.txt
@@ -0,0 +1,59 @@
+PM quality of Service interface.
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Currently we have {cpu_dma_latency, network_latency, network_throughput} as the
+initial set of pm_qos parameters.
+
+The infrastructure exposes multiple misc device nodes one per implemented
+parameter. The set of parameters implement is defined by pm_qos_power_init()
+and pm_qos_params.h. This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requirements is maintained along with
+an aggregated target value. The aggregated target value is updated with
+changes to the requirement list or elements of the list. Typically the
+aggregated target value is simply the max or min of the requirement values held
+in the parameter list elements.
+
+From kernel mode the use of this interface is simple:
+pm_qos_add_requirement(param_id, name, target_value):
+Will insert a named element in the list for that identified PM_QOS parameter
+with the target value. Upon change to this list the new target is recomputed
+and any registered notifiers are called only if the target value is now
+different.
+
+pm_qos_update_requirement(param_id, name, new_target_value):
+Will search the list identified by the param_id for the named list element and
+then update its target value, calling the notification tree if the aggregated
+target is changed. with that name is already registered.
+
+pm_qos_remove_requirement(param_id, name):
+Will search the identified list for the named element and remove it, after
+removal it will update the aggregate target and call the notification tree if
+the target was changed as a result of removing the named requirement.
+
+
+From user mode:
+Only processes can register a pm_qos requirement. To provide for automatic
+cleanup for process the interface requires the process to register its
+parameter requirements in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+requirement on the parameter. The name of the requirement is "process_<PID>"
+derived from the current->pid from within the open system call.
+
+To change the requested target value the process needs to write a s32 value to
+the open device node. This translates to a pm_qos_update_requirement call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
index 57aef2f6e0d..1555001bc73 100644
--- a/Documentation/power/basic-pm-debugging.txt
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -1,45 +1,111 @@
-Debugging suspend and resume
+Debugging hibernation and suspend
(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-1. Testing suspend to disk (STD)
+1. Testing hibernation (aka suspend to disk or STD)
-To verify that the STD works, you can try to suspend in the "reboot" mode:
+To check if hibernation works, you can try to hibernate in the "reboot" mode:
# echo reboot > /sys/power/disk
# echo disk > /sys/power/state
-and the system should suspend, reboot, resume and get back to the command prompt
-where you have started the transition. If that happens, the STD is most likely
-to work correctly, but you need to repeat the test at least a couple of times in
-a row for confidence. This is necessary, because some problems only show up on
-a second attempt at suspending and resuming the system. You should also test
-the "platform" and "shutdown" modes of suspend:
+and the system should create a hibernation image, reboot, resume and get back to
+the command prompt where you have started the transition. If that happens,
+hibernation is most likely to work correctly. Still, you need to repeat the
+test at least a couple of times in a row for confidence. [This is necessary,
+because some problems only show up on a second attempt at suspending and
+resuming the system.] Moreover, hibernating in the "reboot" and "shutdown"
+modes causes the PM core to skip some platform-related callbacks which on ACPI
+systems might be necessary to make hibernation work. Thus, if you machine fails
+to hibernate or resume in the "reboot" mode, you should try the "platform" mode:
# echo platform > /sys/power/disk
# echo disk > /sys/power/state
-or
+which is the default and recommended mode of hibernation.
+
+Unfortunately, the "platform" mode of hibernation does not work on some systems
+with broken BIOSes. In such cases the "shutdown" mode of hibernation might
+work:
# echo shutdown > /sys/power/disk
# echo disk > /sys/power/state
-in which cases you will have to press the power button to make the system
-resume. If that does not work, you will need to identify what goes wrong.
+(it is similar to the "reboot" mode, but it requires you to press the power
+button to make the system resume).
+
+If neither "platform" nor "shutdown" hibernation mode works, you will need to
+identify what goes wrong.
+
+a) Test modes of hibernation
+
+To find out why hibernation fails on your system, you can use a special testing
+facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then,
+there is the file /sys/power/pm_test that can be used to make the hibernation
+core run in a test mode. There are 5 test modes available:
+
+freezer
+- test the freezing of processes
+
+devices
+- test the freezing of processes and suspending of devices
-a) Test mode of STD
+platform
+- test the freezing of processes, suspending of devices and platform
+ global control methods(*)
-To verify if there are any drivers that cause problems you can run the STD
-in the test mode:
+processors
+- test the freezing of processes, suspending of devices, platform
+ global control methods(*) and the disabling of nonboot CPUs
-# echo test > /sys/power/disk
+core
+- test the freezing of processes, suspending of devices, platform global
+ control methods(*), the disabling of nonboot CPUs and suspending of
+ platform/system devices
+
+(*) the platform global control methods are only available on ACPI systems
+ and are only tested if the hibernation mode is set to "platform"
+
+To use one of them it is necessary to write the corresponding string to
+/sys/power/pm_test (eg. "devices" to test the freezing of processes and
+suspending devices) and issue the standard hibernation commands. For example,
+to use the "devices" test mode along with the "platform" mode of hibernation,
+you should do the following:
+
+# echo devices > /sys/power/pm_test
+# echo platform > /sys/power/disk
# echo disk > /sys/power/state
-in which case the system should freeze tasks, suspend devices, disable nonboot
-CPUs (if any), wait for 5 seconds, enable nonboot CPUs, resume devices, thaw
-tasks and return to your command prompt. If that fails, most likely there is
-a driver that fails to either suspend or resume (in the latter case the system
-may hang or be unstable after the test, so please take that into consideration).
-To find this driver, you can carry out a binary search according to the rules:
+Then, the kernel will try to freeze processes, suspend devices, wait 5 seconds,
+resume devices and thaw processes. If "platform" is written to
+/sys/power/pm_test , then after suspending devices the kernel will additionally
+invoke the global control methods (eg. ACPI global control methods) used to
+prepare the platform firmware for hibernation. Next, it will wait 5 seconds and
+invoke the platform (eg. ACPI) global methods used to cancel hibernation etc.
+
+Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
+hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test
+contains a space-separated list of all available tests (including "none" that
+represents the normal functionality) in which the current test level is
+indicated by square brackets.
+
+Generally, as you can see, each test level is more "invasive" than the previous
+one and the "core" level tests the hardware and drivers as deeply as possible
+without creating a hibernation image. Obviously, if the "devices" test fails,
+the "platform" test will fail as well and so on. Thus, as a rule of thumb, you
+should try the test modes starting from "freezer", through "devices", "platform"
+and "processors" up to "core" (repeat the test on each level a couple of times
+to make sure that any random factors are avoided).
+
+If the "freezer" test fails, there is a task that cannot be frozen (in that case
+it usually is possible to identify the offending task by analysing the output of
+dmesg obtained after the failing test). Failure at this level usually means
+that there is a problem with the tasks freezer subsystem that should be
+reported.
+
+If the "devices" test fails, most likely there is a driver that cannot suspend
+or resume its device (in the latter case the system may hang or become unstable
+after the test, so please take that into consideration). To find this driver,
+you can carry out a binary search according to the rules:
- if the test fails, unload a half of the drivers currently loaded and repeat
(that would probably involve rebooting the system, so always note what drivers
have been loaded before the test),
@@ -47,23 +113,46 @@ have been loaded before the test),
recently and repeat.
Once you have found the failing driver (there can be more than just one of
-them), you have to unload it every time before the STD transition. In that case
-please make sure to report the problem with the driver.
-
-It is also possible that a cycle can still fail after you have unloaded
-all modules. In that case, you would want to look in your kernel configuration
-for the drivers that can be compiled as modules (testing again with them as
-modules), and possibly also try boot time options such as "noapic" or "noacpi".
+them), you have to unload it every time before hibernation. In that case please
+make sure to report the problem with the driver.
+
+It is also possible that the "devices" test will still fail after you have
+unloaded all modules. In that case, you may want to look in your kernel
+configuration for the drivers that can be compiled as modules (and test again
+with these drivers compiled as modules). You may also try to use some special
+kernel command line options such as "noapic", "noacpi" or even "acpi=off".
+
+If the "platform" test fails, there is a problem with the handling of the
+platform (eg. ACPI) firmware on your system. In that case the "platform" mode
+of hibernation is not likely to work. You can try the "shutdown" mode, but that
+is rather a poor man's workaround.
+
+If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
+work (of course, this only may be an issue on SMP systems) and the problem
+should be reported. In that case you can also try to switch the nonboot CPUs
+off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
+see if that works.
+
+If the "core" test fails, which means that suspending of the system/platform
+devices has failed (these devices are suspended on one CPU with interrupts off),
+the problem is most probably hardware-related and serious, so it should be
+reported.
+
+A failure of any of the "platform", "processors" or "core" tests may cause your
+system to hang or become unstable, so please beware. Such a failure usually
+indicates a serious problem that very well may be related to the hardware, but
+please report it anyway.
b) Testing minimal configuration
-If the test mode of STD works, you can boot the system with "init=/bin/bash"
-and attempt to suspend in the "reboot", "shutdown" and "platform" modes. If
-that does not work, there probably is a problem with a driver statically
-compiled into the kernel and you can try to compile more drivers as modules,
-so that they can be tested individually. Otherwise, there is a problem with a
-modular driver and you can find it by loading a half of the modules you normally
-use and binary searching in accordance with the algorithm:
+If all of the hibernation test modes work, you can boot the system with the
+"init=/bin/bash" command line parameter and attempt to hibernate in the
+"reboot", "shutdown" and "platform" modes. If that does not work, there
+probably is a problem with a driver statically compiled into the kernel and you
+can try to compile more drivers as modules, so that they can be tested
+individually. Otherwise, there is a problem with a modular driver and you can
+find it by loading a half of the modules you normally use and binary searching
+in accordance with the algorithm:
- if there are n modules loaded and the attempt to suspend and resume fails,
unload n/2 of the modules and try again (that would probably involve rebooting
the system),
@@ -71,19 +160,19 @@ the system),
load n/2 modules more and try again.
Again, if you find the offending module(s), it(they) must be unloaded every time
-before the STD transition, and please report the problem with it(them).
+before hibernation, and please report the problem with it(them).
c) Advanced debugging
-In case the STD does not work on your system even in the minimal configuration
-and compiling more drivers as modules is not practical or some modules cannot
-be unloaded, you can use one of the more advanced debugging techniques to find
-the problem. First, if there is a serial port in your box, you can boot the
-kernel with the 'no_console_suspend' parameter and try to log kernel
-messages using the serial console. This may provide you with some information
-about the reasons of the suspend (resume) failure. Alternatively, it may be
-possible to use a FireWire port for debugging with firescope
-(ftp://ftp.firstfloor.org/pub/ak/firescope/). On i386 it is also possible to
+In case that hibernation does not work on your system even in the minimal
+configuration and compiling more drivers as modules is not practical or some
+modules cannot be unloaded, you can use one of the more advanced debugging
+techniques to find the problem. First, if there is a serial port in your box,
+you can boot the kernel with the 'no_console_suspend' parameter and try to log
+kernel messages using the serial console. This may provide you with some
+information about the reasons of the suspend (resume) failure. Alternatively,
+it may be possible to use a FireWire port for debugging with firescope
+(ftp://ftp.firstfloor.org/pub/ak/firescope/). On x86 it is also possible to
use the PM_TRACE mechanism documented in Documentation/s2ram.txt .
2. Testing suspend to RAM (STR)
@@ -91,16 +180,25 @@ use the PM_TRACE mechanism documented in Documentation/s2ram.txt .
To verify that the STR works, it is generally more convenient to use the s2ram
tool available from http://suspend.sf.net and documented at
http://en.opensuse.org/s2ram . However, before doing that it is recommended to
-carry out the procedure described in section 1.
-
-Assume you have resolved the problems with the STD and you have found some
-failing drivers. These drivers are also likely to fail during the STR or
-during the resume, so it is better to unload them every time before the STR
-transition. Now, you can follow the instructions at
-http://en.opensuse.org/s2ram to test the system, but if it does not work
-"out of the box", you may need to boot it with "init=/bin/bash" and test
-s2ram in the minimal configuration. In that case, you may be able to search
-for failing drivers by following the procedure analogous to the one described in
-1b). If you find some failing drivers, you will have to unload them every time
-before the STR transition (ie. before you run s2ram), and please report the
-problems with them.
+carry out STR testing using the facility described in section 1.
+
+Namely, after writing "freezer", "devices", "platform", "processors", or "core"
+into /sys/power/pm_test (available if the kernel is compiled with
+CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
+to given string. The STR test modes are defined in the same way as for
+hibernation, so please refer to Section 1 for more information about them. In
+particular, the "core" test allows you to test everything except for the actual
+invocation of the platform firmware in order to put the system into the sleep
+state.
+
+Among other things, the testing with the help of /sys/power/pm_test may allow
+you to identify drivers that fail to suspend or resume their devices. They
+should be unloaded every time before an STR transition.
+
+Next, you can follow the instructions at http://en.opensuse.org/s2ram to test
+the system, but if it does not work "out of the box", you may need to boot it
+with "init=/bin/bash" and test s2ram in the minimal configuration. In that
+case, you may be able to search for failing drivers by following the procedure
+analogous to the one described in section 1. If you find some failing drivers,
+you will have to unload them every time before an STR transition (ie. before
+you run s2ram), and please report the problems with them.
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index d0e79d5820a..c53d2636191 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -502,52 +502,3 @@ If the CPU can have a "cpufreq" driver, there also may be opportunities
to shift to lower voltage settings and reduce the power cost of executing
a given number of instructions. (Without voltage adjustment, it's rare
for cpufreq to save much power; the cost-per-instruction must go down.)
-
-
-/sys/devices/.../power/state files
-==================================
-For now you can also test some of this functionality using sysfs.
-
- DEPRECATED: USE "power/state" ONLY FOR DRIVER TESTING, AND
- AVOID USING dev->power.power_state IN DRIVERS.
-
- THESE WILL BE REMOVED. IF THE "power/state" FILE GETS REPLACED,
- IT WILL BECOME SOMETHING COUPLED TO THE BUS OR DRIVER.
-
-In each device's directory, there is a 'power' directory, which contains
-at least a 'state' file. The value of this field is effectively boolean,
-PM_EVENT_ON or PM_EVENT_SUSPEND.
-
- * Reading from this file displays a value corresponding to
- the power.power_state.event field. All nonzero values are
- displayed as "2", corresponding to a low power state; zero
- is displayed as "0", corresponding to normal operation.
-
- * Writing to this file initiates a transition using the
- specified event code number; only '0', '2', and '3' are
- accepted (without a newline); '2' and '3' are both
- mapped to PM_EVENT_SUSPEND.
-
-On writes, the PM core relies on that recorded event code and the device/bus
-capabilities to determine whether it uses a partial suspend() or resume()
-sequence to change things so that the recorded event corresponds to the
-numeric parameter.
-
- - If the bus requires the irqs-disabled suspend_late()/resume_early()
- phases, writes fail because those operations are not supported here.
-
- - If the recorded value is the expected value, nothing is done.
-
- - If the recorded value is nonzero, the device is partially resumed,
- using the bus.resume() and/or class.resume() methods.
-
- - If the target value is nonzero, the device is partially suspended,
- using the class.suspend() and/or bus.suspend() methods and the
- PM_EVENT_SUSPEND message.
-
-Drivers have no way to tell whether their suspend() and resume() calls
-have come through the sysfs power/state file or as part of entering a
-system sleep state, except that when accessed through sysfs the normal
-parent/child sequencing rules are ignored. Drivers (such as bus, bridge,
-or hub drivers) which expose child devices may need to enforce those rules
-on their own.
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
index e4bdcaee24e..7f7a737f7f9 100644
--- a/Documentation/power/drivers-testing.txt
+++ b/Documentation/power/drivers-testing.txt
@@ -6,9 +6,9 @@ Testing suspend and resume support in device drivers
Unfortunately, to effectively test the support for the system-wide suspend and
resume transitions in a driver, it is necessary to suspend and resume a fully
functional system with this driver loaded. Moreover, that should be done
-several times, preferably several times in a row, and separately for the suspend
-to disk (STD) and the suspend to RAM (STR) transitions, because each of these
-cases involves different ordering of operations and different interactions with
+several times, preferably several times in a row, and separately for hibernation
+(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
+cases involves slightly different operations and different interactions with
the machine's BIOS.
Of course, for this purpose the test system has to be known to suspend and
@@ -22,20 +22,24 @@ for more information about the debugging of suspend/resume functionality.
Once you have resolved the suspend/resume-related problems with your test system
without the new driver, you are ready to test it:
-a) Build the driver as a module, load it and try the STD in the test mode (see:
-Documents/power/basic-pm-debugging.txt, 1a)).
+a) Build the driver as a module, load it and try the test modes of hibernation
+ (see: Documents/power/basic-pm-debugging.txt, 1).
-b) Load the driver and attempt to suspend to disk in the "reboot", "shutdown"
-and "platform" modes (see: Documents/power/basic-pm-debugging.txt, 1).
+b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
+ "platform" modes (see: Documents/power/basic-pm-debugging.txt, 1).
-c) Compile the driver directly into the kernel and try the STD in the test mode.
+c) Compile the driver directly into the kernel and try the test modes of
+ hibernation.
-d) Attempt to suspend to disk with the driver compiled directly into the kernel
-in the "reboot", "shutdown" and "platform" modes.
+d) Attempt to hibernate with the driver compiled directly into the kernel
+ in the "reboot", "shutdown" and "platform" modes.
-e) Attempt to suspend to RAM using the s2ram tool with the driver loaded (see:
-Documents/power/basic-pm-debugging.txt, 2). As far as the STR tests are
-concerned, it should not matter whether or not the driver is built as a module.
+e) Try the test modes of suspend (see: Documents/power/basic-pm-debugging.txt,
+ 2). [As far as the STR tests are concerned, it should not matter whether or
+ not the driver is built as a module.]
+
+f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
+ (see: Documents/power/basic-pm-debugging.txt, 2).
Each of the above tests should be repeated several times and the STD tests
should be mixed with the STR tests. If any of them fails, the driver cannot be
diff --git a/Documentation/power/notifiers.txt b/Documentation/power/notifiers.txt
index 9293e4bc857..ae1b7ec0768 100644
--- a/Documentation/power/notifiers.txt
+++ b/Documentation/power/notifiers.txt
@@ -28,6 +28,14 @@ PM_POST_HIBERNATION The system memory state has been restored from a
hibernation. Device drivers' .resume() callbacks have
been executed and tasks have been thawed.
+PM_RESTORE_PREPARE The system is going to restore a hibernation image.
+ If all goes well the restored kernel will issue a
+ PM_POST_HIBERNATION notification.
+
+PM_POST_RESTORE An error occurred during the hibernation restore.
+ Device drivers' .resume() callbacks have been executed
+ and tasks have been thawed.
+
PM_SUSPEND_PREPARE The system is preparing for a suspend.
PM_POST_SUSPEND The system has just resumed or an error occured during
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index aea7e920966..9d60ab717a7 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -386,6 +386,11 @@ before suspending; then remount them after resuming.
There is a work-around for this problem. For more information, see
Documentation/usb/persist.txt.
+Q: Can I suspend-to-disk using a swap partition under LVM?
+
+A: No. You can suspend successfully, but you'll not be able to
+resume. uswsusp should be able to work with LVM. See suspend.sf.net.
+
Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
compiled with the similar configuration files. Anyway I found that
suspend to disk (and resume) is much slower on 2.6.16 compared to
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index e00c6cf09e8..7b99636564c 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -14,7 +14,7 @@ are going to develop your own suspend/resume utilities.
The interface consists of a character device providing the open(),
release(), read(), and write() operations as well as several ioctl()
-commands defined in kernel/power/power.h. The major and minor
+commands defined in include/linux/suspend_ioctls.h . The major and minor
numbers of the device are, respectively, 10 and 231, and they can
be read from /sys/class/misc/snapshot/dev.
@@ -27,17 +27,17 @@ once at a time.
The ioctl() commands recognized by the device are:
SNAPSHOT_FREEZE - freeze user space processes (the current process is
- not frozen); this is required for SNAPSHOT_ATOMIC_SNAPSHOT
+ not frozen); this is required for SNAPSHOT_CREATE_IMAGE
and SNAPSHOT_ATOMIC_RESTORE to succeed
SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
-SNAPSHOT_ATOMIC_SNAPSHOT - create a snapshot of the system memory; the
+SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the
last argument of ioctl() should be a pointer to an int variable,
the value of which will indicate whether the call returned after
creating the snapshot (1) or after restoring the system memory state
from it (0) (after resume the system finds itself finishing the
- SNAPSHOT_ATOMIC_SNAPSHOT ioctl() again); after the snapshot
+ SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
has been created the read() operation can be used to transfer
it out of the kernel
@@ -49,39 +49,37 @@ SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
SNAPSHOT_FREE - free memory allocated for the snapshot image
-SNAPSHOT_SET_IMAGE_SIZE - set the preferred maximum size of the image
+SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image
(the kernel will do its best to ensure the image size will not exceed
this number, but if it turns out to be impossible, the kernel will
create the smallest image possible)
-SNAPSHOT_AVAIL_SWAP - return the amount of available swap in bytes (the last
- argument should be a pointer to an unsigned int variable that will
+SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the
+ last argument should be a pointer to an unsigned int variable that will
contain the result if the call is successful).
-SNAPSHOT_GET_SWAP_PAGE - allocate a swap page from the resume partition
+SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition
(the last argument should be a pointer to a loff_t variable that
will contain the swap page offset if the call is successful)
-SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated with
- SNAPSHOT_GET_SWAP_PAGE
-
-SNAPSHOT_SET_SWAP_FILE - set the resume partition (the last ioctl() argument
- should specify the device's major and minor numbers in the old
- two-byte format, as returned by the stat() function in the .st_rdev
- member of the stat structure)
+SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by
+ SNAPSHOT_ALLOC_SWAP_PAGE
SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
units) from the beginning of the partition at which the swap header is
located (the last ioctl() argument should point to a struct
- resume_swap_area, as defined in kernel/power/power.h, containing the
- resume device specification, as for the SNAPSHOT_SET_SWAP_FILE ioctl(),
- and the offset); for swap partitions the offset is always 0, but it is
- different to zero for swap files (please see
- Documentation/swsusp-and-swap-files.txt for details).
- The SNAPSHOT_SET_SWAP_AREA ioctl() is considered as a replacement for
- SNAPSHOT_SET_SWAP_FILE which is regarded as obsolete. It is
- recommended to always use this call, because the code to set the resume
- partition may be removed from future kernels
+ resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+ containing the resume device specification and the offset); for swap
+ partitions the offset is always 0, but it is different from zero for
+ swap files (see Documentation/swsusp-and-swap-files.txt for details).
+
+SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support,
+ depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation
+ state (eg. ACPI S4) using the platform (eg. ACPI) driver
SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
immediately enter the suspend-to-RAM state, so this call must always
@@ -93,24 +91,6 @@ SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
to resume the system from RAM if there's enough battery power or restore
its state on the basis of the saved suspend image otherwise)
-SNAPSHOT_PMOPS - enable the usage of the hibernation_ops->prepare,
- hibernate_ops->enter and hibernation_ops->finish methods (the in-kernel
- swsusp knows these as the "platform method") which are needed on many
- machines to (among others) speed up the resume by letting the BIOS skip
- some steps or to let the system recognise the correct state of the
- hardware after the resume (in particular on many machines this ensures
- that unplugged AC adapters get correctly detected and that kacpid does
- not run wild after the resume). The last ioctl() argument can take one
- of the three values, defined in kernel/power/power.h:
- PMOPS_PREPARE - make the kernel carry out the
- hibernation_ops->prepare() operation
- PMOPS_ENTER - make the kernel power off the system by calling
- hibernation_ops->enter()
- PMOPS_FINISH - make the kernel carry out the
- hibernation_ops->finish() operation
- Note that the actual constants are misnamed because they surface
- internal kernel implementation details that have changed.
-
The device's read() operation can be used to transfer the snapshot image from
the kernel. It has the following limitations:
- you cannot read() more than one virtual memory page at a time
@@ -122,7 +102,7 @@ The device's write() operation is used for uploading the system memory snapshot
into the kernel. It has the same limitations as the read() operation.
The release() operation frees all memory allocated for the snapshot image
-and all swap pages allocated with SNAPSHOT_GET_SWAP_PAGE (if any).
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
Thus it is not necessary to use either SNAPSHOT_FREE or
SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
@@ -133,16 +113,12 @@ snapshot image from/to the kernel will use a swap parition, called the resume
partition, or a swap file as storage space (if a swap file is used, the resume
partition is the partition that holds this file). However, this is not really
required, as they can use, for example, a special (blank) suspend partition or
-a file on a partition that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
mounted afterwards.
-These utilities SHOULD NOT make any assumptions regarding the ordering of
-data within the snapshot image, except for the image header that MAY be
-assumed to start with an swsusp_info structure, as specified in
-kernel/power/power.h. This structure MAY be used by the userland utilities
-to obtain some information about the snapshot image, such as the size
-of the snapshot image, including the metadata and the header itself,
-contained in the .size member of swsusp_info.
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image. The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
The snapshot image MUST be written to the kernel unaltered (ie. all of the image
data, metadata and header MUST be written in _exactly_ the same amount, form
@@ -159,7 +135,7 @@ means, such as checksums, to ensure the integrity of the snapshot image.
The suspending and resuming utilities MUST lock themselves in memory,
preferrably using mlockall(), before calling SNAPSHOT_FREEZE.
-The suspending utility MUST check the value stored by SNAPSHOT_ATOMIC_SNAPSHOT
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
in the memory location pointed to by the last argument of ioctl() and proceed
in accordance with it:
1. If the value is 1 (ie. the system memory snapshot has just been
@@ -173,7 +149,7 @@ in accordance with it:
image has been saved.
(b) The suspending utility SHOULD NOT attempt to perform any
file system operations (including reads) on the file systems
- that were mounted before SNAPSHOT_ATOMIC_SNAPSHOT has been
+ that were mounted before SNAPSHOT_CREATE_IMAGE has been
called. However, it MAY mount a file system that was not
mounted at that time and perform some operations on it (eg.
use it for saving the image).
diff --git a/Documentation/power_supply_class.txt b/Documentation/power_supply_class.txt
index 9758cf433c0..a8686e5a685 100644
--- a/Documentation/power_supply_class.txt
+++ b/Documentation/power_supply_class.txt
@@ -87,6 +87,10 @@ batteries use voltage for very approximated calculation of capacity.
Battery driver also can use this attribute just to inform userspace
about maximal and minimal voltage thresholds of a given battery.
+VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that
+these ones should be used if hardware could only guess (measure and
+retain) the thresholds of a given power supply.
+
CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when
battery considered full/empty.
@@ -100,8 +104,6 @@ age)". I.e. these attributes represents real thresholds, not design values.
ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
CAPACITY - capacity in percents.
-CAPACITY_LEVEL - capacity level. This corresponds to
-POWER_SUPPLY_CAPACITY_LEVEL_*.
TEMP - temperature of the power supply.
TEMP_AMBIENT - ambient temperature.
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
index e20b19c1b60..8deffcd68cb 100644
--- a/Documentation/rtc.txt
+++ b/Documentation/rtc.txt
@@ -182,8 +182,8 @@ driver returns ENOIOCTLCMD. Some common examples:
since the frequency is stored in the irq_freq member of the rtc_device
structure. Your driver needs to initialize the irq_freq member during
init. Make sure you check the requested frequency is in range of your
- hardware in the irq_set_freq function. If you cannot actually change
- the frequency, just return -ENOTTY.
+ hardware in the irq_set_freq function. If it isn't, return -EINVAL. If
+ you cannot actually change the frequency, do not define irq_set_freq.
If all else fails, check out the rtc-test.c driver!
@@ -268,8 +268,8 @@ int main(int argc, char **argv)
/* This read will block */
retval = read(fd, &data, sizeof(unsigned long));
if (retval == -1) {
- perror("read");
- exit(errno);
+ perror("read");
+ exit(errno);
}
fprintf(stderr, " %d",i);
fflush(stderr);
@@ -326,11 +326,11 @@ test_READ:
rtc_tm.tm_sec %= 60;
rtc_tm.tm_min++;
}
- if (rtc_tm.tm_min == 60) {
+ if (rtc_tm.tm_min == 60) {
rtc_tm.tm_min = 0;
rtc_tm.tm_hour++;
}
- if (rtc_tm.tm_hour == 24)
+ if (rtc_tm.tm_hour == 24)
rtc_tm.tm_hour = 0;
retval = ioctl(fd, RTC_ALM_SET, &rtc_tm);
@@ -407,8 +407,8 @@ test_PIE:
"\n...Periodic IRQ rate is fixed\n");
goto done;
}
- perror("RTC_IRQP_SET ioctl");
- exit(errno);
+ perror("RTC_IRQP_SET ioctl");
+ exit(errno);
}
fprintf(stderr, "\n%ldHz:\t", tmp);
@@ -417,27 +417,27 @@ test_PIE:
/* Enable periodic interrupts */
retval = ioctl(fd, RTC_PIE_ON, 0);
if (retval == -1) {
- perror("RTC_PIE_ON ioctl");
- exit(errno);
+ perror("RTC_PIE_ON ioctl");
+ exit(errno);
}
for (i=1; i<21; i++) {
- /* This blocks */
- retval = read(fd, &data, sizeof(unsigned long));
- if (retval == -1) {
- perror("read");
- exit(errno);
- }
- fprintf(stderr, " %d",i);
- fflush(stderr);
- irqcount++;
+ /* This blocks */
+ retval = read(fd, &data, sizeof(unsigned long));
+ if (retval == -1) {
+ perror("read");
+ exit(errno);
+ }
+ fprintf(stderr, " %d",i);
+ fflush(stderr);
+ irqcount++;
}
/* Disable periodic interrupts */
retval = ioctl(fd, RTC_PIE_OFF, 0);
if (retval == -1) {
- perror("RTC_PIE_OFF ioctl");
- exit(errno);
+ perror("RTC_PIE_OFF ioctl");
+ exit(errno);
}
}
diff --git a/Documentation/smp.txt b/Documentation/smp.txt
deleted file mode 100644
index 82fc50b6305..00000000000
--- a/Documentation/smp.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-To set up SMP
-
-Configure the kernel and answer Y to CONFIG_SMP.
-
-If you are using LILO, it is handy to have both SMP and non-SMP
-kernel images on hand. Edit /etc/lilo.conf to create an entry
-for another kernel image called "linux-smp" or something.
-
-The next time you compile the kernel, when running a SMP kernel,
-edit linux/Makefile and change "MAKE=make" to "MAKE=make -jN"
-(where N = number of CPU + 1, or if you have tons of memory/swap
- you can just use "-j" without a number). Feel free to experiment
-with this one.
-
-Of course you should time how long each build takes :-)
-Example:
- make config
- time -v sh -c 'make clean install modules modules_install'
-
-If you are using some Compaq MP compliant machines you will need to set
-the operating system in the BIOS settings to "Unixware" - don't ask me
-why Compaqs don't work otherwise.
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index aa986a35e99..f99254327ae 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -23,6 +23,7 @@ Currently, these files are in /proc/sys/fs:
- inode-max
- inode-nr
- inode-state
+- nr_open
- overflowuid
- overflowgid
- suid_dumpable
@@ -91,6 +92,15 @@ usage of file handles and you don't need to increase the maximum.
==============================================================
+nr_open:
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+==============================================================
+
inode-max, inode-nr & inode-state:
As with file handles, the kernel allocates the inode structures
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6f31f0a247d..24eac1bc735 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_background_ratio
- dirty_expire_centisecs
- dirty_writeback_centisecs
+- highmem_is_dirtyable (only if CONFIG_HIGHMEM set)
- max_map_count
- min_free_kbytes
- laptop_mode
@@ -40,9 +41,9 @@ Currently, these files are in /proc/sys/vm:
==============================================================
dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
-dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout, drop-caches,
-hugepages_treat_as_movable:
+dirty_writeback_centisecs, highmem_is_dirtyable,
+vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout,
+drop-caches, hugepages_treat_as_movable:
See Documentation/filesystems/proc.txt
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt
index 10c041ca13c..6c2477754a2 100644
--- a/Documentation/thinkpad-acpi.txt
+++ b/Documentation/thinkpad-acpi.txt
@@ -1,7 +1,7 @@
ThinkPad ACPI Extras Driver
- Version 0.17
- October 04th, 2007
+ Version 0.19
+ January 06th, 2008
Borislav Deianov <borislav@users.sf.net>
Henrique de Moraes Holschuh <hmh@hmh.eng.br>
@@ -215,6 +215,11 @@ The following commands can be written to the /proc/acpi/ibm/hotkey file:
... any other 8-hex-digit mask ...
echo reset > /proc/acpi/ibm/hotkey -- restore the original mask
+The procfs interface does not support NVRAM polling control. So as to
+maintain maximum bug-to-bug compatibility, it does not report any masks,
+nor does it allow one to manipulate the hot key mask when the firmware
+does not support masks at all, even if NVRAM polling is in use.
+
sysfs notes:
hotkey_bios_enabled:
@@ -231,17 +236,26 @@ sysfs notes:
to this value.
hotkey_enable:
- Enables/disables the hot keys feature, and reports
- current status of the hot keys feature.
+ Enables/disables the hot keys feature in the ACPI
+ firmware, and reports current status of the hot keys
+ feature. Has no effect on the NVRAM hot key polling
+ functionality.
0: disables the hot keys feature / feature disabled
1: enables the hot keys feature / feature enabled
hotkey_mask:
- bit mask to enable driver-handling and ACPI event
- generation for each hot key (see above). Returns the
- current status of the hot keys mask, and allows one to
- modify it.
+ bit mask to enable driver-handling (and depending on
+ the firmware, ACPI event generation) for each hot key
+ (see above). Returns the current status of the hot keys
+ mask, and allows one to modify it.
+
+ Note: when NVRAM polling is active, the firmware mask
+ will be different from the value returned by
+ hotkey_mask. The driver will retain enabled bits for
+ hotkeys that are under NVRAM polling even if the
+ firmware refuses them, and will not set these bits on
+ the firmware hot key mask.
hotkey_all_mask:
bit mask that should enable event reporting for all
@@ -257,12 +271,48 @@ sysfs notes:
handled by the firmware anyway. Echo it to
hotkey_mask above, to use.
+ hotkey_source_mask:
+ bit mask that selects which hot keys will the driver
+ poll the NVRAM for. This is auto-detected by the driver
+ based on the capabilities reported by the ACPI firmware,
+ but it can be overridden at runtime.
+
+ Hot keys whose bits are set in both hotkey_source_mask
+ and also on hotkey_mask are polled for in NVRAM. Only a
+ few hot keys are available through CMOS NVRAM polling.
+
+ Warning: when in NVRAM mode, the volume up/down/mute
+ keys are synthesized according to changes in the mixer,
+ so you have to use volume up or volume down to unmute,
+ as per the ThinkPad volume mixer user interface. When
+ in ACPI event mode, volume up/down/mute are reported as
+ separate events, but this behaviour may be corrected in
+ future releases of this driver, in which case the
+ ThinkPad volume mixer user interface semanthics will be
+ enforced.
+
+ hotkey_poll_freq:
+ frequency in Hz for hot key polling. It must be between
+ 0 and 25 Hz. Polling is only carried out when strictly
+ needed.
+
+ Setting hotkey_poll_freq to zero disables polling, and
+ will cause hot key presses that require NVRAM polling
+ to never be reported.
+
+ Setting hotkey_poll_freq too low will cause repeated
+ pressings of the same hot key to be misreported as a
+ single key press, or to not even be detected at all.
+ The recommended polling frequency is 10Hz.
+
hotkey_radio_sw:
if the ThinkPad has a hardware radio switch, this
attribute will read 0 if the switch is in the "radios
disabled" postition, and 1 if the switch is in the
"radios enabled" position.
+ This attribute has poll()/select() support.
+
hotkey_report_mode:
Returns the state of the procfs ACPI event report mode
filter for hot keys. If it is set to 1 (the default),
@@ -277,6 +327,25 @@ sysfs notes:
May return -EPERM (write access locked out by module
parameter) or -EACCES (read-only).
+ wakeup_reason:
+ Set to 1 if the system is waking up because the user
+ requested a bay ejection. Set to 2 if the system is
+ waking up because the user requested the system to
+ undock. Set to zero for normal wake-ups or wake-ups
+ due to unknown reasons.
+
+ This attribute has poll()/select() support.
+
+ wakeup_hotunplug_complete:
+ Set to 1 if the system was waken up because of an
+ undock or bay ejection request, and that request
+ was sucessfully completed. At this point, it might
+ be useful to send the system back to sleep, at the
+ user's choice. Refer to HKEY events 0x4003 and
+ 0x3003, below.
+
+ This attribute has poll()/select() support.
+
input layer notes:
A Hot key is mapped to a single input layer EV_KEY event, possibly
@@ -427,6 +496,23 @@ Non hot-key ACPI HKEY event map:
The above events are not propagated by the driver, except for legacy
compatibility purposes when hotkey_report_mode is set to 1.
+0x2304 System is waking up from suspend to undock
+0x2305 System is waking up from suspend to eject bay
+0x2404 System is waking up from hibernation to undock
+0x2405 System is waking up from hibernation to eject bay
+
+The above events are never propagated by the driver.
+
+0x3003 Bay ejection (see 0x2x05) complete, can sleep again
+0x4003 Undocked (see 0x2x04), can sleep again
+0x5009 Tablet swivel: switched to tablet mode
+0x500A Tablet swivel: switched to normal mode
+0x500B Tablet pen insterted into its storage bay
+0x500C Tablet pen removed from its storage bay
+0x5010 Brightness level changed (newer Lenovo BIOSes)
+
+The above events are propagated by the driver.
+
Compatibility notes:
ibm-acpi and thinkpad-acpi 0.15 (mainline kernels before 2.6.23) never
@@ -1263,3 +1349,17 @@ Sysfs interface changelog:
and the hwmon class for libsensors4 (lm-sensors 3)
compatibility. Moved all hwmon attributes to this
new platform device.
+
+0x020100: Marker for thinkpad-acpi with hot key NVRAM polling
+ support. If you must, use it to know you should not
+ start an userspace NVRAM poller (allows to detect when
+ NVRAM is compiled out by the user because it is
+ unneeded/undesired in the first place).
+0x020101: Marker for thinkpad-acpi with hot key NVRAM polling
+ and proper hotkey_mask semanthics (version 8 of the
+ NVRAM polling patch). Some development snapshots of
+ 0.18 had an earlier version that did strange things
+ to hotkey_mask.
+
+0x020200: Add poll()/select() support to the following attributes:
+ hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
diff --git a/Documentation/unaligned-memory-access.txt b/Documentation/unaligned-memory-access.txt
new file mode 100644
index 00000000000..6223eace3c0
--- /dev/null
+++ b/Documentation/unaligned-memory-access.txt
@@ -0,0 +1,226 @@
+UNALIGNED MEMORY ACCESSES
+=========================
+
+Linux runs on a wide variety of architectures which have varying behaviour
+when it comes to memory access. This document presents some details about
+unaligned accesses, why you need to write code that doesn't cause them,
+and how to write such code!
+
+
+The definition of an unaligned access
+=====================================
+
+Unaligned memory accesses occur when you try to read N bytes of data starting
+from an address that is not evenly divisible by N (i.e. addr % N != 0).
+For example, reading 4 bytes of data from address 0x10004 is fine, but
+reading 4 bytes of data from address 0x10005 would be an unaligned memory
+access.
+
+The above may seem a little vague, as memory access can happen in different
+ways. The context here is at the machine code level: certain instructions read
+or write a number of bytes to or from memory (e.g. movb, movw, movl in x86
+assembly). As will become clear, it is relatively easy to spot C statements
+which will compile to multiple-byte memory access instructions, namely when
+dealing with types such as u16, u32 and u64.
+
+
+Natural alignment
+=================
+
+The rule mentioned above forms what we refer to as natural alignment:
+When accessing N bytes of memory, the base memory address must be evenly
+divisible by N, i.e. addr % N == 0.
+
+When writing code, assume the target architecture has natural alignment
+requirements.
+
+In reality, only a few architectures require natural alignment on all sizes
+of memory access. However, we must consider ALL supported architectures;
+writing code that satisfies natural alignment requirements is the easiest way
+to achieve full portability.
+
+
+Why unaligned access is bad
+===========================
+
+The effects of performing an unaligned memory access vary from architecture
+to architecture. It would be easy to write a whole document on the differences
+here; a summary of the common scenarios is presented below:
+
+ - Some architectures are able to perform unaligned memory accesses
+ transparently, but there is usually a significant performance cost.
+ - Some architectures raise processor exceptions when unaligned accesses
+ happen. The exception handler is able to correct the unaligned access,
+ at significant cost to performance.
+ - Some architectures raise processor exceptions when unaligned accesses
+ happen, but the exceptions do not contain enough information for the
+ unaligned access to be corrected.
+ - Some architectures are not capable of unaligned memory access, but will
+ silently perform a different memory access to the one that was requested,
+ resulting a a subtle code bug that is hard to detect!
+
+It should be obvious from the above that if your code causes unaligned
+memory accesses to happen, your code will not work correctly on certain
+platforms and will cause performance problems on others.
+
+
+Code that does not cause unaligned access
+=========================================
+
+At first, the concepts above may seem a little hard to relate to actual
+coding practice. After all, you don't have a great deal of control over
+memory addresses of certain variables, etc.
+
+Fortunately things are not too complex, as in most cases, the compiler
+ensures that things will work for you. For example, take the following
+structure:
+
+ struct foo {
+ u16 field1;
+ u32 field2;
+ u8 field3;
+ };
+
+Let us assume that an instance of the above structure resides in memory
+starting at address 0x10000. With a basic level of understanding, it would
+not be unreasonable to expect that accessing field2 would cause an unaligned
+access. You'd be expecting field2 to be located at offset 2 bytes into the
+structure, i.e. address 0x10002, but that address is not evenly divisible
+by 4 (remember, we're reading a 4 byte value here).
+
+Fortunately, the compiler understands the alignment constraints, so in the
+above case it would insert 2 bytes of padding in between field1 and field2.
+Therefore, for standard structure types you can always rely on the compiler
+to pad structures so that accesses to fields are suitably aligned (assuming
+you do not cast the field to a type of different length).
+
+Similarly, you can also rely on the compiler to align variables and function
+parameters to a naturally aligned scheme, based on the size of the type of
+the variable.
+
+At this point, it should be clear that accessing a single byte (u8 or char)
+will never cause an unaligned access, because all memory addresses are evenly
+divisible by one.
+
+On a related topic, with the above considerations in mind you may observe
+that you could reorder the fields in the structure in order to place fields
+where padding would otherwise be inserted, and hence reduce the overall
+resident memory size of structure instances. The optimal layout of the
+above example is:
+
+ struct foo {
+ u32 field2;
+ u16 field1;
+ u8 field3;
+ };
+
+For a natural alignment scheme, the compiler would only have to add a single
+byte of padding at the end of the structure. This padding is added in order
+to satisfy alignment constraints for arrays of these structures.
+
+Another point worth mentioning is the use of __attribute__((packed)) on a
+structure type. This GCC-specific attribute tells the compiler never to
+insert any padding within structures, useful when you want to use a C struct
+to represent some data that comes in a fixed arrangement 'off the wire'.
+
+You might be inclined to believe that usage of this attribute can easily
+lead to unaligned accesses when accessing fields that do not satisfy
+architectural alignment requirements. However, again, the compiler is aware
+of the alignment constraints and will generate extra instructions to perform
+the memory access in a way that does not cause unaligned access. Of course,
+the extra instructions obviously cause a loss in performance compared to the
+non-packed case, so the packed attribute should only be used when avoiding
+structure padding is of importance.
+
+
+Code that causes unaligned access
+=================================
+
+With the above in mind, let's move onto a real life example of a function
+that can cause an unaligned memory access. The following function adapted
+from include/linux/etherdevice.h is an optimized routine to compare two
+ethernet MAC addresses for equality.
+
+unsigned int compare_ether_addr(const u8 *addr1, const u8 *addr2)
+{
+ const u16 *a = (const u16 *) addr1;
+ const u16 *b = (const u16 *) addr2;
+ return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) != 0;
+}
+
+In the above function, the reference to a[0] causes 2 bytes (16 bits) to
+be read from memory starting at address addr1. Think about what would happen
+if addr1 was an odd address such as 0x10003. (Hint: it'd be an unaligned
+access.)
+
+Despite the potential unaligned access problems with the above function, it
+is included in the kernel anyway but is understood to only work on
+16-bit-aligned addresses. It is up to the caller to ensure this alignment or
+not use this function at all. This alignment-unsafe function is still useful
+as it is a decent optimization for the cases when you can ensure alignment,
+which is true almost all of the time in ethernet networking context.
+
+
+Here is another example of some code that could cause unaligned accesses:
+ void myfunc(u8 *data, u32 value)
+ {
+ [...]
+ *((u32 *) data) = cpu_to_le32(value);
+ [...]
+ }
+
+This code will cause unaligned accesses every time the data parameter points
+to an address that is not evenly divisible by 4.
+
+In summary, the 2 main scenarios where you may run into unaligned access
+problems involve:
+ 1. Casting variables to types of different lengths
+ 2. Pointer arithmetic followed by access to at least 2 bytes of data
+
+
+Avoiding unaligned accesses
+===========================
+
+The easiest way to avoid unaligned access is to use the get_unaligned() and
+put_unaligned() macros provided by the <asm/unaligned.h> header file.
+
+Going back to an earlier example of code that potentially causes unaligned
+access:
+
+ void myfunc(u8 *data, u32 value)
+ {
+ [...]
+ *((u32 *) data) = cpu_to_le32(value);
+ [...]
+ }
+
+To avoid the unaligned memory access, you would rewrite it as follows:
+
+ void myfunc(u8 *data, u32 value)
+ {
+ [...]
+ value = cpu_to_le32(value);
+ put_unaligned(value, (u32 *) data);
+ [...]
+ }
+
+The get_unaligned() macro works similarly. Assuming 'data' is a pointer to
+memory and you wish to avoid unaligned access, its usage is as follows:
+
+ u32 value = get_unaligned((u32 *) data);
+
+These macros work work for memory accesses of any length (not just 32 bits as
+in the examples above). Be aware that when compared to standard access of
+aligned memory, using these macros to access unaligned memory can be costly in
+terms of performance.
+
+If use of such macros is not convenient, another option is to use memcpy(),
+where the source or destination (or both) are of type u8* or unsigned char*.
+Due to the byte-wise nature of this operation, unaligned accesses are avoided.
+
+--
+Author: Daniel Drake <dsd@gentoo.org>
+With help from: Alan Cox, Avuton Olrich, Heikki Orsila, Jan Engelhardt,
+Johannes Berg, Kyle McMartin, Kyle Moffett, Randy Dunlap, Robert Hancock,
+Uli Kunitz, Vadim Lobanov
+
diff --git a/Documentation/usb/gadget_printer.txt b/Documentation/usb/gadget_printer.txt
new file mode 100644
index 00000000000..ad995bf0db4
--- /dev/null
+++ b/Documentation/usb/gadget_printer.txt
@@ -0,0 +1,510 @@
+
+ Linux USB Printer Gadget Driver
+ 06/04/2007
+
+ Copyright (C) 2007 Craig W. Nadler <craig@nadler.us>
+
+
+
+GENERAL
+=======
+
+This driver may be used if you are writing printer firmware using Linux as
+the embedded OS. This driver has nothing to do with using a printer with
+your Linux host system.
+
+You will need a USB device controller and a Linux driver for it that accepts
+a gadget / "device class" driver using the Linux USB Gadget API. After the
+USB device controller driver is loaded then load the printer gadget driver.
+This will present a printer interface to the USB Host that your USB Device
+port is connected to.
+
+This driver is structured for printer firmware that runs in user mode. The
+user mode printer firmware will read and write data from the kernel mode
+printer gadget driver using a device file. The printer returns a printer status
+byte when the USB HOST sends a device request to get the printer status. The
+user space firmware can read or write this status byte using a device file
+/dev/g_printer . Both blocking and non-blocking read/write calls are supported.
+
+
+
+
+HOWTO USE THIS DRIVER
+=====================
+
+To load the USB device controller driver and the printer gadget driver. The
+following example uses the Netchip 2280 USB device controller driver:
+
+modprobe net2280
+modprobe g_printer
+
+
+The follow command line parameter can be used when loading the printer gadget
+(ex: modprobe g_printer idVendor=0x0525 idProduct=0xa4a8 ):
+
+idVendor - This is the Vendor ID used in the device descriptor. The default is
+ the Netchip vendor id 0x0525. YOU MUST CHANGE TO YOUR OWN VENDOR ID
+ BEFORE RELEASING A PRODUCT. If you plan to release a product and don't
+ already have a Vendor ID please see www.usb.org for details on how to
+ get one.
+
+idProduct - This is the Product ID used in the device descriptor. The default
+ is 0xa4a8, you should change this to an ID that's not used by any of
+ your other USB products if you have any. It would be a good idea to
+ start numbering your products starting with say 0x0001.
+
+bcdDevice - This is the version number of your product. It would be a good idea
+ to put your firmware version here.
+
+iManufacturer - A string containing the name of the Vendor.
+
+iProduct - A string containing the Product Name.
+
+iSerialNum - A string containing the Serial Number. This should be changed for
+ each unit of your product.
+
+iPNPstring - The PNP ID string used for this printer. You will want to set
+ either on the command line or hard code the PNP ID string used for
+ your printer product.
+
+qlen - The number of 8k buffers to use per endpoint. The default is 10, you
+ should tune this for your product. You may also want to tune the
+ size of each buffer for your product.
+
+
+
+
+USING THE EXAMPLE CODE
+======================
+
+This example code talks to stdout, instead of a print engine.
+
+To compile the test code below:
+
+1) save it to a file called prn_example.c
+2) compile the code with the follow command:
+ gcc prn_example.c -o prn_example
+
+
+
+To read printer data from the host to stdout:
+
+ # prn_example -read_data
+
+
+To write printer data from a file (data_file) to the host:
+
+ # cat data_file | prn_example -write_data
+
+
+To get the current printer status for the gadget driver:
+
+ # prn_example -get_status
+
+ Printer status is:
+ Printer is NOT Selected
+ Paper is Out
+ Printer OK
+
+
+To set printer to Selected/On-line:
+
+ # prn_example -selected
+
+
+To set printer to Not Selected/Off-line:
+
+ # prn_example -not_selected
+
+
+To set paper status to paper out:
+
+ # prn_example -paper_out
+
+
+To set paper status to paper loaded:
+
+ # prn_example -paper_loaded
+
+
+To set error status to printer OK:
+
+ # prn_example -no_error
+
+
+To set error status to ERROR:
+
+ # prn_example -error
+
+
+
+
+EXAMPLE CODE
+============
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <linux/poll.h>
+#include <sys/ioctl.h>
+#include <linux/usb/g_printer.h>
+
+#define PRINTER_FILE "/dev/g_printer"
+#define BUF_SIZE 512
+
+
+/*
+ * 'usage()' - Show program usage.
+ */
+
+static void
+usage(const char *option) /* I - Option string or NULL */
+{
+ if (option) {
+ fprintf(stderr,"prn_example: Unknown option \"%s\"!\n",
+ option);
+ }
+
+ fputs("\n", stderr);
+ fputs("Usage: prn_example -[options]\n", stderr);
+ fputs("Options:\n", stderr);
+ fputs("\n", stderr);
+ fputs("-get_status Get the current printer status.\n", stderr);
+ fputs("-selected Set the selected status to selected.\n", stderr);
+ fputs("-not_selected Set the selected status to NOT selected.\n",
+ stderr);
+ fputs("-error Set the error status to error.\n", stderr);
+ fputs("-no_error Set the error status to NO error.\n", stderr);
+ fputs("-paper_out Set the paper status to paper out.\n", stderr);
+ fputs("-paper_loaded Set the paper status to paper loaded.\n",
+ stderr);
+ fputs("-read_data Read printer data from driver.\n", stderr);
+ fputs("-write_data Write printer sata to driver.\n", stderr);
+ fputs("-NB_read_data (Non-Blocking) Read printer data from driver.\n",
+ stderr);
+ fputs("\n\n", stderr);
+
+ exit(1);
+}
+
+
+static int
+read_printer_data()
+{
+ struct pollfd fd[1];
+
+ /* Open device file for printer gadget. */
+ fd[0].fd = open(PRINTER_FILE, O_RDWR);
+ if (fd[0].fd < 0) {
+ printf("Error %d opening %s\n", fd[0].fd, PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ }
+
+ fd[0].events = POLLIN | POLLRDNORM;
+
+ while (1) {
+ static char buf[BUF_SIZE];
+ int bytes_read;
+ int retval;
+
+ /* Wait for up to 1 second for data. */
+ retval = poll(fd, 1, 1000);
+
+ if (retval && (fd[0].revents & POLLRDNORM)) {
+
+ /* Read data from printer gadget driver. */
+ bytes_read = read(fd[0].fd, buf, BUF_SIZE);
+
+ if (bytes_read < 0) {
+ printf("Error %d reading from %s\n",
+ fd[0].fd, PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ } else if (bytes_read > 0) {
+ /* Write data to standard OUTPUT (stdout). */
+ fwrite(buf, 1, bytes_read, stdout);
+ fflush(stdout);
+ }
+
+ }
+
+ }
+
+ /* Close the device file. */
+ close(fd[0].fd);
+
+ return 0;
+}
+
+
+static int
+write_printer_data()
+{
+ struct pollfd fd[1];
+
+ /* Open device file for printer gadget. */
+ fd[0].fd = open (PRINTER_FILE, O_RDWR);
+ if (fd[0].fd < 0) {
+ printf("Error %d opening %s\n", fd[0].fd, PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ }
+
+ fd[0].events = POLLOUT | POLLWRNORM;
+
+ while (1) {
+ int retval;
+ static char buf[BUF_SIZE];
+ /* Read data from standard INPUT (stdin). */
+ int bytes_read = fread(buf, 1, BUF_SIZE, stdin);
+
+ if (!bytes_read) {
+ break;
+ }
+
+ while (bytes_read) {
+
+ /* Wait for up to 1 second to sent data. */
+ retval = poll(fd, 1, 1000);
+
+ /* Write data to printer gadget driver. */
+ if (retval && (fd[0].revents & POLLWRNORM)) {
+ retval = write(fd[0].fd, buf, bytes_read);
+ if (retval < 0) {
+ printf("Error %d writing to %s\n",
+ fd[0].fd,
+ PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ } else {
+ bytes_read -= retval;
+ }
+
+ }
+
+ }
+
+ }
+
+ /* Wait until the data has been sent. */
+ fsync(fd[0].fd);
+
+ /* Close the device file. */
+ close(fd[0].fd);
+
+ return 0;
+}
+
+
+static int
+read_NB_printer_data()
+{
+ int fd;
+ static char buf[BUF_SIZE];
+ int bytes_read;
+
+ /* Open device file for printer gadget. */
+ fd = open(PRINTER_FILE, O_RDWR|O_NONBLOCK);
+ if (fd < 0) {
+ printf("Error %d opening %s\n", fd, PRINTER_FILE);
+ close(fd);
+ return(-1);
+ }
+
+ while (1) {
+ /* Read data from printer gadget driver. */
+ bytes_read = read(fd, buf, BUF_SIZE);
+ if (bytes_read <= 0) {
+ break;
+ }
+
+ /* Write data to standard OUTPUT (stdout). */
+ fwrite(buf, 1, bytes_read, stdout);
+ fflush(stdout);
+ }
+
+ /* Close the device file. */
+ close(fd);
+
+ return 0;
+}
+
+
+static int
+get_printer_status()
+{
+ int retval;
+ int fd;
+
+ /* Open device file for printer gadget. */
+ fd = open(PRINTER_FILE, O_RDWR);
+ if (fd < 0) {
+ printf("Error %d opening %s\n", fd, PRINTER_FILE);
+ close(fd);
+ return(-1);
+ }
+
+ /* Make the IOCTL call. */
+ retval = ioctl(fd, GADGET_GET_PRINTER_STATUS);
+ if (retval < 0) {
+ fprintf(stderr, "ERROR: Failed to set printer status\n");
+ return(-1);
+ }
+
+ /* Close the device file. */
+ close(fd);
+
+ return(retval);
+}
+
+
+static int
+set_printer_status(unsigned char buf, int clear_printer_status_bit)
+{
+ int retval;
+ int fd;
+
+ retval = get_printer_status();
+ if (retval < 0) {
+ fprintf(stderr, "ERROR: Failed to get printer status\n");
+ return(-1);
+ }
+
+ /* Open device file for printer gadget. */
+ fd = open(PRINTER_FILE, O_RDWR);
+
+ if (fd < 0) {
+ printf("Error %d opening %s\n", fd, PRINTER_FILE);
+ close(fd);
+ return(-1);
+ }
+
+ if (clear_printer_status_bit) {
+ retval &= ~buf;
+ } else {
+ retval |= buf;
+ }
+
+ /* Make the IOCTL call. */
+ if (ioctl(fd, GADGET_SET_PRINTER_STATUS, (unsigned char)retval)) {
+ fprintf(stderr, "ERROR: Failed to set printer status\n");
+ return(-1);
+ }
+
+ /* Close the device file. */
+ close(fd);
+
+ return 0;
+}
+
+
+static int
+display_printer_status()
+{
+ char printer_status;
+
+ printer_status = get_printer_status();
+ if (printer_status < 0) {
+ fprintf(stderr, "ERROR: Failed to get printer status\n");
+ return(-1);
+ }
+
+ printf("Printer status is:\n");
+ if (printer_status & PRINTER_SELECTED) {
+ printf(" Printer is Selected\n");
+ } else {
+ printf(" Printer is NOT Selected\n");
+ }
+ if (printer_status & PRINTER_PAPER_EMPTY) {
+ printf(" Paper is Out\n");
+ } else {
+ printf(" Paper is Loaded\n");
+ }
+ if (printer_status & PRINTER_NOT_ERROR) {
+ printf(" Printer OK\n");
+ } else {
+ printf(" Printer ERROR\n");
+ }
+
+ return(0);
+}
+
+
+int
+main(int argc, char *argv[])
+{
+ int i; /* Looping var */
+ int retval = 0;
+
+ /* No Args */
+ if (argc == 1) {
+ usage(0);
+ exit(0);
+ }
+
+ for (i = 1; i < argc && !retval; i ++) {
+
+ if (argv[i][0] != '-') {
+ continue;
+ }
+
+ if (!strcmp(argv[i], "-get_status")) {
+ if (display_printer_status()) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-paper_loaded")) {
+ if (set_printer_status(PRINTER_PAPER_EMPTY, 1)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-paper_out")) {
+ if (set_printer_status(PRINTER_PAPER_EMPTY, 0)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-selected")) {
+ if (set_printer_status(PRINTER_SELECTED, 0)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-not_selected")) {
+ if (set_printer_status(PRINTER_SELECTED, 1)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-error")) {
+ if (set_printer_status(PRINTER_NOT_ERROR, 1)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-no_error")) {
+ if (set_printer_status(PRINTER_NOT_ERROR, 0)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-read_data")) {
+ if (read_printer_data()) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-write_data")) {
+ if (write_printer_data()) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-NB_read_data")) {
+ if (read_NB_printer_data()) {
+ retval = 1;
+ }
+
+ } else {
+ usage(argv[i]);
+ retval = 1;
+ }
+ }
+
+ exit(retval);
+}
diff --git a/Documentation/usb/iuu_phoenix.txt b/Documentation/usb/iuu_phoenix.txt
new file mode 100644
index 00000000000..e5f048067da
--- /dev/null
+++ b/Documentation/usb/iuu_phoenix.txt
@@ -0,0 +1,84 @@
+Infinity Usb Unlimited Readme
+-----------------------------
+
+Hi all,
+
+
+This module provide a serial interface to use your
+IUU unit in phoenix mode. Loading this module will
+bring a ttyUSB[0-x] interface. This driver must be
+used by your favorite application to pilot the IUU
+
+This driver is still in beta stage, so bugs can
+occur and your system may freeze. As far I now,
+I never had any problem with it, but I'm not a real
+guru, so don't blame me if your system is unstable
+
+You can plug more than one IUU. Every unit will
+have his own device file(/dev/ttyUSB0,/dev/ttyUSB1,...)
+
+
+
+How to tune the reader speed ?
+
+ A few parameters can be used at load time
+ To use parameters, just unload the module if it is
+ already loaded and use modprobe iuu_phoenix param=value.
+ In case of prebuilt module, use the command
+ insmod iuu_phoenix param=value.
+
+ Example:
+
+ modprobe iuu_phoenix clockmode=3
+
+ The parameters are:
+
+ parm: clockmode:1=3Mhz579,2=3Mhz680,3=6Mhz (int)
+ parm: boost:overclock boost percent 100 to 500 (int)
+ parm: cdmode:Card detect mode 0=none, 1=CD, 2=!CD, 3=DSR, 4=!DSR, 5=CTS, 6=!CTS, 7=RING, 8=!RING (int)
+ parm: xmas:xmas color enabled or not (bool)
+ parm: debug:Debug enabled or not (bool)
+
+- clockmode will provide 3 different base settings commonly adopted by
+ different software:
+ 1. 3Mhz579
+ 2. 3Mhz680
+ 3. 6Mhz
+
+- boost provide a way to overclock the reader ( my favorite :-) )
+ For example to have best performance than a simple clockmode=3, try this:
+
+ modprobe boost=195
+
+ This will put the reader in a base of 3Mhz579 but boosted a 195 % !
+ the real clock will be now : 6979050 Hz ( 6Mhz979 ) and will increase
+ the speed to a score 10 to 20% better than the simple clockmode=3 !!!
+
+
+- cdmode permit to setup the signal used to inform the userland ( ioctl answer )
+ if the card is present or not. Eight signals are possible.
+
+- xmas is completely useless except for your eyes. This is one of my friend who was
+ so sad to have a nice device like the iuu without seeing all color range available.
+ So I have added this option to permit him to see a lot of color ( each activity change the color
+ and the frequency randomly )
+
+- debug will produce a lot of debugging messages...
+
+
+ Last notes:
+
+ Don't worry about the serial settings, the serial emulation
+ is an abstraction, so use any speed or parity setting will
+ work. ( This will not change anything ).Later I will perhaps
+ use this settings to deduce de boost but is that feature
+ really necessary ?
+ The autodetect feature used is the serial CD. If that doesn't
+ work for your software, disable detection mechanism in it.
+
+
+ Have fun !
+
+ Alain Degreffe
+
+ eczema(at)ecze.com
diff --git a/Documentation/w1/masters/00-INDEX b/Documentation/w1/masters/00-INDEX
index 752613c4cea..7b0ceaaad7a 100644
--- a/Documentation/w1/masters/00-INDEX
+++ b/Documentation/w1/masters/00-INDEX
@@ -4,3 +4,5 @@ ds2482
- The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses.
ds2490
- The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges.
+w1-gpio
+ - GPIO 1-wire bus master driver.
diff --git a/Documentation/w1/masters/w1-gpio b/Documentation/w1/masters/w1-gpio
new file mode 100644
index 00000000000..af5d3b4aa85
--- /dev/null
+++ b/Documentation/w1/masters/w1-gpio
@@ -0,0 +1,33 @@
+Kernel driver w1-gpio
+=====================
+
+Author: Ville Syrjala <syrjala@sci.fi>
+
+
+Description
+-----------
+
+GPIO 1-wire bus master driver. The driver uses the GPIO API to control the
+wire and the GPIO pin can be specified using platform data.
+
+
+Example (mach-at91)
+-------------------
+
+#include <linux/w1-gpio.h>
+
+static struct w1_gpio_platform_data foo_w1_gpio_pdata = {
+ .pin = AT91_PIN_PB20,
+ .is_open_drain = 1,
+};
+
+static struct platform_device foo_w1_device = {
+ .name = "w1-gpio",
+ .id = -1,
+ .dev.platform_data = &foo_w1_gpio_pdata,
+};
+
+...
+ at91_set_GPIO_periph(foo_w1_gpio_pdata.pin, 1);
+ at91_set_multi_drive(foo_w1_gpio_pdata.pin, 1);
+ platform_device_register(&foo_w1_device);
diff --git a/Documentation/x86_64/00-INDEX b/Documentation/x86_64/00-INDEX
new file mode 100644
index 00000000000..92fc20ab5f0
--- /dev/null
+++ b/Documentation/x86_64/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - This file
+boot-options.txt
+ - AMD64-specific boot options.
+cpu-hotplug-spec
+ - Firmware support for CPU hotplug under Linux/x86-64
+fake-numa-for-cpusets
+ - Using numa=fake and CPUSets for Resource Management
+kernel-stacks
+ - Context-specific per-processor interrupt stacks.
+machinecheck
+ - Configurable sysfs parameters for the x86-64 machine check code.
+mm.txt
+ - Memory layout of x86-64 (4 level page tables, 46 bits physical).
+uefi.txt
+ - Booting Linux via Unified Extensible Firmware Interface.