202 files changed, 8305 insertions, 2394 deletions
diff --git a/Documentation/ABI/testing/sysfs-pps b/Documentation/ABI/testing/sysfs-pps
new file mode 100644
index 00000000000..25028c7bc37
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-pps
@@ -0,0 +1,73 @@
+What:		/sys/class/pps/
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ directory will contain files and
+		directories that will provide a unified interface to
+		the PPS sources.
+
+What:		/sys/class/pps/ppsX/
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/ directory is related to X-th
+		PPS source into the system. Each directory will
+		contain files to manage and control its PPS source.
+
+What:		/sys/class/pps/ppsX/assert
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/assert file reports the assert events
+		and the assert sequence number of the X-th source in the form:
+
+			<secs>.<nsec>#<sequence>
+
+		If the source has no assert events the content of this file
+		is empty.
+
+What:		/sys/class/pps/ppsX/clear
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/clear file reports the clear events
+		and the clear sequence number of the X-th source in the form:
+
+			<secs>.<nsec>#<sequence>
+
+		If the source has no clear events the content of this file
+		is empty.
+
+What:		/sys/class/pps/ppsX/mode
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/mode file reports the functioning
+		mode of the X-th source in hexadecimal encoding.
+
+		Please, refer to linux/include/linux/pps.h for further
+		info.
+
+What:		/sys/class/pps/ppsX/echo
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/echo file reports if the X-th does
+		or does not support an "echo" function.
+
+What:		/sys/class/pps/ppsX/name
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/name file reports the name of the
+		X-th source.
+
+What:		/sys/class/pps/ppsX/path
+Date:		February 2008
+Contact:	Rodolfo Giometti <giometti@linux.it>
+Description:
+		The /sys/class/pps/ppsX/path file reports the path name of
+		the device connected with the X-th source.
+
+		If the source is not connected with any device the content
+		of this file is empty.
diff --git a/Documentation/Changes b/Documentation/Changes
index 664392481c8..6d0f1efc5bf 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -72,6 +72,13 @@ assembling the 16-bit boot code, removing the need for as86 to compile
 your kernel.  This change does, however, mean that you need a recent
 release of binutils.
 
+Perl
+----
+
+You will need perl 5 and the following modules: Getopt::Long, Getopt::Std,
+File::Basename, and File::Find to build the kernel.
+
+
 System utilities
 ================
 
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 1a608877b14..23d1262c077 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -152,14 +152,19 @@ When swap is accounted, following files are added.
 
 usage of mem+swap is limited by memsw.limit_in_bytes.
 
-Note: why 'mem+swap' rather than swap.
+* why 'mem+swap' rather than swap.
 The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
 to move account from memory to swap...there is no change in usage of
-mem+swap.
+mem+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, mem+swap limit is better than just limiting swap from
+OS point of view.
 
-In other words, when we want to limit the usage of swap without affecting
-global LRU, mem+swap limit is better than just limiting swap from OS point
-of view.
+* What happens when a cgroup hits memory.memsw.limit_in_bytes
+When a cgroup his memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can do swapout memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
 
 2.5 Reclaim
 
@@ -204,6 +209,7 @@ We can alter the memory limit:
 
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
+NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
 
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c
index 6977c178729..f688eba8770 100644
--- a/Documentation/connector/cn_test.c
+++ b/Documentation/connector/cn_test.c
@@ -41,6 +41,12 @@ void cn_test_callback(void *data)
 	       msg->seq, msg->ack, msg->len, (char *)msg->data);
 }
 
+/*
+ * Do not remove this function even if no one is using it as
+ * this is an example of how to get notifications about new
+ * connector user registration
+ */
+#if 0
 static int cn_test_want_notify(void)
 {
 	struct cn_ctl_msg *ctl;
@@ -117,6 +123,7 @@ nlmsg_failure:
 	kfree_skb(skb);
 	return -EINVAL;
 }
+#endif
 
 static u32 cn_test_timer_counter;
 static void cn_test_timer_func(unsigned long __data)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 7129846a278..8d07ed31207 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,20 @@ be removed from this file.
 
 ---------------------------
 
+What:	IRQF_SAMPLE_RANDOM
+Check:	IRQF_SAMPLE_RANDOM
+When:	July 2009
+
+Why:	Many of IRQF_SAMPLE_RANDOM users are technically bogus as entropy
+	sources in the kernel's current entropy model. To resolve this, every
+	input point to the kernel's entropy pool needs to better document the
+	type of entropy source it actually is. This will be replaced with
+	additional add_*_randomness functions in drivers/char/random.c
+
+Who:	Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>
+
+---------------------------
+
 What:	The ieee80211_regdom module parameter
 When:	March 2010 / desktop catchup
 
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index e055acb6b2d..67639f905f1 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -322,7 +322,7 @@ an upper limit on the block size imposed by the page size of the kernel,
 so 8kB blocks are only allowed on Alpha systems (and other architectures
 which support larger pages).
 
-There is an upper limit of 32768 subdirectories in a single directory.
+There is an upper limit of 32000 subdirectories in a single directory.
 
 There is a "soft" upper limit of about 10-15k files in a single directory
 with the current linear linked-list directory implementation.  This limit
diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt
index 6973b980ca2..3c367c3b360 100644
--- a/Documentation/filesystems/isofs.txt
+++ b/Documentation/filesystems/isofs.txt
@@ -23,8 +23,13 @@ Mount options unique to the isofs filesystem.
   map=off       Do not map non-Rock Ridge filenames to lower case
   map=normal    Map non-Rock Ridge filenames to lower case
   map=acorn     As map=normal but also apply Acorn extensions if present
-  mode=xxx      Sets the permissions on files to xxx
-  dmode=xxx     Sets the permissions on directories to xxx
+  mode=xxx      Sets the permissions on files to xxx unless Rock Ridge
+		extensions set the permissions otherwise
+  dmode=xxx     Sets the permissions on directories to xxx unless Rock Ridge
+		extensions set the permissions otherwise
+  overriderockperm Set permissions on files and directories according to
+		'mode' and 'dmode' even though Rock Ridge extensions are
+		present.
   nojoliet      Ignore Joliet extensions if they are present.
   norock        Ignore Rock Ridge extensions if they are present.
   hide		Completely strip hidden files from the file system.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ebff3c10a07..fad18f9456e 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -5,11 +5,12 @@
                   Bodo Bauer <bb@ricochet.net>
 
 2.4.x update	  Jorge Nerin <comandante@zaralinux.com>      November 14 2000
-move /proc/sys	  Shen Feng <shen@cn.fujitsu.com>		    April 1 2009
+move /proc/sys	  Shen Feng <shen@cn.fujitsu.com>		  April 1 2009
 ------------------------------------------------------------------------------
 Version 1.3                                              Kernel version 2.2.12
 					      Kernel version 2.4.0-test11-pre4
 ------------------------------------------------------------------------------
+fixes/update part 1.1  Stefani Seibold <stefani@seibold.net>       June 9 2009
 
 Table of Contents
 -----------------
@@ -116,7 +117,7 @@ The link  self  points  to  the  process reading the file system. Each process
 subdirectory has the entries listed in Table 1-1.
 
 
-Table 1-1: Process specific entries in /proc 
+Table 1-1: Process specific entries in /proc
 ..............................................................................
  File		Content
  clear_refs	Clears page referenced bits shown in smaps output
@@ -134,46 +135,103 @@ Table 1-1: Process specific entries in /proc
  status		Process status in human readable form
  wchan		If CONFIG_KALLSYMS is set, a pre-decoded wchan
  stack		Report full stack trace, enable via CONFIG_STACKTRACE
- smaps		Extension based on maps, the rss size for each mapped file
+ smaps		a extension based on maps, showing the memory consumption of
+		each mapping
 ..............................................................................
 
 For example, to get the status information of a process, all you have to do is
 read the file /proc/PID/status:
 
-  >cat /proc/self/status 
-  Name:   cat 
-  State:  R (running) 
-  Pid:    5452 
-  PPid:   743 
+  >cat /proc/self/status
+  Name:   cat
+  State:  R (running)
+  Tgid:   5452
+  Pid:    5452
+  PPid:   743
   TracerPid:      0						(2.4)
-  Uid:    501     501     501     501 
-  Gid:    100     100     100     100 
-  Groups: 100 14 16 
-  VmSize:     1112 kB 
-  VmLck:         0 kB 
-  VmRSS:       348 kB 
-  VmData:       24 kB 
-  VmStk:        12 kB 
-  VmExe:         8 kB 
-  VmLib:      1044 kB 
-  SigPnd: 0000000000000000 
-  SigBlk: 0000000000000000 
-  SigIgn: 0000000000000000 
-  SigCgt: 0000000000000000 
-  CapInh: 00000000fffffeff 
-  CapPrm: 0000000000000000 
-  CapEff: 0000000000000000 
-
+  Uid:    501     501     501     501
+  Gid:    100     100     100     100
+  FDSize: 256
+  Groups: 100 14 16
+  VmPeak:     5004 kB
+  VmSize:     5004 kB
+  VmLck:         0 kB
+  VmHWM:       476 kB
+  VmRSS:       476 kB
+  VmData:      156 kB
+  VmStk:        88 kB
+  VmExe:        68 kB
+  VmLib:      1412 kB
+  VmPTE:        20 kb
+  Threads:        1
+  SigQ:   0/28578
+  SigPnd: 0000000000000000
+  ShdPnd: 0000000000000000
+  SigBlk: 0000000000000000
+  SigIgn: 0000000000000000
+  SigCgt: 0000000000000000
+  CapInh: 00000000fffffeff
+  CapPrm: 0000000000000000
+  CapEff: 0000000000000000
+  CapBnd: ffffffffffffffff
+  voluntary_ctxt_switches:        0
+  nonvoluntary_ctxt_switches:     1
 
 This shows you nearly the same information you would get if you viewed it with
 the ps  command.  In  fact,  ps  uses  the  proc  file  system  to  obtain its
-information. The  statm  file  contains  more  detailed  information about the
-process memory usage. Its seven fields are explained in Table 1-2.  The stat
-file contains details information about the process itself.  Its fields are
-explained in Table 1-3.
+information.  But you get a more detailed  view of the  process by reading the
+file /proc/PID/status. It fields are described in table 1-2.
+
+The  statm  file  contains  more  detailed  information about the process
+memory usage. Its seven fields are explained in Table 1-3.  The stat file
+contains details information about the process itself.  Its fields are
+explained in Table 1-4.
 
+Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
+..............................................................................
+ Field                       Content
+ Name                        filename of the executable
+ State                       state (R is running, S is sleeping, D is sleeping
+                             in an uninterruptible wait, Z is zombie,
+			     T is traced or stopped)
+ Tgid                        thread group ID
+ Pid                         process id
+ PPid                        process id of the parent process
+ TracerPid                   PID of process tracing this process (0 if not)
+ Uid                         Real, effective, saved set, and  file system UIDs
+ Gid                         Real, effective, saved set, and  file system GIDs
+ FDSize                      number of file descriptor slots currently allocated
+ Groups                      supplementary group list
+ VmPeak                      peak virtual memory size
+ VmSize                      total program size
+ VmLck                       locked memory size
+ VmHWM                       peak resident set size ("high water mark")
+ VmRSS                       size of memory portions
+ VmData                      size of data, stack, and text segments
+ VmStk                       size of data, stack, and text segments
+ VmExe                       size of text segment
+ VmLib                       size of shared library code
+ VmPTE                       size of page table entries
+ Threads                     number of threads
+ SigQ                        number of signals queued/max. number for queue
+ SigPnd                      bitmap of pending signals for the thread
+ ShdPnd                      bitmap of shared pending signals for the process
+ SigBlk                      bitmap of blocked signals
+ SigIgn                      bitmap of ignored signals
+ SigCgt                      bitmap of catched signals
+ CapInh                      bitmap of inheritable capabilities
+ CapPrm                      bitmap of permitted capabilities
+ CapEff                      bitmap of effective capabilities
+ CapBnd                      bitmap of capabilities bounding set
+ Cpus_allowed                mask of CPUs on which this process may run
+ Cpus_allowed_list           Same as previous, but in "list format"
+ Mems_allowed                mask of memory nodes allowed to this process
+ Mems_allowed_list           Same as previous, but in "list format"
+ voluntary_ctxt_switches     number of voluntary context switches
+ nonvoluntary_ctxt_switches  number of non voluntary context switches
+..............................................................................
 
-Table 1-2: Contents of the statm files (as of 2.6.8-rc3)
+Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
 ..............................................................................
  Field    Content
  size     total program size (pages)		(same as VmSize in status)
@@ -188,7 +246,7 @@ Table 1-2: Contents of the statm files (as of 2.6.8-rc3)
 ..............................................................................
 
 
-Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
+Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
 ..............................................................................
  Field          Content
   pid           process id
@@ -222,10 +280,10 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
   start_stack   address of the start of the stack
   esp           current value of ESP
   eip           current value of EIP
-  pending       bitmap of pending signals (obsolete)
-  blocked       bitmap of blocked signals (obsolete)
-  sigign        bitmap of ignored signals (obsolete)
-  sigcatch      bitmap of catched signals (obsolete)
+  pending       bitmap of pending signals
+  blocked       bitmap of blocked signals
+  sigign        bitmap of ignored signals
+  sigcatch      bitmap of catched signals
   wchan         address where process went to sleep
   0             (place holder)
   0             (place holder)
@@ -234,19 +292,99 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
   rt_priority   realtime priority
   policy        scheduling policy (man sched_setscheduler)
   blkio_ticks   time spent waiting for block IO
+  gtime         guest time of the task in jiffies
+  cgtime        guest time of the task children in jiffies
 ..............................................................................
 
+The /proc/PID/map file containing the currently mapped memory regions and
+their access permissions.
+
+The format is:
+
+address           perms offset  dev   inode      pathname
+
+08048000-08049000 r-xp 00000000 03:00 8312       /opt/test
+08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
+0804a000-0806b000 rw-p 00000000 00:00 0          [heap]
+a7cb1000-a7cb2000 ---p 00000000 00:00 0
+a7cb2000-a7eb2000 rw-p 00000000 00:00 0
+a7eb2000-a7eb3000 ---p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0
+a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
+a8008000-a800a000 r--p 00133000 03:00 4222       /lib/libc.so.6
+a800a000-a800b000 rw-p 00135000 03:00 4222       /lib/libc.so.6
+a800b000-a800e000 rw-p 00000000 00:00 0
+a800e000-a8022000 r-xp 00000000 03:00 14462      /lib/libpthread.so.0
+a8022000-a8023000 r--p 00013000 03:00 14462      /lib/libpthread.so.0
+a8023000-a8024000 rw-p 00014000 03:00 14462      /lib/libpthread.so.0
+a8024000-a8027000 rw-p 00000000 00:00 0
+a8027000-a8043000 r-xp 00000000 03:00 8317       /lib/ld-linux.so.2
+a8043000-a8044000 r--p 0001b000 03:00 8317       /lib/ld-linux.so.2
+a8044000-a8045000 rw-p 0001c000 03:00 8317       /lib/ld-linux.so.2
+aff35000-aff4a000 rw-p 00000000 00:00 0          [stack]
+ffffe000-fffff000 r-xp 00000000 00:00 0          [vdso]
+
+where "address" is the address space in the process that it occupies, "perms"
+is a set of permissions:
+
+ r = read
+ w = write
+ x = execute
+ s = shared
+ p = private (copy on write)
+
+"offset" is the offset into the mapping, "dev" is the device (major:minor), and
+"inode" is the inode  on that device.  0 indicates that  no inode is associated
+with the memory region, as the case would be with BSS (uninitialized data).
+The "pathname" shows the name associated file for this mapping.  If the mapping
+is not associated with a file:
+
+ [heap]                   = the heap of the program
+ [stack]                  = the stack of the main process
+ [vdso]                   = the "virtual dynamic shared object",
+                            the kernel system call handler
+
+ or if empty, the mapping is anonymous.
+
+
+The /proc/PID/smaps is an extension based on maps, showing the memory
+consumption for each of the process's mappings. For each of mappings there
+is a series of lines such as the following:
+
+08048000-080bc000 r-xp 00000000 03:02 13130      /bin/bash
+Size:               1084 kB
+Rss:                 892 kB
+Pss:                 374 kB
+Shared_Clean:        892 kB
+Shared_Dirty:          0 kB
+Private_Clean:         0 kB
+Private_Dirty:         0 kB
+Referenced:          892 kB
+Swap:                  0 kB
+KernelPageSize:        4 kB
+MMUPageSize:           4 kB
+
+The first  of these lines shows  the same information  as is displayed for the
+mapping in /proc/PID/maps.  The remaining lines show  the size of the mapping,
+the amount of the mapping that is currently resident in RAM, the "proportional
+set size” (divide each shared page by the number of processes sharing it), the
+number of clean and dirty shared pages in the mapping, and the number of clean
+and dirty private pages in the mapping.  The "Referenced" indicates the amount
+of memory currently marked as referenced or accessed.
+
+This file is only present if the CONFIG_MMU kernel configuration option is
+enabled.
 
 1.2 Kernel data
 ---------------
 
 Similar to  the  process entries, the kernel data files give information about
 the running kernel. The files used to obtain this information are contained in
-/proc and  are  listed  in Table 1-4. Not all of these will be present in your
+/proc and  are  listed  in Table 1-5. Not all of these will be present in your
 system. It  depends  on the kernel configuration and the loaded modules, which
 files are there, and which are missing.
 
-Table 1-4: Kernel info in /proc
+Table 1-5: Kernel info in /proc
 ..............................................................................
  File        Content                                           
  apm         Advanced power management info                    
@@ -283,6 +421,7 @@ Table 1-4: Kernel info in /proc
  rtc         Real time clock                                   
  scsi        SCSI info (see text)                              
  slabinfo    Slab pool info                                    
+ softirqs    softirq usage
  stat        Overall statistics                                
  swaps       Swap space utilization                            
  sys         See chapter 2                                     
@@ -597,6 +736,25 @@ on the kind of area :
 0xffffffffa0017000-0xffffffffa0022000   45056 sys_init_module+0xc27/0x1d00 ...
    pages=10 vmalloc N0=10
 
+..............................................................................
+
+softirqs:
+
+Provides counts of softirq handlers serviced since boot time, for each cpu.
+
+> cat /proc/softirqs
+                CPU0       CPU1       CPU2       CPU3
+      HI:          0          0          0          0
+   TIMER:      27166      27120      27097      27034
+  NET_TX:          0          0          0         17
+  NET_RX:         42          0          0         39
+   BLOCK:          0          0        107       1121
+ TASKLET:          0          0          0        290
+   SCHED:      27035      26983      26971      26746
+ HRTIMER:          0          0          0          0
+     RCU:       1678       1769       2178       2250
+
+
 1.3 IDE devices in /proc/ide
 ----------------------------
 
@@ -614,10 +772,10 @@ IDE devices:
 
 More detailed  information  can  be  found  in  the  controller  specific
 subdirectories. These  are  named  ide0,  ide1  and  so  on.  Each  of  these
-directories contains the files shown in table 1-5.
+directories contains the files shown in table 1-6.
 
 
-Table 1-5: IDE controller info in  /proc/ide/ide?
+Table 1-6: IDE controller info in  /proc/ide/ide?
 ..............................................................................
  File    Content                                 
  channel IDE channel (0 or 1)                    
@@ -627,11 +785,11 @@ Table 1-5: IDE controller info in  /proc/ide/ide?
 ..............................................................................
 
 Each device  connected  to  a  controller  has  a separate subdirectory in the
-controllers directory.  The  files  listed in table 1-6 are contained in these
+controllers directory.  The  files  listed in table 1-7 are contained in these
 directories.
 
 
-Table 1-6: IDE device information
+Table 1-7: IDE device information
 ..............................................................................
  File             Content                                    
  cache            The cache                                  
@@ -673,12 +831,12 @@ the drive parameters:
 1.4 Networking info in /proc/net
 --------------------------------
 
-The subdirectory  /proc/net  follows  the  usual  pattern. Table 1-6 shows the
+The subdirectory  /proc/net  follows  the  usual  pattern. Table 1-8 shows the
 additional values  you  get  for  IP  version 6 if you configure the kernel to
-support this. Table 1-7 lists the files and their meaning.
+support this. Table 1-9 lists the files and their meaning.
 
 
-Table 1-6: IPv6 info in /proc/net 
+Table 1-8: IPv6 info in /proc/net
 ..............................................................................
  File       Content                                               
  udp6       UDP sockets (IPv6)                                    
@@ -693,7 +851,7 @@ Table 1-6: IPv6 info in /proc/net
 ..............................................................................
 
 
-Table 1-7: Network info in /proc/net 
+Table 1-9: Network info in /proc/net
 ..............................................................................
  File          Content                                                         
  arp           Kernel  ARP table                                               
@@ -817,10 +975,10 @@ The directory  /proc/parport  contains information about the parallel ports of
 your system.  It  has  one  subdirectory  for  each port, named after the port
 number (0,1,2,...).
 
-These directories contain the four files shown in Table 1-8.
+These directories contain the four files shown in Table 1-10.
 
 
-Table 1-8: Files in /proc/parport 
+Table 1-10: Files in /proc/parport
 ..............................................................................
  File      Content                                                             
  autoprobe Any IEEE-1284 device ID information that has been acquired.         
@@ -838,10 +996,10 @@ Table 1-8: Files in /proc/parport
 
 Information about  the  available  and actually used tty's can be found in the
 directory /proc/tty.You'll  find  entries  for drivers and line disciplines in
-this directory, as shown in Table 1-9.
+this directory, as shown in Table 1-11.
 
 
-Table 1-9: Files in /proc/tty 
+Table 1-11: Files in /proc/tty
 ..............................................................................
  File          Content                                        
  drivers       list of drivers and their usage                
@@ -883,6 +1041,7 @@ since the system first booted.  For a quick look, simply cat the file:
   processes 2915
   procs_running 1
   procs_blocked 0
+  softirq 183433 0 21755 12 39 1137 231 21459 2263
 
 The very first  "cpu" line aggregates the  numbers in all  of the other "cpuN"
 lines.  These numbers identify the amount of time the CPU has spent performing
@@ -918,6 +1077,11 @@ CPUs.
 The   "procs_blocked" line gives  the  number of  processes currently blocked,
 waiting for I/O to complete.
 
+The "softirq" line gives counts of softirqs serviced since boot time, for each
+of the possible system softirqs. The first column is the total of all
+softirqs serviced; each subsequent column is the total for that particular
+softirq.
+
 
 1.9 Ext4 file system parameters
 ------------------------------
@@ -926,9 +1090,9 @@ Information about mounted ext4 file systems can be found in
 /proc/fs/ext4.  Each mounted filesystem will have a directory in
 /proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
 /proc/fs/ext4/dm-0).   The files in each per-device directory are shown
-in Table 1-10, below.
+in Table 1-12, below.
 
-Table 1-10: Files in /proc/fs/ext4/<devname>
+Table 1-12: Files in /proc/fs/ext4/<devname>
 ..............................................................................
  File            Content                                        
  mb_groups       details of multiblock allocator buddy cache of free blocks
diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt
new file mode 100644
index 00000000000..e716aadb3a3
--- /dev/null
+++ b/Documentation/gcov.txt
@@ -0,0 +1,246 @@
+Using gcov with the Linux kernel
+================================
+
+1. Introduction
+2. Preparation
+3. Customization
+4. Files
+5. Modules
+6. Separated build and test machines
+7. Troubleshooting
+Appendix A: sample script: gather_on_build.sh
+Appendix B: sample script: gather_on_test.sh
+
+
+1. Introduction
+===============
+
+gcov profiling kernel support enables the use of GCC's coverage testing
+tool gcov [1] with the Linux kernel. Coverage data of a running kernel
+is exported in gcov-compatible format via the "gcov" debugfs directory.
+To get coverage data for a specific file, change to the kernel build
+directory and use gcov with the -o option as follows (requires root):
+
+# cd /tmp/linux-out
+# gcov -o /sys/kernel/debug/gcov/tmp/linux-out/kernel spinlock.c
+
+This will create source code files annotated with execution counts
+in the current directory. In addition, graphical gcov front-ends such
+as lcov [2] can be used to automate the process of collecting data
+for the entire kernel and provide coverage overviews in HTML format.
+
+Possible uses:
+
+* debugging (has this line been reached at all?)
+* test improvement (how do I change my test to cover these lines?)
+* minimizing kernel configurations (do I need this option if the
+  associated code is never run?)
+
+--
+
+[1] http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
+[2] http://ltp.sourceforge.net/coverage/lcov.php
+
+
+2. Preparation
+==============
+
+Configure the kernel with:
+
+        CONFIG_DEBUGFS=y
+        CONFIG_GCOV_KERNEL=y
+
+and to get coverage data for the entire kernel:
+
+        CONFIG_GCOV_PROFILE_ALL=y
+
+Note that kernels compiled with profiling flags will be significantly
+larger and run slower. Also CONFIG_GCOV_PROFILE_ALL may not be supported
+on all architectures.
+
+Profiling data will only become accessible once debugfs has been
+mounted:
+
+        mount -t debugfs none /sys/kernel/debug
+
+
+3. Customization
+================
+
+To enable profiling for specific files or directories, add a line
+similar to the following to the respective kernel Makefile:
+
+        For a single file (e.g. main.o):
+                GCOV_PROFILE_main.o := y
+
+        For all files in one directory:
+                GCOV_PROFILE := y
+
+To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
+is specified, use:
+
+                GCOV_PROFILE_main.o := n
+        and:
+                GCOV_PROFILE := n
+
+Only files which are linked to the main kernel image or are compiled as
+kernel modules are supported by this mechanism.
+
+
+4. Files
+========
+
+The gcov kernel support creates the following files in debugfs:
+
+        /sys/kernel/debug/gcov
+                Parent directory for all gcov-related files.
+
+        /sys/kernel/debug/gcov/reset
+                Global reset file: resets all coverage data to zero when
+                written to.
+
+        /sys/kernel/debug/gcov/path/to/compile/dir/file.gcda
+                The actual gcov data file as understood by the gcov
+                tool. Resets file coverage data to zero when written to.
+
+        /sys/kernel/debug/gcov/path/to/compile/dir/file.gcno
+                Symbolic link to a static data file required by the gcov
+                tool. This file is generated by gcc when compiling with
+                option -ftest-coverage.
+
+
+5. Modules
+==========
+
+Kernel modules may contain cleanup code which is only run during
+module unload time. The gcov mechanism provides a means to collect
+coverage data for such code by keeping a copy of the data associated
+with the unloaded module. This data remains available through debugfs.
+Once the module is loaded again, the associated coverage counters are
+initialized with the data from its previous instantiation.
+
+This behavior can be deactivated by specifying the gcov_persist kernel
+parameter:
+
+        gcov_persist=0
+
+At run-time, a user can also choose to discard data for an unloaded
+module by writing to its data file or the global reset file.
+
+
+6. Separated build and test machines
+====================================
+
+The gcov kernel profiling infrastructure is designed to work out-of-the
+box for setups where kernels are built and run on the same machine. In
+cases where the kernel runs on a separate machine, special preparations
+must be made, depending on where the gcov tool is used:
+
+a) gcov is run on the TEST machine
+
+The gcov tool version on the test machine must be compatible with the
+gcc version used for kernel build. Also the following files need to be
+copied from build to test machine:
+
+from the source tree:
+  - all C source files + headers
+
+from the build tree:
+  - all C source files + headers
+  - all .gcda and .gcno files
+  - all links to directories
+
+It is important to note that these files need to be placed into the
+exact same file system location on the test machine as on the build
+machine. If any of the path components is symbolic link, the actual
+directory needs to be used instead (due to make's CURDIR handling).
+
+b) gcov is run on the BUILD machine
+
+The following files need to be copied after each test case from test
+to build machine:
+
+from the gcov directory in sysfs:
+  - all .gcda files
+  - all links to .gcno files
+
+These files can be copied to any location on the build machine. gcov
+must then be called with the -o option pointing to that directory.
+
+Example directory setup on the build machine:
+
+  /tmp/linux:    kernel source tree
+  /tmp/out:      kernel build directory as specified by make O=
+  /tmp/coverage: location of the files copied from the test machine
+
+  [user@build] cd /tmp/out
+  [user@build] gcov -o /tmp/coverage/tmp/out/init main.c
+
+
+7. Troubleshooting
+==================
+
+Problem:  Compilation aborts during linker step.
+Cause:    Profiling flags are specified for source files which are not
+          linked to the main kernel or which are linked by a custom
+          linker procedure.
+Solution: Exclude affected source files from profiling by specifying
+          GCOV_PROFILE := n or GCOV_PROFILE_basename.o := n in the
+          corresponding Makefile.
+
+
+Appendix A: gather_on_build.sh
+==============================
+
+Sample script to gather coverage meta files on the build machine
+(see 6a):
+
+#!/bin/bash
+
+KSRC=$1
+KOBJ=$2
+DEST=$3
+
+if [ -z "$KSRC" ] || [ -z "$KOBJ" ] || [ -z "$DEST" ]; then
+  echo "Usage: $0 <ksrc directory> <kobj directory> <output.tar.gz>" >&2
+  exit 1
+fi
+
+KSRC=$(cd $KSRC; printf "all:\n\t@echo \${CURDIR}\n" | make -f -)
+KOBJ=$(cd $KOBJ; printf "all:\n\t@echo \${CURDIR}\n" | make -f -)
+
+find $KSRC $KOBJ \( -name '*.gcno' -o -name '*.[ch]' -o -type l \) -a \
+                 -perm /u+r,g+r | tar cfz $DEST -P -T -
+
+if [ $? -eq 0 ] ; then
+  echo "$DEST successfully created, copy to test system and unpack with:"
+  echo "  tar xfz $DEST -P"
+else
+  echo "Could not create file $DEST"
+fi
+
+
+Appendix B: gather_on_test.sh
+=============================
+
+Sample script to gather coverage data files on the test machine
+(see 6b):
+
+#!/bin/bash
+
+DEST=$1
+GCDA=/sys/kernel/debug/gcov
+
+if [ -z "$DEST" ] ; then
+  echo "Usage: $0 <output.tar.gz>" >&2
+  exit 1
+fi
+
+find $GCDA -name '*.gcno' -o -name '*.gcda' | tar cfz $DEST -T -
+
+if [ $? -eq 0 ] ; then
+  echo "$DEST successfully created, copy to build system and unpack with:"
+  echo "  tar xfz $DEST"
+else
+  echo "Could not create file $DEST"
+fi
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 1f779a25c70..7bb0d934b6d 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -149,6 +149,8 @@ Code	Seq#	Include File		Comments
 'p'	40-7F	linux/nvram.h
 'p'	80-9F				user-space parport
 					<mailto:tim@cyberelk.net>
+'p'	a1-a4	linux/pps.h		LinuxPPS
+					<mailto:giometti@linux.it>
 'q'	00-1F	linux/serio.h
 'q'	80-FF				Internet PhoneJACK, Internet LineJACK
 					<http://www.quicknet.net>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 5578248c18a..08def8deb5f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -48,6 +48,7 @@ parameter is applicable:
 	EFI	EFI Partitioning (GPT) is enabled
 	EIDE	EIDE/ATAPI support is enabled.
 	FB	The frame buffer device is enabled.
+	GCOV	GCOV profiling is enabled.
 	HW	Appropriate hardware is enabled.
 	IA-64	IA-64 architecture is enabled.
 	IMA     Integrity measurement architecture is enabled.
@@ -796,6 +797,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			Format: off | on
 			default: on
 
+	gcov_persist=	[GCOV] When non-zero (default), profiling data for
+			kernel modules is saved and remains accessible via
+			debugfs, even when the module is unloaded/reloaded.
+			When zero, profiling data is discarded and associated
+			debugfs files are removed at module unload time.
+
 	gdth=		[HW,SCSI]
 			See header of drivers/scsi/gdth.c.
 
diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt
new file mode 100644
index 00000000000..125f4ab4899
--- /dev/null
+++ b/Documentation/pps/pps.txt
@@ -0,0 +1,172 @@
+
+			PPS - Pulse Per Second
+			----------------------
+
+(C) Copyright 2007 Rodolfo Giometti <giometti@enneenne.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+
+
+Overview
+--------
+
+LinuxPPS provides a programming interface (API) to define in the
+system several PPS sources.
+
+PPS means "pulse per second" and a PPS source is just a device which
+provides a high precision signal each second so that an application
+can use it to adjust system clock time.
+
+A PPS source can be connected to a serial port (usually to the Data
+Carrier Detect pin) or to a parallel port (ACK-pin) or to a special
+CPU's GPIOs (this is the common case in embedded systems) but in each
+case when a new pulse arrives the system must apply to it a timestamp
+and record it for userland.
+
+Common use is the combination of the NTPD as userland program, with a
+GPS receiver as PPS source, to obtain a wallclock-time with
+sub-millisecond synchronisation to UTC.
+
+
+RFC considerations
+------------------
+
+While implementing a PPS API as RFC 2783 defines and using an embedded
+CPU GPIO-Pin as physical link to the signal, I encountered a deeper
+problem:
+
+   At startup it needs a file descriptor as argument for the function
+   time_pps_create().
+
+This implies that the source has a /dev/... entry. This assumption is
+ok for the serial and parallel port, where you can do something
+useful besides(!) the gathering of timestamps as it is the central
+task for a PPS-API. But this assumption does not work for a single
+purpose GPIO line. In this case even basic file-related functionality
+(like read() and write()) makes no sense at all and should not be a
+precondition for the use of a PPS-API.
+
+The problem can be simply solved if you consider that a PPS source is
+not always connected with a GPS data source.
+
+So your programs should check if the GPS data source (the serial port
+for instance) is a PPS source too, and if not they should provide the
+possibility to open another device as PPS source.
+
+In LinuxPPS the PPS sources are simply char devices usually mapped
+into files /dev/pps0, /dev/pps1, etc..
+
+
+Coding example
+--------------
+
+To register a PPS source into the kernel you should define a struct
+pps_source_info_s as follows:
+
+    static struct pps_source_info pps_ktimer_info = {
+	    .name         = "ktimer",
+	    .path         = "",
+	    .mode         = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \
+			    PPS_ECHOASSERT | \
+			    PPS_CANWAIT | PPS_TSFMT_TSPEC,
+	    .echo         = pps_ktimer_echo,
+	    .owner        = THIS_MODULE,
+    };
+
+and then calling the function pps_register_source() in your
+intialization routine as follows:
+
+    source = pps_register_source(&pps_ktimer_info,
+			PPS_CAPTUREASSERT | PPS_OFFSETASSERT);
+
+The pps_register_source() prototype is:
+
+  int pps_register_source(struct pps_source_info_s *info, int default_params)
+
+where "info" is a pointer to a structure that describes a particular
+PPS source, "default_params" tells the system what the initial default
+parameters for the device should be (it is obvious that these parameters
+must be a subset of ones defined in the struct
+pps_source_info_s which describe the capabilities of the driver).
+
+Once you have registered a new PPS source into the system you can
+signal an assert event (for example in the interrupt handler routine)
+just using:
+
+    pps_event(source, &ts, PPS_CAPTUREASSERT, ptr)
+
+where "ts" is the event's timestamp.
+
+The same function may also run the defined echo function
+(pps_ktimer_echo(), passing to it the "ptr" pointer) if the user
+asked for that... etc..
+
+Please see the file drivers/pps/clients/ktimer.c for example code.
+
+
+SYSFS support
+-------------
+
+If the SYSFS filesystem is enabled in the kernel it provides a new class:
+
+   $ ls /sys/class/pps/
+   pps0/  pps1/  pps2/
+
+Every directory is the ID of a PPS sources defined in the system and
+inside you find several files:
+
+   $ ls /sys/class/pps/pps0/
+   assert	clear  echo  mode  name  path  subsystem@  uevent
+
+Inside each "assert" and "clear" file you can find the timestamp and a
+sequence number:
+
+   $ cat /sys/class/pps/pps0/assert
+   1170026870.983207967#8
+
+Where before the "#" is the timestamp in seconds; after it is the
+sequence number. Other files are:
+
+* echo: reports if the PPS source has an echo function or not;
+
+* mode: reports available PPS functioning modes;
+
+* name: reports the PPS source's name;
+
+* path: reports the PPS source's device path, that is the device the
+  PPS source is connected to (if it exists).
+
+
+Testing the PPS support
+-----------------------
+
+In order to test the PPS support even without specific hardware you can use
+the ktimer driver (see the client subsection in the PPS configuration menu)
+and the userland tools provided into Documentaion/pps/ directory.
+
+Once you have enabled the compilation of ktimer just modprobe it (if
+not statically compiled):
+
+   # modprobe ktimer
+
+and the run ppstest as follow:
+
+   $ ./ppstest /dev/pps0
+   trying PPS source "/dev/pps1"
+   found PPS source "/dev/pps1"
+   ok, found 1 source(s), now start fetching data...
+   source 0 - assert 1186592699.388832443, sequence: 364 - clear  0.000000000, sequence: 0
+   source 0 - assert 1186592700.388931295, sequence: 365 - clear  0.000000000, sequence: 0
+   source 0 - assert 1186592701.389032765, sequence: 366 - clear  0.000000000, sequence: 0
+
+Please, note that to compile userland programs you need the file timepps.h
+(see Documentation/pps/).
diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
index 535f69fab45..fd1cd8aae4e 100644
--- a/Documentation/robust-futex-ABI.txt
+++ b/Documentation/robust-futex-ABI.txt
@@ -135,7 +135,7 @@ manipulating this list), the user code must observe the following
 protocol on 'lock entry' insertion and removal:
 
 On insertion:
- 1) set the 'list_op_pending' word to the address of the 'lock word'
+ 1) set the 'list_op_pending' word to the address of the 'lock entry'
     to be inserted,
  2) acquire the futex lock,
  3) add the lock entry, with its thread id (TID) in the bottom 29 bits
@@ -143,7 +143,7 @@ On insertion:
  4) clear the 'list_op_pending' word.
 
 On removal:
- 1) set the 'list_op_pending' word to the address of the 'lock word'
+ 1) set the 'list_op_pending' word to the address of the 'lock entry'
     to be removed,
  2) remove the lock entry for this lock from the 'head' list,
  2) release the futex lock, and
diff --git a/MAINTAINERS b/MAINTAINERS
index fb94addb34d..035df9d2660 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1544,6 +1544,7 @@ L:	containers@lists.linux-foundation.org
 S:	Maintained
 F:	include/linux/cgroup*
 F:	kernel/cgroup*
+F:	mm/*cgroup*
 
 CORETEMP HARDWARE MONITORING DRIVER
 P:	Rudolf Marek
@@ -2274,11 +2275,9 @@ F:	drivers/net/wan/dlci.c
 F:	drivers/net/wan/sdla.c
 
 FRAMEBUFFER LAYER
-P:	Antonino Daplas
-M:	adaplas@gmail.com
 L:	linux-fbdev-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://linux-fbdev.sourceforge.net/
-S:	Maintained
+S:	Orphan
 F:	Documentation/fb/
 F:	drivers/video/fb*
 F:	include/linux/fb.h
@@ -4578,6 +4577,13 @@ S:	Maintained
 F:	drivers/net/pppol2tp.c
 F:	include/linux/if_pppol2tp.h
 
+PPS SUPPORT
+P:	Rodolfo Giometti
+M:	giometti@enneenne.com
+W:	http://wiki.enneenne.com/index.php/LinuxPPS_support
+L:	linuxpps@ml.enneenne.com (subscribers-only)
+S:	Maintained
+
 PREEMPTIBLE KERNEL
 P:	Robert Love
 M:	rml@tech9.net
diff --git a/Makefile b/Makefile
index ea63667617f..46e1c9d03d5 100644
--- a/Makefile
+++ b/Makefile
@@ -330,6 +330,7 @@ AFLAGS_MODULE   = $(MODFLAGS)
 LDFLAGS_MODULE  =
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
+CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage
 
 
 # Use LINUXINCLUDE when you must reference the include/ directory.
@@ -356,7 +357,7 @@ export CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
 
 # When compiling out-of-tree modules, put MODVERDIR in the module
@@ -1216,8 +1217,8 @@ clean: archclean $(clean-dirs)
 		\( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
 		-o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
 		-o -name '*.symtypes' -o -name 'modules.order' \
-		-o -name 'Module.markers' -o -name '.tmp_*.o.*' \) \
-		-type f -print | xargs rm -f
+		-o -name 'Module.markers' -o -name '.tmp_*.o.*' \
+		-o -name '*.gcno' \) -type f -print | xargs rm -f
 
 # mrproper - Delete all generated files, including .config
 #
@@ -1421,8 +1422,8 @@ clean: $(clean-dirs)
 	$(call cmd,rmfiles)
 	@find $(KBUILD_EXTMOD) $(RCS_FIND_IGNORE) \
 		\( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
-		-o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \) \
-		-type f -print | xargs rm -f
+		-o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
+		-o -name '*.gcno' \) -type f -print | xargs rm -f
 
 help:
 	@echo  '  Building external modules.'
diff --git a/arch/Kconfig b/arch/Kconfig
index 78a35e9dc10..99193b16023 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -112,3 +112,5 @@ config HAVE_DMA_API_DEBUG
 
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
+
+source "kernel/gcov/Kconfig"
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 1bbe1da5486..93c0342530a 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -394,8 +394,6 @@ asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv,
 		goto out;
 
 	error = do_execve(filename, uargv, uenvp, regs);
-	if (error == 0)
-		current->ptrace &= ~PT_DTRACE;
 	putname(filename);
 
 out:
diff --git a/arch/h8300/kernel/asm-offsets.c b/arch/h8300/kernel/asm-offsets.c
index 2042552e087..fd961e0bd74 100644
--- a/arch/h8300/kernel/asm-offsets.c
+++ b/arch/h8300/kernel/asm-offsets.c
@@ -55,7 +55,6 @@ int main(void)
 	DEFINE(LRET,  offsetof(struct pt_regs, pc)       - sizeof(long));
 
 	DEFINE(PT_PTRACED, PT_PTRACED);
-	DEFINE(PT_DTRACE, PT_DTRACE);
 
 	return 0;
 }
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 294a3b13eca..170042b420d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -28,6 +28,7 @@ config IA64
 	select HAVE_DMA_ATTRS
 	select HAVE_KVM
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_DMA_API_DEBUG
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 36c0009dbec..5a61b5c2e18 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -8,6 +8,7 @@
 #include <asm/machvec.h>
 #include <linux/scatterlist.h>
 #include <asm/swiotlb.h>
+#include <linux/dma-debug.h>
 
 #define ARCH_HAS_DMA_GET_REQUIRED_MASK
 
@@ -24,95 +25,28 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 				       dma_addr_t *daddr, gfp_t gfp)
 {
 	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	return ops->alloc_coherent(dev, size, daddr, gfp);
+	void *caddr;
+
+	caddr = ops->alloc_coherent(dev, size, daddr, gfp);
+	debug_dma_alloc_coherent(dev, size, *daddr, caddr);
+	return caddr;
 }
 
 static inline void dma_free_coherent(struct device *dev, size_t size,
 				     void *caddr, dma_addr_t daddr)
 {
 	struct dma_map_ops *ops = platform_dma_get_ops(dev);
+	debug_dma_free_coherent(dev, size, caddr, daddr);
 	ops->free_coherent(dev, size, caddr, daddr);
 }
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
-static inline dma_addr_t dma_map_single_attrs(struct device *dev,
-					      void *caddr, size_t size,
-					      enum dma_data_direction dir,
-					      struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	return ops->map_page(dev, virt_to_page(caddr),
-			     (unsigned long)caddr & ~PAGE_MASK, size,
-			     dir, attrs);
-}
-
-static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t daddr,
-					  size_t size,
-					  enum dma_data_direction dir,
-					  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	ops->unmap_page(dev, daddr, size, dir, attrs);
-}
-
-#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
-#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL)
-
-static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
-				   int nents, enum dma_data_direction dir,
-				   struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	return ops->map_sg(dev, sgl, nents, dir, attrs);
-}
-
-static inline void dma_unmap_sg_attrs(struct device *dev,
-				      struct scatterlist *sgl, int nents,
-				      enum dma_data_direction dir,
-				      struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	ops->unmap_sg(dev, sgl, nents, dir, attrs);
-}
-
-#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
-#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL)
-
-static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t daddr,
-					   size_t size,
-					   enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	ops->sync_single_for_cpu(dev, daddr, size, dir);
-}
-
-static inline void dma_sync_sg_for_cpu(struct device *dev,
-				       struct scatterlist *sgl,
-				       int nents, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	ops->sync_sg_for_cpu(dev, sgl, nents, dir);
-}
+#define get_dma_ops(dev) platform_dma_get_ops(dev)
+#define flush_write_buffers()
 
-static inline void dma_sync_single_for_device(struct device *dev,
-					      dma_addr_t daddr,
-					      size_t size,
-					      enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	ops->sync_single_for_device(dev, daddr, size, dir);
-}
-
-static inline void dma_sync_sg_for_device(struct device *dev,
-					  struct scatterlist *sgl,
-					  int nents,
-					  enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	ops->sync_sg_for_device(dev, sgl, nents, dir);
-}
+#include <asm-generic/dma-mapping-common.h>
 
 static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr)
 {
@@ -120,30 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr)
 	return ops->mapping_error(dev, daddr);
 }
 
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
-				      size_t offset, size_t size,
-				      enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = platform_dma_get_ops(dev);
-	return ops->map_page(dev, page, offset, size, dir, NULL);
-}
-
-static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
-				  size_t size, enum dma_data_direction dir)
-{
-	dma_unmap_single(dev, addr, size, dir);
-}
-
-/*
- * Rest of this file is part of the "Advanced DMA API".  Use at your own risk.
- * See Documentation/DMA-API.txt for details.
- */
-
-#define dma_sync_single_range_for_cpu(dev, dma_handle, offset, size, dir)	\
-	dma_sync_single_for_cpu(dev, dma_handle, size, dir)
-#define dma_sync_single_range_for_device(dev, dma_handle, offset, size, dir)	\
-	dma_sync_single_for_device(dev, dma_handle, size, dir)
-
 static inline int dma_supported(struct device *dev, u64 mask)
 {
 	struct dma_map_ops *ops = platform_dma_get_ops(dev);
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 3e876f0baeb..67a01e1e428 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -302,11 +302,6 @@ asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
 		goto out;
 
 	error = do_execve(filename, uargv, uenvp, &regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 out:
 	return error;
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 9aa615d3a5b..bf0abe9e1f7 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -676,10 +676,6 @@ arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		if (!valid_signal(data))
 			break;
 		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		if ((child->ptrace & PT_DTRACE) == 0) {
-			/* Spurious delayed TF traps may occur */
-			child->ptrace |= PT_DTRACE;
-		}
 
 		/* Compute next pc.  */
 		pc = get_stack_long(child, PT_BPC);
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 184acc90808..aacd6d17b83 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1057,7 +1057,6 @@ asmlinkage void trap_c(struct frame *fp)
 	if (fp->ptregs.sr & PS_S) {
 		if ((fp->ptregs.vector >> 2) == VEC_TRACE) {
 			/* traced a trapping instruction */
-			current->ptrace |= PT_DTRACE;
 		} else
 			bad_super_trap(fp);
 		return;
diff --git a/arch/m68knommu/kernel/asm-offsets.c b/arch/m68knommu/kernel/asm-offsets.c
index f500dd6935d..594ee0e657f 100644
--- a/arch/m68knommu/kernel/asm-offsets.c
+++ b/arch/m68knommu/kernel/asm-offsets.c
@@ -73,7 +73,6 @@ int main(void)
 	DEFINE(TRAP_TRACE, TRAP_TRACE);
 
 	DEFINE(PT_PTRACED, PT_PTRACED);
-	DEFINE(PT_DTRACE, PT_DTRACE);
 
 	DEFINE(THREAD_SIZE, THREAD_SIZE);
 
diff --git a/arch/m68knommu/kernel/traps.c b/arch/m68knommu/kernel/traps.c
index 5d5d56bcd0e..51d325343ab 100644
--- a/arch/m68knommu/kernel/traps.c
+++ b/arch/m68knommu/kernel/traps.c
@@ -200,7 +200,6 @@ asmlinkage void trap_c(struct frame *fp)
 	if (fp->ptregs.sr & PS_S) {
 		if ((fp->ptregs.vector >> 2) == VEC_TRACE) {
 			/* traced a trapping instruction */
-			current->ptrace |= PT_DTRACE;
 		} else
 			bad_super_trap(fp);
 		return;
diff --git a/arch/mn10300/include/asm/elf.h b/arch/mn10300/include/asm/elf.h
index 49105462e6f..75a70aa9fd6 100644
--- a/arch/mn10300/include/asm/elf.h
+++ b/arch/mn10300/include/asm/elf.h
@@ -28,6 +28,8 @@
 #define R_MN10300_PCREL8	6	/* PC-relative 8-bit signed.  */
 #define R_MN10300_24		9	/* Direct 24 bit.  */
 #define R_MN10300_RELATIVE	23	/* Adjust by program base.  */
+#define R_MN10300_SYM_DIFF	33	/* Adjustment when relaxing. */
+#define R_MN10300_ALIGN 	34	/* Alignment requirement. */
 
 /*
  * ELF register definitions..
diff --git a/arch/mn10300/kernel/module.c b/arch/mn10300/kernel/module.c
index 4fa0e3648d8..6aea7fd7699 100644
--- a/arch/mn10300/kernel/module.c
+++ b/arch/mn10300/kernel/module.c
@@ -1,6 +1,6 @@
 /* MN10300 Kernel module helper routines
  *
- * Copyright (C) 2007, 2008 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2008, 2009 Red Hat, Inc. All Rights Reserved.
  * Written by Mark Salter (msalter@redhat.com)
  * - Derived from arch/i386/kernel/module.c
  *
@@ -103,10 +103,10 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
 		       unsigned int relsec,
 		       struct module *me)
 {
-	unsigned int i;
+	unsigned int i, sym_diff_seen = 0;
 	Elf32_Rela *rel = (void *)sechdrs[relsec].sh_addr;
 	Elf32_Sym *sym;
-	Elf32_Addr relocation;
+	Elf32_Addr relocation, sym_diff_val = 0;
 	uint8_t *location;
 	uint32_t value;
 
@@ -126,6 +126,22 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
 		/* this is the adjustment to be made */
 		relocation = sym->st_value + rel[i].r_addend;
 
+		if (sym_diff_seen) {
+			switch (ELF32_R_TYPE(rel[i].r_info)) {
+			case R_MN10300_32:
+			case R_MN10300_24:
+			case R_MN10300_16:
+			case R_MN10300_8:
+				relocation -= sym_diff_val;
+				sym_diff_seen = 0;
+				break;
+			default:
+				printk(KERN_ERR "module %s: Unexpected SYM_DIFF relocation: %u\n",
+				       me->name, ELF32_R_TYPE(rel[i].r_info));
+				return -ENOEXEC;
+			}
+		}
+
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 			/* for the first four relocation types, we simply
 			 * store the adjustment at the location given */
@@ -157,12 +173,29 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
 			*location = relocation - (uint32_t) location;
 			break;
 
+		case R_MN10300_SYM_DIFF:
+			/* This is used to adjust the next reloc as required
+			 * by relaxation. */
+			sym_diff_seen = 1;
+			sym_diff_val = sym->st_value;
+			break;
+
+		case R_MN10300_ALIGN:
+			/* Just ignore the ALIGN relocs.
+			 * Only interesting if kernel performed relaxation. */
+			continue;
+
 		default:
 			printk(KERN_ERR "module %s: Unknown relocation: %u\n",
 			       me->name, ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
 	}
+	if (sym_diff_seen) {
+		printk(KERN_ERR "module %s: Nothing follows SYM_DIFF relocation: %u\n",
+				       me->name, ELF32_R_TYPE(rel[i].r_info));
+		return -ENOEXEC;
+	}
 	return 0;
 }
 
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 234cf344cdc..892cce82867 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -281,9 +281,6 @@ asmlinkage long sys_execve(char __user *name,
 	error = PTR_ERR(filename);
 	if (!IS_ERR(filename)) {
 		error = do_execve(filename, argv, envp, __frame);
-		if (error == 0)
-			current->ptrace &= ~PT_DTRACE;
-
 		putname(filename);
 	}
 
diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 5cbe9f9e5d9..54075360a8f 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c
@@ -44,11 +44,6 @@ int hpux_execve(struct pt_regs *regs)
 	error = do_execve(filename, (char __user * __user *) regs->gr[25],
 		(char __user * __user *) regs->gr[24], regs);
 
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 
 out:
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 6f69101f90b..61c07078c07 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -349,11 +349,6 @@ asmlinkage int sys_execve(struct pt_regs *regs)
 		goto out;
 	error = do_execve(filename, (char __user * __user *) regs->gr[25],
 		(char __user * __user *) regs->gr[24], regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 out:
 
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index 0838155b7a8..1adb40c8166 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -77,11 +77,6 @@ asmlinkage int sys32_execve(struct pt_regs *regs)
 		goto out;
 	error = compat_do_execve(filename, compat_ptr(regs->gr[25]),
 				 compat_ptr(regs->gr[24]), regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 out:
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index ef6f64950e9..a538824616f 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1947,8 +1947,47 @@ static void __init fixup_device_tree_maple(void)
 	prom_setprop(isa, name, "ranges",
 			isa_ranges, sizeof(isa_ranges));
 }
+
+#define CPC925_MC_START		0xf8000000
+#define CPC925_MC_LENGTH	0x1000000
+/* The values for memory-controller don't have right number of cells */
+static void __init fixup_device_tree_maple_memory_controller(void)
+{
+	phandle mc;
+	u32 mc_reg[4];
+	char *name = "/hostbridge@f8000000";
+	struct prom_t *_prom = &RELOC(prom);
+	u32 ac, sc;
+
+	mc = call_prom("finddevice", 1, 1, ADDR(name));
+	if (!PHANDLE_VALID(mc))
+		return;
+
+	if (prom_getproplen(mc, "reg") != 8)
+		return;
+
+	prom_getprop(_prom->root, "#address-cells", &ac, sizeof(ac));
+	prom_getprop(_prom->root, "#size-cells", &sc, sizeof(sc));
+	if ((ac != 2) || (sc != 2))
+		return;
+
+	if (prom_getprop(mc, "reg", mc_reg, sizeof(mc_reg)) == PROM_ERROR)
+		return;
+
+	if (mc_reg[0] != CPC925_MC_START || mc_reg[1] != CPC925_MC_LENGTH)
+		return;
+
+	prom_printf("Fixing up bogus hostbridge on Maple...\n");
+
+	mc_reg[0] = 0x0;
+	mc_reg[1] = CPC925_MC_START;
+	mc_reg[2] = 0x0;
+	mc_reg[3] = CPC925_MC_LENGTH;
+	prom_setprop(mc, name, "reg", mc_reg, sizeof(mc_reg));
+}
 #else
 #define fixup_device_tree_maple()
+#define fixup_device_tree_maple_memory_controller()
 #endif
 
 #ifdef CONFIG_PPC_CHRP
@@ -2189,6 +2228,7 @@ static void __init fixup_device_tree_efika(void)
 static void __init fixup_device_tree(void)
 {
 	fixup_device_tree_maple();
+	fixup_device_tree_maple_memory_controller();
 	fixup_device_tree_chrp();
 	fixup_device_tree_pmac();
 	fixup_device_tree_efika();
diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c
index bfd60e4acce..0636a3df697 100644
--- a/arch/powerpc/platforms/maple/setup.c
+++ b/arch/powerpc/platforms/maple/setup.c
@@ -335,3 +335,62 @@ define_machine(maple) {
 	.progress		= maple_progress,
 	.power_save		= power4_idle,
 };
+
+#ifdef CONFIG_EDAC
+/*
+ * Register a platform device for CPC925 memory controller on
+ * Motorola ATCA-6101 blade.
+ */
+#define MAPLE_CPC925_MODEL	"Motorola,ATCA-6101"
+static int __init maple_cpc925_edac_setup(void)
+{
+	struct platform_device *pdev;
+	struct device_node *np = NULL;
+	struct resource r;
+	const unsigned char *model;
+	int ret;
+
+	np = of_find_node_by_path("/");
+	if (!np) {
+		printk(KERN_ERR "%s: Unable to get root node\n", __func__);
+		return -ENODEV;
+	}
+
+	model = (const unsigned char *)of_get_property(np, "model", NULL);
+	if (!model) {
+		printk(KERN_ERR "%s: Unabel to get model info\n", __func__);
+		return -ENODEV;
+	}
+
+	ret = strcmp(model, MAPLE_CPC925_MODEL);
+	of_node_put(np);
+
+	if (ret != 0)
+		return 0;
+
+	np = of_find_node_by_type(NULL, "memory-controller");
+	if (!np) {
+		printk(KERN_ERR "%s: Unable to find memory-controller node\n",
+			__func__);
+		return -ENODEV;
+	}
+
+	ret = of_address_to_resource(np, 0, &r);
+	of_node_put(np);
+
+	if (ret < 0) {
+		printk(KERN_ERR "%s: Unable to get memory-controller reg\n",
+			__func__);
+		return -ENODEV;
+	}
+
+	pdev = platform_device_register_simple("cpc925_edac", 0, &r, 1);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
+
+	printk(KERN_INFO "%s: CPC925 platform device created\n", __func__);
+
+	return 0;
+}
+machine_device_initcall(maple, maple_cpc925_edac_setup);
+#endif
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 002c70d3cb7..9ab188d67a3 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -461,9 +461,6 @@ asmlinkage long sys32_execve(void)
 		result = rc;
 		goto out_putname;
 	}
-	task_lock(current);
-	current->ptrace &= ~PT_DTRACE;
-	task_unlock(current);
 	current->thread.fp_regs.fpc=0;
 	asm volatile("sfpc %0,0" : : "d" (0));
 	result = regs->gprs[2];
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 355f7a30c3f..5a43f27eec1 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -266,9 +266,6 @@ SYSCALL_DEFINE0(vfork)
 
 asmlinkage void execve_tail(void)
 {
-	task_lock(current);
-	current->ptrace &= ~PT_DTRACE;
-	task_unlock(current);
 	current->thread.fp_regs.fpc = 0;
 	if (MACHINE_HAS_IEEE)
 		asm volatile("sfpc %0,%0" : : "d" (0));
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 9289ede29c7..601bbc0ec74 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -367,11 +367,6 @@ asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
 		goto out;
 
 	error = do_execve(filename, uargv, uenvp, regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 out:
 	return error;
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index 96be839040f..3826773496d 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -529,11 +529,6 @@ asmlinkage int sys_execve(char *ufilename, char **uargv,
 			  (char __user * __user *)uargv,
 			  (char __user * __user *)uenvp,
 			  pregs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 out:
 	return error;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cf42fc30541..73c0bda73fc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_WANT_FRAME_POINTERS
+	select HAVE_DMA_ATTRS
 	select HAVE_KRETPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 8d16ada2504..ec749c2bfdd 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -70,6 +70,7 @@ KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
 		   $(call cc-option, -mpreferred-stack-boundary=2)
 KBUILD_CFLAGS	+= $(call cc-option, -m32)
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
 
 $(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 49c8a4c37d7..e2ff504b4dd 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -15,6 +15,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
 KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
 
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index b93405b228b..1c3f9435f1c 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -33,6 +33,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 #endif
 }
 
+#include <asm-generic/dma-mapping-common.h>
+
 /* Make sure we keep the same behaviour */
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
@@ -53,177 +55,6 @@ extern int dma_set_mask(struct device *dev, u64 mask);
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag);
 
-static inline dma_addr_t
-dma_map_single(struct device *hwdev, void *ptr, size_t size,
-	       enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-	dma_addr_t addr;
-
-	BUG_ON(!valid_dma_direction(dir));
-	kmemcheck_mark_initialized(ptr, size);
-	addr = ops->map_page(hwdev, virt_to_page(ptr),
-			     (unsigned long)ptr & ~PAGE_MASK, size,
-			     dir, NULL);
-	debug_dma_map_page(hwdev, virt_to_page(ptr),
-			   (unsigned long)ptr & ~PAGE_MASK, size,
-			   dir, addr, true);
-	return addr;
-}
-
-static inline void
-dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
-		 enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
-		ops->unmap_page(dev, addr, size, dir, NULL);
-	debug_dma_unmap_page(dev, addr, size, dir, true);
-}
-
-static inline int
-dma_map_sg(struct device *hwdev, struct scatterlist *sg,
-	   int nents, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-	int ents;
-	struct scatterlist *s;
-	int i;
-
-	BUG_ON(!valid_dma_direction(dir));
-	for_each_sg(sg, s, nents, i)
-		kmemcheck_mark_initialized(sg_virt(s), s->length);
-	ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
-	debug_dma_map_sg(hwdev, sg, nents, ents, dir);
-
-	return ents;
-}
-
-static inline void
-dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
-	     enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	debug_dma_unmap_sg(hwdev, sg, nents, dir);
-	if (ops->unmap_sg)
-		ops->unmap_sg(hwdev, sg, nents, dir, NULL);
-}
-
-static inline void
-dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
-			size_t size, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_cpu)
-		ops->sync_single_for_cpu(hwdev, dma_handle, size, dir);
-	debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir);
-	flush_write_buffers();
-}
-
-static inline void
-dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
-			   size_t size, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_device)
-		ops->sync_single_for_device(hwdev, dma_handle, size, dir);
-	debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir);
-	flush_write_buffers();
-}
-
-static inline void
-dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
-			      unsigned long offset, size_t size,
-			      enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_range_for_cpu)
-		ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
-					       size, dir);
-	debug_dma_sync_single_range_for_cpu(hwdev, dma_handle,
-					    offset, size, dir);
-	flush_write_buffers();
-}
-
-static inline void
-dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
-				 unsigned long offset, size_t size,
-				 enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_range_for_device)
-		ops->sync_single_range_for_device(hwdev, dma_handle,
-						  offset, size, dir);
-	debug_dma_sync_single_range_for_device(hwdev, dma_handle,
-					       offset, size, dir);
-	flush_write_buffers();
-}
-
-static inline void
-dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
-		    int nelems, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_cpu)
-		ops->sync_sg_for_cpu(hwdev, sg, nelems, dir);
-	debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir);
-	flush_write_buffers();
-}
-
-static inline void
-dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
-		       int nelems, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_device)
-		ops->sync_sg_for_device(hwdev, sg, nelems, dir);
-	debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir);
-
-	flush_write_buffers();
-}
-
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
-				      size_t offset, size_t size,
-				      enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	dma_addr_t addr;
-
-	BUG_ON(!valid_dma_direction(dir));
-	kmemcheck_mark_initialized(page_address(page) + offset, size);
-	addr = ops->map_page(dev, page, offset, size, dir, NULL);
-	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
-
-	return addr;
-}
-
-static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
-				  size_t size, enum dma_data_direction dir)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
-		ops->unmap_page(dev, addr, size, dir, NULL);
-	debug_dma_unmap_page(dev, addr, size, dir, false);
-}
-
 static inline void
 dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	enum dma_data_direction dir)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f3477bb8456..6c327b852e2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,6 +24,8 @@ CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o		:= $(nostackp)
 CFLAGS_tsc.o		:= $(nostackp)
 CFLAGS_paravirt.o	:= $(nostackp)
+GCOV_PROFILE_vsyscall_64.o	:= n
+GCOV_PROFILE_hpet.o		:= n
 
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 167bc16ce0e..6a564ac67ef 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -42,6 +42,7 @@ KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
 		   $(call cc-option, -mpreferred-stack-boundary=2)
 KBUILD_CFLAGS	+= $(call cc-option, -m32)
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
 
 WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
 
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 16a9020c8f1..88112b49f02 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -123,6 +123,7 @@ quiet_cmd_vdso = VDSO    $@
 		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
 
 VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
+GCOV_PROFILE := n
 
 #
 # Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 031f3668571..e1a04a346e7 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -331,11 +331,6 @@ long xtensa_execve(char __user *name, char __user * __user *argv,
 	if (IS_ERR(filename))
 		goto out;
 	error = do_execve(filename, argv, envp, regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
 	putname(filename);
 out:
 	return error;
diff --git a/drivers/Kconfig b/drivers/Kconfig
index a442c8f29fc..48bbdbe43e6 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -52,6 +52,8 @@ source "drivers/i2c/Kconfig"
 
 source "drivers/spi/Kconfig"
 
+source "drivers/pps/Kconfig"
+
 source "drivers/gpio/Kconfig"
 
 source "drivers/w1/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 00b44f4ccf0..bc4205d2fc3 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_INPUT)		+= input/
 obj-$(CONFIG_I2O)		+= message/
 obj-$(CONFIG_RTC_LIB)		+= rtc/
 obj-y				+= i2c/ media/
+obj-$(CONFIG_PPS)		+= pps/
 obj-$(CONFIG_W1)		+= w1/
 obj-$(CONFIG_POWER_SUPPLY)	+= power/
 obj-$(CONFIG_HWMON)		+= hwmon/
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 30bae6de6a0..0bd01f49cfd 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -807,7 +807,7 @@ if RTC_LIB=n
 config RTC
 	tristate "Enhanced Real Time Clock Support (legacy PC RTC driver)"
 	depends on !PPC && !PARISC && !IA64 && !M68K && !SPARC && !FRV \
-			&& !ARM && !SUPERH && !S390 && !AVR32
+			&& !ARM && !SUPERH && !S390 && !AVR32 && !BLACKFIN
 	---help---
 	  If you say Y here and create a character special file /dev/rtc with
 	  major number 10 and minor number 135 using mknod ("man mknod"), you
diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 4d745a89504..4159292e35c 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -1593,7 +1593,7 @@ static unsigned int card_count;
 static int __devinit isicom_probe(struct pci_dev *pdev,
 	const struct pci_device_id *ent)
 {
-	unsigned int signature, index;
+	unsigned int uninitialized_var(signature), index;
 	int retval = -EPERM;
 	struct isi_board *board = NULL;
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f96d0bef855..afa8813e737 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -863,59 +863,58 @@ static const struct file_operations kmsg_fops = {
 	.write =	kmsg_write,
 };
 
-static int memory_open(struct inode * inode, struct file * filp)
-{
-	int ret = 0;
-
-	lock_kernel();
-	switch (iminor(inode)) {
-		case 1:
-			filp->f_op = &mem_fops;
-			filp->f_mapping->backing_dev_info =
-				&directly_mappable_cdev_bdi;
-			break;
+static const struct {
+	unsigned int		minor;
+	char			*name;
+	umode_t			mode;
+	const struct file_operations	*fops;
+	struct backing_dev_info	*dev_info;
+} devlist[] = { /* list of minor devices */
+	{1, "mem",     S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops,
+		&directly_mappable_cdev_bdi},
 #ifdef CONFIG_DEVKMEM
-		case 2:
-			filp->f_op = &kmem_fops;
-			filp->f_mapping->backing_dev_info =
-				&directly_mappable_cdev_bdi;
-			break;
+	{2, "kmem",    S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops,
+		&directly_mappable_cdev_bdi},
 #endif
-		case 3:
-			filp->f_op = &null_fops;
-			break;
+	{3, "null",    S_IRUGO | S_IWUGO,           &null_fops, NULL},
 #ifdef CONFIG_DEVPORT
-		case 4:
-			filp->f_op = &port_fops;
-			break;
+	{4, "port",    S_IRUSR | S_IWUSR | S_IRGRP, &port_fops, NULL},
 #endif
-		case 5:
-			filp->f_mapping->backing_dev_info = &zero_bdi;
-			filp->f_op = &zero_fops;
-			break;
-		case 7:
-			filp->f_op = &full_fops;
-			break;
-		case 8:
-			filp->f_op = &random_fops;
-			break;
-		case 9:
-			filp->f_op = &urandom_fops;
-			break;
-		case 11:
-			filp->f_op = &kmsg_fops;
-			break;
+	{5, "zero",    S_IRUGO | S_IWUGO,           &zero_fops, &zero_bdi},
+	{7, "full",    S_IRUGO | S_IWUGO,           &full_fops, NULL},
+	{8, "random",  S_IRUGO | S_IWUSR,           &random_fops, NULL},
+	{9, "urandom", S_IRUGO | S_IWUSR,           &urandom_fops, NULL},
+	{11,"kmsg",    S_IRUGO | S_IWUSR,           &kmsg_fops, NULL},
 #ifdef CONFIG_CRASH_DUMP
-		case 12:
-			filp->f_op = &oldmem_fops;
-			break;
+	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops, NULL},
 #endif
-		default:
-			unlock_kernel();
-			return -ENXIO;
+};
+
+static int memory_open(struct inode *inode, struct file *filp)
+{
+	int ret = 0;
+	int i;
+
+	lock_kernel();
+
+	for (i = 0; i < ARRAY_SIZE(devlist); i++) {
+		if (devlist[i].minor == iminor(inode)) {
+			filp->f_op = devlist[i].fops;
+			if (devlist[i].dev_info) {
+				filp->f_mapping->backing_dev_info =
+					devlist[i].dev_info;
+			}
+
+			break;
+		}
 	}
-	if (filp->f_op && filp->f_op->open)
-		ret = filp->f_op->open(inode,filp);
+
+	if (i == ARRAY_SIZE(devlist))
+		ret = -ENXIO;
+	else
+		if (filp->f_op && filp->f_op->open)
+			ret = filp->f_op->open(inode, filp);
+
 	unlock_kernel();
 	return ret;
 }
@@ -924,30 +923,6 @@ static const struct file_operations memory_fops = {
 	.open		= memory_open,	/* just a selector for the real open */
 };
 
-static const struct {
-	unsigned int		minor;
-	char			*name;
-	umode_t			mode;
-	const struct file_operations	*fops;
-} devlist[] = { /* list of minor devices */
-	{1, "mem",     S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops},
-#ifdef CONFIG_DEVKMEM
-	{2, "kmem",    S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops},
-#endif
-	{3, "null",    S_IRUGO | S_IWUGO,           &null_fops},
-#ifdef CONFIG_DEVPORT
-	{4, "port",    S_IRUSR | S_IWUSR | S_IRGRP, &port_fops},
-#endif
-	{5, "zero",    S_IRUGO | S_IWUGO,           &zero_fops},
-	{7, "full",    S_IRUGO | S_IWUGO,           &full_fops},
-	{8, "random",  S_IRUGO | S_IWUSR,           &random_fops},
-	{9, "urandom", S_IRUGO | S_IWUSR,           &urandom_fops},
-	{11,"kmsg",    S_IRUGO | S_IWUSR,           &kmsg_fops},
-#ifdef CONFIG_CRASH_DUMP
-	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
-#endif
-};
-
 static struct class *mem_class;
 
 static int __init chr_dev_init(void)
diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c
index c84c34fb123..432655bcb04 100644
--- a/drivers/char/ppdev.c
+++ b/drivers/char/ppdev.c
@@ -114,8 +114,7 @@ static ssize_t pp_read (struct file * file, char __user * buf, size_t count,
 
 	if (!(pp->flags & PP_CLAIMED)) {
 		/* Don't have the port claimed */
-		printk (KERN_DEBUG CHRDEV "%x: claim the port first\n",
-			minor);
+		pr_debug(CHRDEV "%x: claim the port first\n", minor);
 		return -EINVAL;
 	}
 
@@ -198,8 +197,7 @@ static ssize_t pp_write (struct file * file, const char __user * buf,
 
 	if (!(pp->flags & PP_CLAIMED)) {
 		/* Don't have the port claimed */
-		printk (KERN_DEBUG CHRDEV "%x: claim the port first\n",
-			minor);
+		pr_debug(CHRDEV "%x: claim the port first\n", minor);
 		return -EINVAL;
 	}
 
@@ -313,7 +311,7 @@ static int register_device (int minor, struct pp_struct *pp)
 	}
 
 	pp->pdev = pdev;
-	printk (KERN_DEBUG "%s: registered pardevice\n", name);
+	pr_debug("%s: registered pardevice\n", name);
 	return 0;
 }
 
@@ -343,8 +341,7 @@ static int pp_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		int ret;
 
 		if (pp->flags & PP_CLAIMED) {
-			printk (KERN_DEBUG CHRDEV
-				"%x: you've already got it!\n", minor);
+			pr_debug(CHRDEV "%x: you've already got it!\n", minor);
 			return -EINVAL;
 		}
 
@@ -379,7 +376,7 @@ static int pp_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	    }
 	case PPEXCL:
 		if (pp->pdev) {
-			printk (KERN_DEBUG CHRDEV "%x: too late for PPEXCL; "
+			pr_debug(CHRDEV "%x: too late for PPEXCL; "
 				"already registered\n", minor);
 			if (pp->flags & PP_EXCL)
 				/* But it's not really an error. */
@@ -491,8 +488,7 @@ static int pp_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	/* Everything else requires the port to be claimed, so check
 	 * that now. */
 	if ((pp->flags & PP_CLAIMED) == 0) {
-		printk (KERN_DEBUG CHRDEV "%x: claim the port first\n",
-			minor);
+		pr_debug(CHRDEV "%x: claim the port first\n", minor);
 		return -EINVAL;
 	}
 
@@ -624,8 +620,7 @@ static int pp_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return 0;
 
 	default:
-		printk (KERN_DEBUG CHRDEV "%x: What? (cmd=0x%x)\n", minor,
-			cmd);
+		pr_debug(CHRDEV "%x: What? (cmd=0x%x)\n", minor, cmd);
 		return -EINVAL;
 	}
 
@@ -698,9 +693,8 @@ static int pp_release (struct inode * inode, struct file * file)
 	}
 	if (compat_negot) {
 		parport_negotiate (pp->pdev->port, IEEE1284_MODE_COMPAT);
-		printk (KERN_DEBUG CHRDEV
-			"%x: negotiated back to compatibility mode because "
-			"user-space forgot\n", minor);
+		pr_debug(CHRDEV "%x: negotiated back to compatibility "
+			"mode because user-space forgot\n", minor);
 	}
 
 	if (pp->flags & PP_CLAIMED) {
@@ -713,7 +707,7 @@ static int pp_release (struct inode * inode, struct file * file)
 		info->phase = pp->saved_state.phase;
 		parport_release (pp->pdev);
 		if (compat_negot != 1) {
-			printk (KERN_DEBUG CHRDEV "%x: released pardevice "
+			pr_debug(CHRDEV "%x: released pardevice "
 				"because user-space forgot\n", minor);
 		}
 	}
@@ -723,8 +717,7 @@ static int pp_release (struct inode * inode, struct file * file)
 		parport_unregister_device (pp->pdev);
 		kfree (name);
 		pp->pdev = NULL;
-		printk (KERN_DEBUG CHRDEV "%x: unregistered pardevice\n",
-			minor);
+		pr_debug(CHRDEV "%x: unregistered pardevice\n", minor);
 	}
 
 	kfree (pp);
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 39a05b5fa9c..0db35857e4d 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -121,20 +121,17 @@ static struct sysrq_key_op sysrq_unraw_op = {
 #define sysrq_unraw_op (*(struct sysrq_key_op *)0)
 #endif /* CONFIG_VT */
 
-#ifdef CONFIG_KEXEC
-static void sysrq_handle_crashdump(int key, struct tty_struct *tty)
+static void sysrq_handle_crash(int key, struct tty_struct *tty)
 {
-	crash_kexec(get_irq_regs());
+	char *killer = NULL;
+	*killer = 1;
 }
 static struct sysrq_key_op sysrq_crashdump_op = {
-	.handler	= sysrq_handle_crashdump,
-	.help_msg	= "Crashdump",
-	.action_msg	= "Trigger a crashdump",
+	.handler	= sysrq_handle_crash,
+	.help_msg	= "Crash",
+	.action_msg	= "Trigger a crash",
 	.enable_mask	= SYSRQ_ENABLE_DUMP,
 };
-#else
-#define sysrq_crashdump_op (*(struct sysrq_key_op *)0)
-#endif
 
 static void sysrq_handle_reboot(int key, struct tty_struct *tty)
 {
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index ab4f3592a11..4339b1a879c 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -5,7 +5,7 @@
 #
 
 menuconfig EDAC
-	bool "EDAC - error detection and reporting"
+	bool "EDAC (Error Detection And Correction) reporting"
 	depends on HAS_IOMEM
 	depends on X86 || PPC
 	help
@@ -232,4 +232,13 @@ config EDAC_AMD8111
 	  Note, add more Kconfig dependency if it's adopted
 	  on some machine other than Maple.
 
+config EDAC_CPC925
+	tristate "IBM CPC925 Memory Controller (PPC970FX)"
+	depends on EDAC_MM_EDAC && PPC64
+	help
+	  Support for error detection and correction on the
+	  IBM CPC925 Bridge and Memory Controller, which is
+	  a companion chip to the PowerPC 970 family of
+	  processors.
+
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 633dc5604ee..98aa4a7db41 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -18,6 +18,7 @@ edac_core-objs	+= edac_pci.o edac_pci_sysfs.o
 endif
 
 obj-$(CONFIG_EDAC_AMD76X)		+= amd76x_edac.o
+obj-$(CONFIG_EDAC_CPC925)		+= cpc925_edac.o
 obj-$(CONFIG_EDAC_I5000)		+= i5000_edac.o
 obj-$(CONFIG_EDAC_I5100)		+= i5100_edac.o
 obj-$(CONFIG_EDAC_I5400)		+= i5400_edac.o
diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c
index 2cb58ef743e..35b78d04bbf 100644
--- a/drivers/edac/amd8111_edac.c
+++ b/drivers/edac/amd8111_edac.c
@@ -37,7 +37,6 @@
 #define AMD8111_EDAC_MOD_STR	"amd8111_edac"
 
 #define PCI_DEVICE_ID_AMD_8111_PCI	0x7460
-static int edac_dev_idx;
 
 enum amd8111_edac_devs {
 	LPC_BRIDGE = 0,
@@ -377,7 +376,7 @@ static int amd8111_dev_probe(struct pci_dev *dev,
 	 * edac_device_ctl_info, but make use of existing
 	 * one instead.
 	*/
-	dev_info->edac_idx = edac_dev_idx++;
+	dev_info->edac_idx = edac_device_alloc_index();
 	dev_info->edac_dev =
 		edac_device_alloc_ctl_info(0, dev_info->ctl_name, 1,
 					   NULL, 0, 0,
diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c
index cb0f639f049..c973004c002 100644
--- a/drivers/edac/cell_edac.c
+++ b/drivers/edac/cell_edac.c
@@ -227,7 +227,7 @@ static struct platform_driver cell_edac_driver = {
 		.owner	= THIS_MODULE,
 	},
 	.probe		= cell_edac_probe,
-	.remove		= cell_edac_remove,
+	.remove		= __devexit_p(cell_edac_remove),
 };
 
 static int __init cell_edac_init(void)
diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
new file mode 100644
index 00000000000..8c54196b5ab
--- /dev/null
+++ b/drivers/edac/cpc925_edac.c
@@ -0,0 +1,1017 @@
+/*
+ * cpc925_edac.c, EDAC driver for IBM CPC925 Bridge and Memory Controller.
+ *
+ * Copyright (c) 2008 Wind River Systems, Inc.
+ *
+ * Authors:	Cao Qingtao <qingtao.cao@windriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/edac.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+#include "edac_core.h"
+#include "edac_module.h"
+
+#define CPC925_EDAC_REVISION	" Ver: 1.0.0 " __DATE__
+#define CPC925_EDAC_MOD_STR	"cpc925_edac"
+
+#define cpc925_printk(level, fmt, arg...) \
+	edac_printk(level, "CPC925", fmt, ##arg)
+
+#define cpc925_mc_printk(mci, level, fmt, arg...) \
+	edac_mc_chipset_printk(mci, level, "CPC925", fmt, ##arg)
+
+/*
+ * CPC925 registers are of 32 bits with bit0 defined at the
+ * most significant bit and bit31 at that of least significant.
+ */
+#define CPC925_BITS_PER_REG	32
+#define CPC925_BIT(nr)		(1UL << (CPC925_BITS_PER_REG - 1 - nr))
+
+/*
+ * EDAC device names for the error detections of
+ * CPU Interface and Hypertransport Link.
+ */
+#define CPC925_CPU_ERR_DEV	"cpu"
+#define CPC925_HT_LINK_DEV	"htlink"
+
+/* Suppose DDR Refresh cycle is 15.6 microsecond */
+#define CPC925_REF_FREQ		0xFA69
+#define CPC925_SCRUB_BLOCK_SIZE 64	/* bytes */
+#define CPC925_NR_CSROWS	8
+
+/*
+ * All registers and bits definitions are taken from
+ * "CPC925 Bridge and Memory Controller User Manual, SA14-2761-02".
+ */
+
+/*
+ * CPU and Memory Controller Registers
+ */
+/************************************************************
+ *	Processor Interface Exception Mask Register (APIMASK)
+ ************************************************************/
+#define REG_APIMASK_OFFSET	0x30070
+enum apimask_bits {
+	APIMASK_DART	= CPC925_BIT(0), /* DART Exception */
+	APIMASK_ADI0	= CPC925_BIT(1), /* Handshake Error on PI0_ADI */
+	APIMASK_ADI1	= CPC925_BIT(2), /* Handshake Error on PI1_ADI */
+	APIMASK_STAT	= CPC925_BIT(3), /* Status Exception */
+	APIMASK_DERR	= CPC925_BIT(4), /* Data Error Exception */
+	APIMASK_ADRS0	= CPC925_BIT(5), /* Addressing Exception on PI0 */
+	APIMASK_ADRS1	= CPC925_BIT(6), /* Addressing Exception on PI1 */
+					 /* BIT(7) Reserved */
+	APIMASK_ECC_UE_H = CPC925_BIT(8), /* UECC upper */
+	APIMASK_ECC_CE_H = CPC925_BIT(9), /* CECC upper */
+	APIMASK_ECC_UE_L = CPC925_BIT(10), /* UECC lower */
+	APIMASK_ECC_CE_L = CPC925_BIT(11), /* CECC lower */
+
+	CPU_MASK_ENABLE = (APIMASK_DART | APIMASK_ADI0 | APIMASK_ADI1 |
+			   APIMASK_STAT | APIMASK_DERR | APIMASK_ADRS0 |
+			   APIMASK_ADRS1),
+	ECC_MASK_ENABLE = (APIMASK_ECC_UE_H | APIMASK_ECC_CE_H |
+			   APIMASK_ECC_UE_L | APIMASK_ECC_CE_L),
+};
+
+/************************************************************
+ *	Processor Interface Exception Register (APIEXCP)
+ ************************************************************/
+#define REG_APIEXCP_OFFSET	0x30060
+enum apiexcp_bits {
+	APIEXCP_DART	= CPC925_BIT(0), /* DART Exception */
+	APIEXCP_ADI0	= CPC925_BIT(1), /* Handshake Error on PI0_ADI */
+	APIEXCP_ADI1	= CPC925_BIT(2), /* Handshake Error on PI1_ADI */
+	APIEXCP_STAT	= CPC925_BIT(3), /* Status Exception */
+	APIEXCP_DERR	= CPC925_BIT(4), /* Data Error Exception */
+	APIEXCP_ADRS0	= CPC925_BIT(5), /* Addressing Exception on PI0 */
+	APIEXCP_ADRS1	= CPC925_BIT(6), /* Addressing Exception on PI1 */
+					 /* BIT(7) Reserved */
+	APIEXCP_ECC_UE_H = CPC925_BIT(8), /* UECC upper */
+	APIEXCP_ECC_CE_H = CPC925_BIT(9), /* CECC upper */
+	APIEXCP_ECC_UE_L = CPC925_BIT(10), /* UECC lower */
+	APIEXCP_ECC_CE_L = CPC925_BIT(11), /* CECC lower */
+
+	CPU_EXCP_DETECTED = (APIEXCP_DART | APIEXCP_ADI0 | APIEXCP_ADI1 |
+			     APIEXCP_STAT | APIEXCP_DERR | APIEXCP_ADRS0 |
+			     APIEXCP_ADRS1),
+	UECC_EXCP_DETECTED = (APIEXCP_ECC_UE_H | APIEXCP_ECC_UE_L),
+	CECC_EXCP_DETECTED = (APIEXCP_ECC_CE_H | APIEXCP_ECC_CE_L),
+	ECC_EXCP_DETECTED = (UECC_EXCP_DETECTED | CECC_EXCP_DETECTED),
+};
+
+/************************************************************
+ *	Memory Bus Configuration Register (MBCR)
+************************************************************/
+#define REG_MBCR_OFFSET		0x2190
+#define MBCR_64BITCFG_SHIFT	23
+#define MBCR_64BITCFG_MASK	(1UL << MBCR_64BITCFG_SHIFT)
+#define MBCR_64BITBUS_SHIFT	22
+#define MBCR_64BITBUS_MASK	(1UL << MBCR_64BITBUS_SHIFT)
+
+/************************************************************
+ *	Memory Bank Mode Register (MBMR)
+************************************************************/
+#define REG_MBMR_OFFSET		0x21C0
+#define MBMR_MODE_MAX_VALUE	0xF
+#define MBMR_MODE_SHIFT		25
+#define MBMR_MODE_MASK		(MBMR_MODE_MAX_VALUE << MBMR_MODE_SHIFT)
+#define MBMR_BBA_SHIFT		24
+#define MBMR_BBA_MASK		(1UL << MBMR_BBA_SHIFT)
+
+/************************************************************
+ *	Memory Bank Boundary Address Register (MBBAR)
+ ************************************************************/
+#define REG_MBBAR_OFFSET	0x21D0
+#define MBBAR_BBA_MAX_VALUE	0xFF
+#define MBBAR_BBA_SHIFT		24
+#define MBBAR_BBA_MASK		(MBBAR_BBA_MAX_VALUE << MBBAR_BBA_SHIFT)
+
+/************************************************************
+ *	Memory Scrub Control Register (MSCR)
+ ************************************************************/
+#define REG_MSCR_OFFSET		0x2400
+#define MSCR_SCRUB_MOD_MASK	0xC0000000 /* scrub_mod - bit0:1*/
+#define MSCR_BACKGR_SCRUB	0x40000000 /* 01 */
+#define MSCR_SI_SHIFT		16 	/* si - bit8:15*/
+#define MSCR_SI_MAX_VALUE	0xFF
+#define MSCR_SI_MASK		(MSCR_SI_MAX_VALUE << MSCR_SI_SHIFT)
+
+/************************************************************
+ *	Memory Scrub Range Start Register (MSRSR)
+ ************************************************************/
+#define REG_MSRSR_OFFSET	0x2410
+
+/************************************************************
+ *	Memory Scrub Range End Register (MSRER)
+ ************************************************************/
+#define REG_MSRER_OFFSET	0x2420
+
+/************************************************************
+ *	Memory Scrub Pattern Register (MSPR)
+ ************************************************************/
+#define REG_MSPR_OFFSET		0x2430
+
+/************************************************************
+ *	Memory Check Control Register (MCCR)
+ ************************************************************/
+#define REG_MCCR_OFFSET		0x2440
+enum mccr_bits {
+	MCCR_ECC_EN	= CPC925_BIT(0), /* ECC high and low check */
+};
+
+/************************************************************
+ *	Memory Check Range End Register (MCRER)
+ ************************************************************/
+#define REG_MCRER_OFFSET	0x2450
+
+/************************************************************
+ *	Memory Error Address Register (MEAR)
+ ************************************************************/
+#define REG_MEAR_OFFSET		0x2460
+#define MEAR_BCNT_MAX_VALUE	0x3
+#define MEAR_BCNT_SHIFT		30
+#define MEAR_BCNT_MASK		(MEAR_BCNT_MAX_VALUE << MEAR_BCNT_SHIFT)
+#define MEAR_RANK_MAX_VALUE	0x7
+#define MEAR_RANK_SHIFT		27
+#define MEAR_RANK_MASK		(MEAR_RANK_MAX_VALUE << MEAR_RANK_SHIFT)
+#define MEAR_COL_MAX_VALUE	0x7FF
+#define MEAR_COL_SHIFT		16
+#define MEAR_COL_MASK		(MEAR_COL_MAX_VALUE << MEAR_COL_SHIFT)
+#define MEAR_BANK_MAX_VALUE	0x3
+#define MEAR_BANK_SHIFT		14
+#define MEAR_BANK_MASK		(MEAR_BANK_MAX_VALUE << MEAR_BANK_SHIFT)
+#define MEAR_ROW_MASK		0x00003FFF
+
+/************************************************************
+ *	Memory Error Syndrome Register (MESR)
+ ************************************************************/
+#define REG_MESR_OFFSET		0x2470
+#define MESR_ECC_SYN_H_MASK	0xFF00
+#define MESR_ECC_SYN_L_MASK	0x00FF
+
+/************************************************************
+ *	Memory Mode Control Register (MMCR)
+ ************************************************************/
+#define REG_MMCR_OFFSET		0x2500
+enum mmcr_bits {
+	MMCR_REG_DIMM_MODE = CPC925_BIT(3),
+};
+
+/*
+ * HyperTransport Link Registers
+ */
+/************************************************************
+ *  Error Handling/Enumeration Scratch Pad Register (ERRCTRL)
+ ************************************************************/
+#define REG_ERRCTRL_OFFSET	0x70140
+enum errctrl_bits {			 /* nonfatal interrupts for */
+	ERRCTRL_SERR_NF	= CPC925_BIT(0), /* system error */
+	ERRCTRL_CRC_NF	= CPC925_BIT(1), /* CRC error */
+	ERRCTRL_RSP_NF	= CPC925_BIT(2), /* Response error */
+	ERRCTRL_EOC_NF	= CPC925_BIT(3), /* End-Of-Chain error */
+	ERRCTRL_OVF_NF	= CPC925_BIT(4), /* Overflow error */
+	ERRCTRL_PROT_NF	= CPC925_BIT(5), /* Protocol error */
+
+	ERRCTRL_RSP_ERR	= CPC925_BIT(6), /* Response error received */
+	ERRCTRL_CHN_FAL = CPC925_BIT(7), /* Sync flooding detected */
+
+	HT_ERRCTRL_ENABLE = (ERRCTRL_SERR_NF | ERRCTRL_CRC_NF |
+			     ERRCTRL_RSP_NF | ERRCTRL_EOC_NF |
+			     ERRCTRL_OVF_NF | ERRCTRL_PROT_NF),
+	HT_ERRCTRL_DETECTED = (ERRCTRL_RSP_ERR | ERRCTRL_CHN_FAL),
+};
+
+/************************************************************
+ *  Link Configuration and Link Control Register (LINKCTRL)
+ ************************************************************/
+#define REG_LINKCTRL_OFFSET	0x70110
+enum linkctrl_bits {
+	LINKCTRL_CRC_ERR	= (CPC925_BIT(22) | CPC925_BIT(23)),
+	LINKCTRL_LINK_FAIL	= CPC925_BIT(27),
+
+	HT_LINKCTRL_DETECTED	= (LINKCTRL_CRC_ERR | LINKCTRL_LINK_FAIL),
+};
+
+/************************************************************
+ *  Link FreqCap/Error/Freq/Revision ID Register (LINKERR)
+ ************************************************************/
+#define REG_LINKERR_OFFSET	0x70120
+enum linkerr_bits {
+	LINKERR_EOC_ERR		= CPC925_BIT(17), /* End-Of-Chain error */
+	LINKERR_OVF_ERR		= CPC925_BIT(18), /* Receive Buffer Overflow */
+	LINKERR_PROT_ERR	= CPC925_BIT(19), /* Protocol error */
+
+	HT_LINKERR_DETECTED	= (LINKERR_EOC_ERR | LINKERR_OVF_ERR |
+				   LINKERR_PROT_ERR),
+};
+
+/************************************************************
+ *	Bridge Control Register (BRGCTRL)
+ ************************************************************/
+#define REG_BRGCTRL_OFFSET	0x70300
+enum brgctrl_bits {
+	BRGCTRL_DETSERR = CPC925_BIT(0), /* SERR on Secondary Bus */
+	BRGCTRL_SECBUSRESET = CPC925_BIT(9), /* Secondary Bus Reset */
+};
+
+/* Private structure for edac memory controller */
+struct cpc925_mc_pdata {
+	void __iomem *vbase;
+	unsigned long total_mem;
+	const char *name;
+	int edac_idx;
+};
+
+/* Private structure for common edac device */
+struct cpc925_dev_info {
+	void __iomem *vbase;
+	struct platform_device *pdev;
+	char *ctl_name;
+	int edac_idx;
+	struct edac_device_ctl_info *edac_dev;
+	void (*init)(struct cpc925_dev_info *dev_info);
+	void (*exit)(struct cpc925_dev_info *dev_info);
+	void (*check)(struct edac_device_ctl_info *edac_dev);
+};
+
+/* Get total memory size from Open Firmware DTB */
+static void get_total_mem(struct cpc925_mc_pdata *pdata)
+{
+	struct device_node *np = NULL;
+	const unsigned int *reg, *reg_end;
+	int len, sw, aw;
+	unsigned long start, size;
+
+	np = of_find_node_by_type(NULL, "memory");
+	if (!np)
+		return;
+
+	aw = of_n_addr_cells(np);
+	sw = of_n_size_cells(np);
+	reg = (const unsigned int *)of_get_property(np, "reg", &len);
+	reg_end = reg + len/4;
+
+	pdata->total_mem = 0;
+	do {
+		start = of_read_number(reg, aw);
+		reg += aw;
+		size = of_read_number(reg, sw);
+		reg += sw;
+		debugf1("%s: start 0x%lx, size 0x%lx\n", __func__,
+			start, size);
+		pdata->total_mem += size;
+	} while (reg < reg_end);
+
+	of_node_put(np);
+	debugf0("%s: total_mem 0x%lx\n", __func__, pdata->total_mem);
+}
+
+static void cpc925_init_csrows(struct mem_ctl_info *mci)
+{
+	struct cpc925_mc_pdata *pdata = mci->pvt_info;
+	struct csrow_info *csrow;
+	int index;
+	u32 mbmr, mbbar, bba;
+	unsigned long row_size, last_nr_pages = 0;
+
+	get_total_mem(pdata);
+
+	for (index = 0; index < mci->nr_csrows; index++) {
+		mbmr = __raw_readl(pdata->vbase + REG_MBMR_OFFSET +
+				   0x20 * index);
+		mbbar = __raw_readl(pdata->vbase + REG_MBBAR_OFFSET +
+				   0x20 + index);
+		bba = (((mbmr & MBMR_BBA_MASK) >> MBMR_BBA_SHIFT) << 8) |
+		       ((mbbar & MBBAR_BBA_MASK) >> MBBAR_BBA_SHIFT);
+
+		if (bba == 0)
+			continue; /* not populated */
+
+		csrow = &mci->csrows[index];
+
+		row_size = bba * (1UL << 28);	/* 256M */
+		csrow->first_page = last_nr_pages;
+		csrow->nr_pages = row_size >> PAGE_SHIFT;
+		csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
+		last_nr_pages = csrow->last_page + 1;
+
+		csrow->mtype = MEM_RDDR;
+		csrow->edac_mode = EDAC_SECDED;
+
+		switch (csrow->nr_channels) {
+		case 1: /* Single channel */
+			csrow->grain = 32; /* four-beat burst of 32 bytes */
+			break;
+		case 2: /* Dual channel */
+		default:
+			csrow->grain = 64; /* four-beat burst of 64 bytes */
+			break;
+		}
+
+		switch ((mbmr & MBMR_MODE_MASK) >> MBMR_MODE_SHIFT) {
+		case 6: /* 0110, no way to differentiate X8 VS X16 */
+		case 5:	/* 0101 */
+		case 8: /* 1000 */
+			csrow->dtype = DEV_X16;
+			break;
+		case 7: /* 0111 */
+		case 9: /* 1001 */
+			csrow->dtype = DEV_X8;
+			break;
+		default:
+			csrow->dtype = DEV_UNKNOWN;
+			break;
+		}
+	}
+}
+
+/* Enable memory controller ECC detection */
+static void cpc925_mc_init(struct mem_ctl_info *mci)
+{
+	struct cpc925_mc_pdata *pdata = mci->pvt_info;
+	u32 apimask;
+	u32 mccr;
+
+	/* Enable various ECC error exceptions */
+	apimask = __raw_readl(pdata->vbase + REG_APIMASK_OFFSET);
+	if ((apimask & ECC_MASK_ENABLE) == 0) {
+		apimask |= ECC_MASK_ENABLE;
+		__raw_writel(apimask, pdata->vbase + REG_APIMASK_OFFSET);
+	}
+
+	/* Enable ECC detection */
+	mccr = __raw_readl(pdata->vbase + REG_MCCR_OFFSET);
+	if ((mccr & MCCR_ECC_EN) == 0) {
+		mccr |= MCCR_ECC_EN;
+		__raw_writel(mccr, pdata->vbase + REG_MCCR_OFFSET);
+	}
+}
+
+/* Disable memory controller ECC detection */
+static void cpc925_mc_exit(struct mem_ctl_info *mci)
+{
+	/*
+	 * WARNING:
+	 * We are supposed to clear the ECC error detection bits,
+	 * and it will be no problem to do so. However, once they
+	 * are cleared here if we want to re-install CPC925 EDAC
+	 * module later, setting them up in cpc925_mc_init() will
+	 * trigger machine check exception.
+	 * Also, it's ok to leave ECC error detection bits enabled,
+	 * since they are reset to 1 by default or by boot loader.
+	 */
+
+	return;
+}
+
+/*
+ * Revert DDR column/row/bank addresses into page frame number and
+ * offset in page.
+ *
+ * Suppose memory mode is 0x0111(128-bit mode, identical DIMM pairs),
+ * physical address(PA) bits to column address(CA) bits mappings are:
+ * CA	0   1   2   3   4   5   6   7   8   9   10
+ * PA	59  58  57  56  55  54  53  52  51  50  49
+ *
+ * physical address(PA) bits to bank address(BA) bits mappings are:
+ * BA	0   1
+ * PA	43  44
+ *
+ * physical address(PA) bits to row address(RA) bits mappings are:
+ * RA	0   1   2   3   4   5   6   7   8   9   10   11   12
+ * PA	36  35  34  48  47  46  45  40  41  42  39   38   37
+ */
+static void cpc925_mc_get_pfn(struct mem_ctl_info *mci, u32 mear,
+		unsigned long *pfn, unsigned long *offset, int *csrow)
+{
+	u32 bcnt, rank, col, bank, row;
+	u32 c;
+	unsigned long pa;
+	int i;
+
+	bcnt = (mear & MEAR_BCNT_MASK) >> MEAR_BCNT_SHIFT;
+	rank = (mear & MEAR_RANK_MASK) >> MEAR_RANK_SHIFT;
+	col = (mear & MEAR_COL_MASK) >> MEAR_COL_SHIFT;
+	bank = (mear & MEAR_BANK_MASK) >> MEAR_BANK_SHIFT;
+	row = mear & MEAR_ROW_MASK;
+
+	*csrow = rank;
+
+#ifdef CONFIG_EDAC_DEBUG
+	if (mci->csrows[rank].first_page == 0) {
+		cpc925_mc_printk(mci, KERN_ERR, "ECC occurs in a "
+			"non-populated csrow, broken hardware?\n");
+		return;
+	}
+#endif
+
+	/* Revert csrow number */
+	pa = mci->csrows[rank].first_page << PAGE_SHIFT;
+
+	/* Revert column address */
+	col += bcnt;
+	for (i = 0; i < 11; i++) {
+		c = col & 0x1;
+		col >>= 1;
+		pa |= c << (14 - i);
+	}
+
+	/* Revert bank address */
+	pa |= bank << 19;
+
+	/* Revert row address, in 4 steps */
+	for (i = 0; i < 3; i++) {
+		c = row & 0x1;
+		row >>= 1;
+		pa |= c << (26 - i);
+	}
+
+	for (i = 0; i < 3; i++) {
+		c = row & 0x1;
+		row >>= 1;
+		pa |= c << (21 + i);
+	}
+
+	for (i = 0; i < 4; i++) {
+		c = row & 0x1;
+		row >>= 1;
+		pa |= c << (18 - i);
+	}
+
+	for (i = 0; i < 3; i++) {
+		c = row & 0x1;
+		row >>= 1;
+		pa |= c << (29 - i);
+	}
+
+	*offset = pa & (PAGE_SIZE - 1);
+	*pfn = pa >> PAGE_SHIFT;
+
+	debugf0("%s: ECC physical address 0x%lx\n", __func__, pa);
+}
+
+static int cpc925_mc_find_channel(struct mem_ctl_info *mci, u16 syndrome)
+{
+	if ((syndrome & MESR_ECC_SYN_H_MASK) == 0)
+		return 0;
+
+	if ((syndrome & MESR_ECC_SYN_L_MASK) == 0)
+		return 1;
+
+	cpc925_mc_printk(mci, KERN_INFO, "Unexpected syndrome value: 0x%x\n",
+			 syndrome);
+	return 1;
+}
+
+/* Check memory controller registers for ECC errors */
+static void cpc925_mc_check(struct mem_ctl_info *mci)
+{
+	struct cpc925_mc_pdata *pdata = mci->pvt_info;
+	u32 apiexcp;
+	u32 mear;
+	u32 mesr;
+	u16 syndrome;
+	unsigned long pfn = 0, offset = 0;
+	int csrow = 0, channel = 0;
+
+	/* APIEXCP is cleared when read */
+	apiexcp = __raw_readl(pdata->vbase + REG_APIEXCP_OFFSET);
+	if ((apiexcp & ECC_EXCP_DETECTED) == 0)
+		return;
+
+	mesr = __raw_readl(pdata->vbase + REG_MESR_OFFSET);
+	syndrome = mesr | (MESR_ECC_SYN_H_MASK | MESR_ECC_SYN_L_MASK);
+
+	mear = __raw_readl(pdata->vbase + REG_MEAR_OFFSET);
+
+	/* Revert column/row addresses into page frame number, etc */
+	cpc925_mc_get_pfn(mci, mear, &pfn, &offset, &csrow);
+
+	if (apiexcp & CECC_EXCP_DETECTED) {
+		cpc925_mc_printk(mci, KERN_INFO, "DRAM CECC Fault\n");
+		channel = cpc925_mc_find_channel(mci, syndrome);
+		edac_mc_handle_ce(mci, pfn, offset, syndrome,
+				  csrow, channel, mci->ctl_name);
+	}
+
+	if (apiexcp & UECC_EXCP_DETECTED) {
+		cpc925_mc_printk(mci, KERN_INFO, "DRAM UECC Fault\n");
+		edac_mc_handle_ue(mci, pfn, offset, csrow, mci->ctl_name);
+	}
+
+	cpc925_mc_printk(mci, KERN_INFO, "Dump registers:\n");
+	cpc925_mc_printk(mci, KERN_INFO, "APIMASK		0x%08x\n",
+		__raw_readl(pdata->vbase + REG_APIMASK_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "APIEXCP		0x%08x\n",
+		apiexcp);
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Scrub Ctrl	0x%08x\n",
+		__raw_readl(pdata->vbase + REG_MSCR_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Scrub Rge Start	0x%08x\n",
+		__raw_readl(pdata->vbase + REG_MSRSR_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Scrub Rge End	0x%08x\n",
+		__raw_readl(pdata->vbase + REG_MSRER_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Scrub Pattern	0x%08x\n",
+		__raw_readl(pdata->vbase + REG_MSPR_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Chk Ctrl		0x%08x\n",
+		__raw_readl(pdata->vbase + REG_MCCR_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Chk Rge End	0x%08x\n",
+		__raw_readl(pdata->vbase + REG_MCRER_OFFSET));
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Err Address	0x%08x\n",
+		mesr);
+	cpc925_mc_printk(mci, KERN_INFO, "Mem Err Syndrome	0x%08x\n",
+		syndrome);
+}
+
+/******************** CPU err device********************************/
+/* Enable CPU Errors detection */
+static void cpc925_cpu_init(struct cpc925_dev_info *dev_info)
+{
+	u32 apimask;
+
+	apimask = __raw_readl(dev_info->vbase + REG_APIMASK_OFFSET);
+	if ((apimask & CPU_MASK_ENABLE) == 0) {
+		apimask |= CPU_MASK_ENABLE;
+		__raw_writel(apimask, dev_info->vbase + REG_APIMASK_OFFSET);
+	}
+}
+
+/* Disable CPU Errors detection */
+static void cpc925_cpu_exit(struct cpc925_dev_info *dev_info)
+{
+	/*
+	 * WARNING:
+	 * We are supposed to clear the CPU error detection bits,
+	 * and it will be no problem to do so. However, once they
+	 * are cleared here if we want to re-install CPC925 EDAC
+	 * module later, setting them up in cpc925_cpu_init() will
+	 * trigger machine check exception.
+	 * Also, it's ok to leave CPU error detection bits enabled,
+	 * since they are reset to 1 by default.
+	 */
+
+	return;
+}
+
+/* Check for CPU Errors */
+static void cpc925_cpu_check(struct edac_device_ctl_info *edac_dev)
+{
+	struct cpc925_dev_info *dev_info = edac_dev->pvt_info;
+	u32 apiexcp;
+	u32 apimask;
+
+	/* APIEXCP is cleared when read */
+	apiexcp = __raw_readl(dev_info->vbase + REG_APIEXCP_OFFSET);
+	if ((apiexcp & CPU_EXCP_DETECTED) == 0)
+		return;
+
+	apimask = __raw_readl(dev_info->vbase + REG_APIMASK_OFFSET);
+	cpc925_printk(KERN_INFO, "Processor Interface Fault\n"
+				 "Processor Interface register dump:\n");
+	cpc925_printk(KERN_INFO, "APIMASK		0x%08x\n", apimask);
+	cpc925_printk(KERN_INFO, "APIEXCP		0x%08x\n", apiexcp);
+
+	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+}
+
+/******************** HT Link err device****************************/
+/* Enable HyperTransport Link Error detection */
+static void cpc925_htlink_init(struct cpc925_dev_info *dev_info)
+{
+	u32 ht_errctrl;
+
+	ht_errctrl = __raw_readl(dev_info->vbase + REG_ERRCTRL_OFFSET);
+	if ((ht_errctrl & HT_ERRCTRL_ENABLE) == 0) {
+		ht_errctrl |= HT_ERRCTRL_ENABLE;
+		__raw_writel(ht_errctrl, dev_info->vbase + REG_ERRCTRL_OFFSET);
+	}
+}
+
+/* Disable HyperTransport Link Error detection */
+static void cpc925_htlink_exit(struct cpc925_dev_info *dev_info)
+{
+	u32 ht_errctrl;
+
+	ht_errctrl = __raw_readl(dev_info->vbase + REG_ERRCTRL_OFFSET);
+	ht_errctrl &= ~HT_ERRCTRL_ENABLE;
+	__raw_writel(ht_errctrl, dev_info->vbase + REG_ERRCTRL_OFFSET);
+}
+
+/* Check for HyperTransport Link errors */
+static void cpc925_htlink_check(struct edac_device_ctl_info *edac_dev)
+{
+	struct cpc925_dev_info *dev_info = edac_dev->pvt_info;
+	u32 brgctrl = __raw_readl(dev_info->vbase + REG_BRGCTRL_OFFSET);
+	u32 linkctrl = __raw_readl(dev_info->vbase + REG_LINKCTRL_OFFSET);
+	u32 errctrl = __raw_readl(dev_info->vbase + REG_ERRCTRL_OFFSET);
+	u32 linkerr = __raw_readl(dev_info->vbase + REG_LINKERR_OFFSET);
+
+	if (!((brgctrl & BRGCTRL_DETSERR) ||
+	      (linkctrl & HT_LINKCTRL_DETECTED) ||
+	      (errctrl & HT_ERRCTRL_DETECTED) ||
+	      (linkerr & HT_LINKERR_DETECTED)))
+		return;
+
+	cpc925_printk(KERN_INFO, "HT Link Fault\n"
+				 "HT register dump:\n");
+	cpc925_printk(KERN_INFO, "Bridge Ctrl			0x%08x\n",
+		      brgctrl);
+	cpc925_printk(KERN_INFO, "Link Config Ctrl		0x%08x\n",
+		      linkctrl);
+	cpc925_printk(KERN_INFO, "Error Enum and Ctrl		0x%08x\n",
+		      errctrl);
+	cpc925_printk(KERN_INFO, "Link Error			0x%08x\n",
+		      linkerr);
+
+	/* Clear by write 1 */
+	if (brgctrl & BRGCTRL_DETSERR)
+		__raw_writel(BRGCTRL_DETSERR,
+				dev_info->vbase + REG_BRGCTRL_OFFSET);
+
+	if (linkctrl & HT_LINKCTRL_DETECTED)
+		__raw_writel(HT_LINKCTRL_DETECTED,
+				dev_info->vbase + REG_LINKCTRL_OFFSET);
+
+	/* Initiate Secondary Bus Reset to clear the chain failure */
+	if (errctrl & ERRCTRL_CHN_FAL)
+		__raw_writel(BRGCTRL_SECBUSRESET,
+				dev_info->vbase + REG_BRGCTRL_OFFSET);
+
+	if (errctrl & ERRCTRL_RSP_ERR)
+		__raw_writel(ERRCTRL_RSP_ERR,
+				dev_info->vbase + REG_ERRCTRL_OFFSET);
+
+	if (linkerr & HT_LINKERR_DETECTED)
+		__raw_writel(HT_LINKERR_DETECTED,
+				dev_info->vbase + REG_LINKERR_OFFSET);
+
+	edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+}
+
+static struct cpc925_dev_info cpc925_devs[] = {
+	{
+	.ctl_name = CPC925_CPU_ERR_DEV,
+	.init = cpc925_cpu_init,
+	.exit = cpc925_cpu_exit,
+	.check = cpc925_cpu_check,
+	},
+	{
+	.ctl_name = CPC925_HT_LINK_DEV,
+	.init = cpc925_htlink_init,
+	.exit = cpc925_htlink_exit,
+	.check = cpc925_htlink_check,
+	},
+	{0}, /* Terminated by NULL */
+};
+
+/*
+ * Add CPU Err detection and HyperTransport Link Err detection
+ * as common "edac_device", they have no corresponding device
+ * nodes in the Open Firmware DTB and we have to add platform
+ * devices for them. Also, they will share the MMIO with that
+ * of memory controller.
+ */
+static void cpc925_add_edac_devices(void __iomem *vbase)
+{
+	struct cpc925_dev_info *dev_info;
+
+	if (!vbase) {
+		cpc925_printk(KERN_ERR, "MMIO not established yet\n");
+		return;
+	}
+
+	for (dev_info = &cpc925_devs[0]; dev_info->init; dev_info++) {
+		dev_info->vbase = vbase;
+		dev_info->pdev = platform_device_register_simple(
+					dev_info->ctl_name, 0, NULL, 0);
+		if (IS_ERR(dev_info->pdev)) {
+			cpc925_printk(KERN_ERR,
+				"Can't register platform device for %s\n",
+				dev_info->ctl_name);
+			continue;
+		}
+
+		/*
+		 * Don't have to allocate private structure but
+		 * make use of cpc925_devs[] instead.
+		 */
+		dev_info->edac_idx = edac_device_alloc_index();
+		dev_info->edac_dev =
+			edac_device_alloc_ctl_info(0, dev_info->ctl_name,
+				1, NULL, 0, 0, NULL, 0, dev_info->edac_idx);
+		if (!dev_info->edac_dev) {
+			cpc925_printk(KERN_ERR, "No memory for edac device\n");
+			goto err1;
+		}
+
+		dev_info->edac_dev->pvt_info = dev_info;
+		dev_info->edac_dev->dev = &dev_info->pdev->dev;
+		dev_info->edac_dev->ctl_name = dev_info->ctl_name;
+		dev_info->edac_dev->mod_name = CPC925_EDAC_MOD_STR;
+		dev_info->edac_dev->dev_name = dev_name(&dev_info->pdev->dev);
+
+		if (edac_op_state == EDAC_OPSTATE_POLL)
+			dev_info->edac_dev->edac_check = dev_info->check;
+
+		if (dev_info->init)
+			dev_info->init(dev_info);
+
+		if (edac_device_add_device(dev_info->edac_dev) > 0) {
+			cpc925_printk(KERN_ERR,
+				"Unable to add edac device for %s\n",
+				dev_info->ctl_name);
+			goto err2;
+		}
+
+		debugf0("%s: Successfully added edac device for %s\n",
+			__func__, dev_info->ctl_name);
+
+		continue;
+
+err2:
+		if (dev_info->exit)
+			dev_info->exit(dev_info);
+		edac_device_free_ctl_info(dev_info->edac_dev);
+err1:
+		platform_device_unregister(dev_info->pdev);
+	}
+}
+
+/*
+ * Delete the common "edac_device" for CPU Err Detection
+ * and HyperTransport Link Err Detection
+ */
+static void cpc925_del_edac_devices(void)
+{
+	struct cpc925_dev_info *dev_info;
+
+	for (dev_info = &cpc925_devs[0]; dev_info->init; dev_info++) {
+		if (dev_info->edac_dev) {
+			edac_device_del_device(dev_info->edac_dev->dev);
+			edac_device_free_ctl_info(dev_info->edac_dev);
+			platform_device_unregister(dev_info->pdev);
+		}
+
+		if (dev_info->exit)
+			dev_info->exit(dev_info);
+
+		debugf0("%s: Successfully deleted edac device for %s\n",
+			__func__, dev_info->ctl_name);
+	}
+}
+
+/* Convert current back-ground scrub rate into byte/sec bandwith */
+static int cpc925_get_sdram_scrub_rate(struct mem_ctl_info *mci, u32 *bw)
+{
+	struct cpc925_mc_pdata *pdata = mci->pvt_info;
+	u32 mscr;
+	u8 si;
+
+	mscr = __raw_readl(pdata->vbase + REG_MSCR_OFFSET);
+	si = (mscr & MSCR_SI_MASK) >> MSCR_SI_SHIFT;
+
+	debugf0("%s, Mem Scrub Ctrl Register 0x%x\n", __func__, mscr);
+
+	if (((mscr & MSCR_SCRUB_MOD_MASK) != MSCR_BACKGR_SCRUB) ||
+	    (si == 0)) {
+		cpc925_mc_printk(mci, KERN_INFO, "Scrub mode not enabled\n");
+		*bw = 0;
+	} else
+		*bw = CPC925_SCRUB_BLOCK_SIZE * 0xFA67 / si;
+
+	return 0;
+}
+
+/* Return 0 for single channel; 1 for dual channel */
+static int cpc925_mc_get_channels(void __iomem *vbase)
+{
+	int dual = 0;
+	u32 mbcr;
+
+	mbcr = __raw_readl(vbase + REG_MBCR_OFFSET);
+
+	/*
+	 * Dual channel only when 128-bit wide physical bus
+	 * and 128-bit configuration.
+	 */
+	if (((mbcr & MBCR_64BITCFG_MASK) == 0) &&
+	    ((mbcr & MBCR_64BITBUS_MASK) == 0))
+		dual = 1;
+
+	debugf0("%s: %s channel\n", __func__,
+		(dual > 0) ? "Dual" : "Single");
+
+	return dual;
+}
+
+static int __devinit cpc925_probe(struct platform_device *pdev)
+{
+	static int edac_mc_idx;
+	struct mem_ctl_info *mci;
+	void __iomem *vbase;
+	struct cpc925_mc_pdata *pdata;
+	struct resource *r;
+	int res = 0, nr_channels;
+
+	debugf0("%s: %s platform device found!\n", __func__, pdev->name);
+
+	if (!devres_open_group(&pdev->dev, cpc925_probe, GFP_KERNEL)) {
+		res = -ENOMEM;
+		goto out;
+	}
+
+	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!r) {
+		cpc925_printk(KERN_ERR, "Unable to get resource\n");
+		res = -ENOENT;
+		goto err1;
+	}
+
+	if (!devm_request_mem_region(&pdev->dev,
+				     r->start,
+				     r->end - r->start + 1,
+				     pdev->name)) {
+		cpc925_printk(KERN_ERR, "Unable to request mem region\n");
+		res = -EBUSY;
+		goto err1;
+	}
+
+	vbase = devm_ioremap(&pdev->dev, r->start, r->end - r->start + 1);
+	if (!vbase) {
+		cpc925_printk(KERN_ERR, "Unable to ioremap device\n");
+		res = -ENOMEM;
+		goto err2;
+	}
+
+	nr_channels = cpc925_mc_get_channels(vbase);
+	mci = edac_mc_alloc(sizeof(struct cpc925_mc_pdata),
+			CPC925_NR_CSROWS, nr_channels + 1, edac_mc_idx);
+	if (!mci) {
+		cpc925_printk(KERN_ERR, "No memory for mem_ctl_info\n");
+		res = -ENOMEM;
+		goto err2;
+	}
+
+	pdata = mci->pvt_info;
+	pdata->vbase = vbase;
+	pdata->edac_idx = edac_mc_idx++;
+	pdata->name = pdev->name;
+
+	mci->dev = &pdev->dev;
+	platform_set_drvdata(pdev, mci);
+	mci->dev_name = dev_name(&pdev->dev);
+	mci->mtype_cap = MEM_FLAG_RDDR | MEM_FLAG_DDR;
+	mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
+	mci->edac_cap = EDAC_FLAG_SECDED;
+	mci->mod_name = CPC925_EDAC_MOD_STR;
+	mci->mod_ver = CPC925_EDAC_REVISION;
+	mci->ctl_name = pdev->name;
+
+	if (edac_op_state == EDAC_OPSTATE_POLL)
+		mci->edac_check = cpc925_mc_check;
+
+	mci->ctl_page_to_phys = NULL;
+	mci->scrub_mode = SCRUB_SW_SRC;
+	mci->set_sdram_scrub_rate = NULL;
+	mci->get_sdram_scrub_rate = cpc925_get_sdram_scrub_rate;
+
+	cpc925_init_csrows(mci);
+
+	/* Setup memory controller registers */
+	cpc925_mc_init(mci);
+
+	if (edac_mc_add_mc(mci) > 0) {
+		cpc925_mc_printk(mci, KERN_ERR, "Failed edac_mc_add_mc()\n");
+		goto err3;
+	}
+
+	cpc925_add_edac_devices(vbase);
+
+	/* get this far and it's successful */
+	debugf0("%s: success\n", __func__);
+
+	res = 0;
+	goto out;
+
+err3:
+	cpc925_mc_exit(mci);
+	edac_mc_free(mci);
+err2:
+	devm_release_mem_region(&pdev->dev, r->start, r->end-r->start+1);
+err1:
+	devres_release_group(&pdev->dev, cpc925_probe);
+out:
+	return res;
+}
+
+static int cpc925_remove(struct platform_device *pdev)
+{
+	struct mem_ctl_info *mci = platform_get_drvdata(pdev);
+
+	/*
+	 * Delete common edac devices before edac mc, because
+	 * the former share the MMIO of the latter.
+	 */
+	cpc925_del_edac_devices();
+	cpc925_mc_exit(mci);
+
+	edac_mc_del_mc(&pdev->dev);
+	edac_mc_free(mci);
+
+	return 0;
+}
+
+static struct platform_driver cpc925_edac_driver = {
+	.probe = cpc925_probe,
+	.remove = cpc925_remove,
+	.driver = {
+		   .name = "cpc925_edac",
+	}
+};
+
+static int __init cpc925_edac_init(void)
+{
+	int ret = 0;
+
+	printk(KERN_INFO "IBM CPC925 EDAC driver " CPC925_EDAC_REVISION "\n");
+	printk(KERN_INFO "\t(c) 2008 Wind River Systems, Inc\n");
+
+	/* Only support POLL mode so far */
+	edac_op_state = EDAC_OPSTATE_POLL;
+
+	ret = platform_driver_register(&cpc925_edac_driver);
+	if (ret) {
+		printk(KERN_WARNING "Failed to register %s\n",
+			CPC925_EDAC_MOD_STR);
+	}
+
+	return ret;
+}
+
+static void __exit cpc925_edac_exit(void)
+{
+	platform_driver_unregister(&cpc925_edac_driver);
+}
+
+module_init(cpc925_edac_init);
+module_exit(cpc925_edac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cao Qingtao <qingtao.cao@windriver.com>");
+MODULE_DESCRIPTION("IBM CPC925 Bridge and MC EDAC kernel module");
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index 48d3b140983..3493c6bdb82 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -841,6 +841,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
 				int inst_nr, int block_nr, const char *msg);
 extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
 				int inst_nr, int block_nr, const char *msg);
+extern int edac_device_alloc_index(void);
 
 /*
  * edac_pci APIs
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index a7d2c717d03..b02a6a69a8f 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -490,6 +490,20 @@ void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
 	mutex_unlock(&device_ctls_mutex);
 }
 
+/*
+ * edac_device_alloc_index: Allocate a unique device index number
+ *
+ * Return:
+ *	allocated index number
+ */
+int edac_device_alloc_index(void)
+{
+	static atomic_t device_indexes = ATOMIC_INIT(0);
+
+	return atomic_inc_return(&device_indexes) - 1;
+}
+EXPORT_SYMBOL_GPL(edac_device_alloc_index);
+
 /**
  * edac_device_add_device: Insert the 'edac_dev' structure into the
  * edac_device global list and create sysfs entries associated with
diff --git a/drivers/gpio/max7301.c b/drivers/gpio/max7301.c
index 3e7f4e06386..7b82eaae262 100644
--- a/drivers/gpio/max7301.c
+++ b/drivers/gpio/max7301.c
@@ -287,7 +287,7 @@ exit_destroy:
 	return ret;
 }
 
-static int max7301_remove(struct spi_device *spi)
+static int __devexit max7301_remove(struct spi_device *spi)
 {
 	struct max7301 *ts;
 	int ret;
diff --git a/drivers/gpio/pca953x.c b/drivers/gpio/pca953x.c
index 8dc0164bd51..cdb6574d25a 100644
--- a/drivers/gpio/pca953x.c
+++ b/drivers/gpio/pca953x.c
@@ -15,6 +15,10 @@
 #include <linux/init.h>
 #include <linux/i2c.h>
 #include <linux/i2c/pca953x.h>
+#ifdef CONFIG_OF_GPIO
+#include <linux/of_platform.h>
+#include <linux/of_gpio.h>
+#endif
 
 #include <asm/gpio.h>
 
@@ -32,6 +36,7 @@ static const struct i2c_device_id pca953x_id[] = {
 	{ "pca9539", 16, },
 	{ "pca9554", 8, },
 	{ "pca9555", 16, },
+	{ "pca9556", 8, },
 	{ "pca9557", 8, },
 
 	{ "max7310", 8, },
@@ -49,7 +54,9 @@ struct pca953x_chip {
 	uint16_t reg_direction;
 
 	struct i2c_client *client;
+	struct pca953x_platform_data *dyn_pdata;
 	struct gpio_chip gpio_chip;
+	char **names;
 };
 
 static int pca953x_write_reg(struct pca953x_chip *chip, int reg, uint16_t val)
@@ -192,8 +199,57 @@ static void pca953x_setup_gpio(struct pca953x_chip *chip, int gpios)
 	gc->label = chip->client->name;
 	gc->dev = &chip->client->dev;
 	gc->owner = THIS_MODULE;
+	gc->names = chip->names;
 }
 
+/*
+ * Handlers for alternative sources of platform_data
+ */
+#ifdef CONFIG_OF_GPIO
+/*
+ * Translate OpenFirmware node properties into platform_data
+ */
+static struct pca953x_platform_data *
+pca953x_get_alt_pdata(struct i2c_client *client)
+{
+	struct pca953x_platform_data *pdata;
+	struct device_node *node;
+	const uint16_t *val;
+
+	node = dev_archdata_get_node(&client->dev.archdata);
+	if (node == NULL)
+		return NULL;
+
+	pdata = kzalloc(sizeof(struct pca953x_platform_data), GFP_KERNEL);
+	if (pdata == NULL) {
+		dev_err(&client->dev, "Unable to allocate platform_data\n");
+		return NULL;
+	}
+
+	pdata->gpio_base = -1;
+	val = of_get_property(node, "linux,gpio-base", NULL);
+	if (val) {
+		if (*val < 0)
+			dev_warn(&client->dev,
+				 "invalid gpio-base in device tree\n");
+		else
+			pdata->gpio_base = *val;
+	}
+
+	val = of_get_property(node, "polarity", NULL);
+	if (val)
+		pdata->invert = *val;
+
+	return pdata;
+}
+#else
+static struct pca953x_platform_data *
+pca953x_get_alt_pdata(struct i2c_client *client)
+{
+	return NULL;
+}
+#endif
+
 static int __devinit pca953x_probe(struct i2c_client *client,
 				   const struct i2c_device_id *id)
 {
@@ -201,20 +257,32 @@ static int __devinit pca953x_probe(struct i2c_client *client,
 	struct pca953x_chip *chip;
 	int ret;
 
+	chip = kzalloc(sizeof(struct pca953x_chip), GFP_KERNEL);
+	if (chip == NULL)
+		return -ENOMEM;
+
 	pdata = client->dev.platform_data;
 	if (pdata == NULL) {
-		dev_dbg(&client->dev, "no platform data\n");
-		return -EINVAL;
+		pdata = pca953x_get_alt_pdata(client);
+		/*
+		 * Unlike normal platform_data, this is allocated
+		 * dynamically and must be freed in the driver
+		 */
+		chip->dyn_pdata = pdata;
 	}
 
-	chip = kzalloc(sizeof(struct pca953x_chip), GFP_KERNEL);
-	if (chip == NULL)
-		return -ENOMEM;
+	if (pdata == NULL) {
+		dev_dbg(&client->dev, "no platform data\n");
+		ret = -EINVAL;
+		goto out_failed;
+	}
 
 	chip->client = client;
 
 	chip->gpio_start = pdata->gpio_base;
 
+	chip->names = pdata->names;
+
 	/* initialize cached registers from their original values.
 	 * we can't share this chip with another i2c master.
 	 */
@@ -249,6 +317,7 @@ static int __devinit pca953x_probe(struct i2c_client *client,
 	return 0;
 
 out_failed:
+	kfree(chip->dyn_pdata);
 	kfree(chip);
 	return ret;
 }
@@ -276,6 +345,7 @@ static int pca953x_remove(struct i2c_client *client)
 		return ret;
 	}
 
+	kfree(chip->dyn_pdata);
 	kfree(chip);
 	return 0;
 }
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 8695809b24b..87d88dbb667 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 }
 
 
-static int reconfig(mddev_t *mddev, int layout, int chunk_size)
+static int reshape(mddev_t *mddev)
 {
-	int mode = layout & ModeMask;
-	int count = layout >> ModeShift;
+	int mode = mddev->new_layout & ModeMask;
+	int count = mddev->new_layout >> ModeShift;
 	conf_t *conf = mddev->private;
 
-	if (chunk_size != -1)
-		return -EINVAL;
+	if (mddev->new_layout < 0)
+		return 0;
 
 	/* new layout */
 	if (mode == ClearFaults)
@@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
 		atomic_set(&conf->counters[mode], count);
 	} else
 		return -EINVAL;
+	mddev->new_layout = -1;
 	mddev->layout = -1; /* makes sure further changes come through */
 	return 0;
 }
@@ -298,8 +299,12 @@ static int run(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 	int i;
+	conf_t *conf;
+
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
 
-	conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);
+	conf = kmalloc(sizeof(*conf), GFP_KERNEL);
 	if (!conf)
 		return -ENOMEM;
 
@@ -315,7 +320,7 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
-	reconfig(mddev, mddev->layout, -1);
+	reshape(mddev);
 
 	return 0;
 }
@@ -338,7 +343,7 @@ static struct mdk_personality faulty_personality =
 	.run		= run,
 	.stop		= stop,
 	.status		= status,
-	.reconfig	= reconfig,
+	.check_reshape	= reshape,
 	.size		= faulty_size,
 };
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 64f1f3e046e..15c8b7b25a9 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -27,19 +27,27 @@
  */
 static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 {
-	dev_info_t *hash;
-	linear_conf_t *conf = mddev_to_conf(mddev);
-	sector_t idx = sector >> conf->sector_shift;
+	int lo, mid, hi;
+	linear_conf_t *conf;
+
+	lo = 0;
+	hi = mddev->raid_disks - 1;
+	conf = rcu_dereference(mddev->private);
 
 	/*
-	 * sector_div(a,b) returns the remainer and sets a to a/b
+	 * Binary Search
 	 */
-	(void)sector_div(idx, conf->spacing);
-	hash = conf->hash_table[idx];
 
-	while (sector >= hash->num_sectors + hash->start_sector)
-		hash++;
-	return hash;
+	while (hi > lo) {
+
+		mid = (hi + lo) / 2;
+		if (sector < conf->disks[mid].end_sector)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	return conf->disks + lo;
 }
 
 /**
@@ -59,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 
+	rcu_read_lock();
 	dev0 = which_dev(mddev, sector);
-	maxsectors = dev0->num_sectors - (sector - dev0->start_sector);
+	maxsectors = dev0->end_sector - sector;
+	rcu_read_unlock();
 
 	if (maxsectors < bio_sectors)
 		maxsectors = 0;
@@ -79,46 +89,57 @@ static int linear_mergeable_bvec(struct request_queue *q,
 static void linear_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf;
 	int i;
 
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+
 	for (i=0; i < mddev->raid_disks; i++) {
 		struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
 		blk_unplug(r_queue);
 	}
+	rcu_read_unlock();
 }
 
 static int linear_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf;
 	int i, ret = 0;
 
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
 		ret |= bdi_congested(&q->backing_dev_info, bits);
 	}
+
+	rcu_read_unlock();
 	return ret;
 }
 
 static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf;
+	sector_t array_sectors;
 
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
 	WARN_ONCE(sectors || raid_disks,
 		  "%s does not support generic reshape\n", __func__);
+	array_sectors = conf->array_sectors;
+	rcu_read_unlock();
 
-	return conf->array_sectors;
+	return array_sectors;
 }
 
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
-	dev_info_t **table;
 	mdk_rdev_t *rdev;
-	int i, nb_zone, cnt;
-	sector_t min_sectors;
-	sector_t curr_sector;
+	int i, cnt;
 
 	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
 			GFP_KERNEL);
@@ -131,6 +152,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
+		sector_t sectors;
 
 		if (j < 0 || j >= raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
@@ -138,6 +160,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		}
 
 		disk->rdev = rdev;
+		if (mddev->chunk_sectors) {
+			sectors = rdev->sectors;
+			sector_div(sectors, mddev->chunk_sectors);
+			rdev->sectors = sectors * mddev->chunk_sectors;
+		}
 
 		blk_queue_stack_limits(mddev->queue,
 				       rdev->bdev->bd_disk->queue);
@@ -149,102 +176,24 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		disk->num_sectors = rdev->sectors;
 		conf->array_sectors += rdev->sectors;
-
 		cnt++;
+
 	}
 	if (cnt != raid_disks) {
 		printk("linear: not enough drives present. Aborting!\n");
 		goto out;
 	}
 
-	min_sectors = conf->array_sectors;
-	sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *));
-	if (min_sectors == 0)
-		min_sectors = 1;
-
-	/* min_sectors is the minimum spacing that will fit the hash
-	 * table in one PAGE.  This may be much smaller than needed.
-	 * We find the smallest non-terminal set of consecutive devices
-	 * that is larger than min_sectors and use the size of that as
-	 * the actual spacing
-	 */
-	conf->spacing = conf->array_sectors;
-	for (i=0; i < cnt-1 ; i++) {
-		sector_t tmp = 0;
-		int j;
-		for (j = i; j < cnt - 1 && tmp < min_sectors; j++)
-			tmp += conf->disks[j].num_sectors;
-		if (tmp >= min_sectors && tmp < conf->spacing)
-			conf->spacing = tmp;
-	}
-
-	/* spacing may be too large for sector_div to work with,
-	 * so we might need to pre-shift
-	 */
-	conf->sector_shift = 0;
-	if (sizeof(sector_t) > sizeof(u32)) {
-		sector_t space = conf->spacing;
-		while (space > (sector_t)(~(u32)0)) {
-			space >>= 1;
-			conf->sector_shift++;
-		}
-	}
 	/*
-	 * This code was restructured to work around a gcc-2.95.3 internal
-	 * compiler error.  Alter it with care.
+	 * Here we calculate the device offsets.
 	 */
-	{
-		sector_t sz;
-		unsigned round;
-		unsigned long base;
-
-		sz = conf->array_sectors >> conf->sector_shift;
-		sz += 1; /* force round-up */
-		base = conf->spacing >> conf->sector_shift;
-		round = sector_div(sz, base);
-		nb_zone = sz + (round ? 1 : 0);
-	}
-	BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
-
-	conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
-					GFP_KERNEL);
-	if (!conf->hash_table)
-		goto out;
+	conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
 
-	/*
-	 * Here we generate the linear hash table
-	 * First calculate the device offsets.
-	 */
-	conf->disks[0].start_sector = 0;
 	for (i = 1; i < raid_disks; i++)
-		conf->disks[i].start_sector =
-			conf->disks[i-1].start_sector +
-			conf->disks[i-1].num_sectors;
-
-	table = conf->hash_table;
-	i = 0;
-	for (curr_sector = 0;
-	     curr_sector < conf->array_sectors;
-	     curr_sector += conf->spacing) {
-
-		while (i < raid_disks-1 &&
-		       curr_sector >= conf->disks[i+1].start_sector)
-			i++;
-
-		*table ++ = conf->disks + i;
-	}
-
-	if (conf->sector_shift) {
-		conf->spacing >>= conf->sector_shift;
-		/* round spacing up so that when we divide by it,
-		 * we err on the side of "too-low", which is safest.
-		 */
-		conf->spacing++;
-	}
-
-	BUG_ON(table - conf->hash_table > nb_zone);
+		conf->disks[i].end_sector =
+			conf->disks[i-1].end_sector +
+			conf->disks[i].rdev->sectors;
 
 	return conf;
 
@@ -257,6 +206,8 @@ static int linear_run (mddev_t *mddev)
 {
 	linear_conf_t *conf;
 
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 	conf = linear_conf(mddev, mddev->raid_disks);
 
@@ -272,6 +223,12 @@ static int linear_run (mddev_t *mddev)
 	return 0;
 }
 
+static void free_conf(struct rcu_head *head)
+{
+	linear_conf_t *conf = container_of(head, linear_conf_t, rcu);
+	kfree(conf);
+}
+
 static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	/* Adding a drive to a linear array allows the array to grow.
@@ -282,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	 * The current one is never freed until the array is stopped.
 	 * This avoids races.
 	 */
-	linear_conf_t *newconf;
+	linear_conf_t *newconf, *oldconf;
 
 	if (rdev->saved_raid_disk != mddev->raid_disks)
 		return -EINVAL;
@@ -294,25 +251,29 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!newconf)
 		return -ENOMEM;
 
-	newconf->prev = mddev_to_conf(mddev);
-	mddev->private = newconf;
+	oldconf = rcu_dereference(mddev->private);
 	mddev->raid_disks++;
+	rcu_assign_pointer(mddev->private, newconf);
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
+	call_rcu(&oldconf->rcu, free_conf);
 	return 0;
 }
 
 static int linear_stop (mddev_t *mddev)
 {
-	linear_conf_t *conf = mddev_to_conf(mddev);
-  
+	linear_conf_t *conf = mddev->private;
+
+	/*
+	 * We do not require rcu protection here since
+	 * we hold reconfig_mutex for both linear_add and
+	 * linear_stop, so they cannot race.
+	 * We should make sure any old 'conf's are properly
+	 * freed though.
+	 */
+	rcu_barrier();
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-	do {
-		linear_conf_t *t = conf->prev;
-		kfree(conf->hash_table);
-		kfree(conf);
-		conf = t;
-	} while (conf);
+	kfree(conf);
 
 	return 0;
 }
@@ -322,6 +283,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	mddev_t *mddev = q->queuedata;
 	dev_info_t *tmp_dev;
+	sector_t start_sector;
 	int cpu;
 
 	if (unlikely(bio_barrier(bio))) {
@@ -335,33 +297,36 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 		      bio_sectors(bio));
 	part_stat_unlock();
 
+	rcu_read_lock();
 	tmp_dev = which_dev(mddev, bio->bi_sector);
-    
-	if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors +
-					tmp_dev->start_sector)
-		     || (bio->bi_sector <
-			 tmp_dev->start_sector))) {
+	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+
+
+	if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
+		     || (bio->bi_sector < start_sector))) {
 		char b[BDEVNAME_SIZE];
 
 		printk("linear_make_request: Sector %llu out of bounds on "
 			"dev %s: %llu sectors, offset %llu\n",
 			(unsigned long long)bio->bi_sector,
 			bdevname(tmp_dev->rdev->bdev, b),
-			(unsigned long long)tmp_dev->num_sectors,
-			(unsigned long long)tmp_dev->start_sector);
+			(unsigned long long)tmp_dev->rdev->sectors,
+			(unsigned long long)start_sector);
+		rcu_read_unlock();
 		bio_io_error(bio);
 		return 0;
 	}
 	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
-		     tmp_dev->start_sector + tmp_dev->num_sectors)) {
+		     tmp_dev->end_sector)) {
 		/* This bio crosses a device boundary, so we have to
 		 * split it.
 		 */
 		struct bio_pair *bp;
+		sector_t end_sector = tmp_dev->end_sector;
+
+		rcu_read_unlock();
 
-		bp = bio_split(bio,
-			       tmp_dev->start_sector + tmp_dev->num_sectors
-			       - bio->bi_sector);
+		bp = bio_split(bio, end_sector - bio->bi_sector);
 
 		if (linear_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
@@ -372,8 +337,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	}
 		    
 	bio->bi_bdev = tmp_dev->rdev->bdev;
-	bio->bi_sector = bio->bi_sector - tmp_dev->start_sector
+	bio->bi_sector = bio->bi_sector - start_sector
 		+ tmp_dev->rdev->data_offset;
+	rcu_read_unlock();
 
 	return 1;
 }
@@ -381,7 +347,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 static void linear_status (struct seq_file *seq, mddev_t *mddev)
 {
 
-	seq_printf(seq, " %dk rounding", mddev->chunk_size/1024);
+	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
 }
 
 
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index bf8179587f9..0ce29b61605 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -3,27 +3,19 @@
 
 struct dev_info {
 	mdk_rdev_t	*rdev;
-	sector_t	num_sectors;
-	sector_t	start_sector;
+	sector_t	end_sector;
 };
 
 typedef struct dev_info dev_info_t;
 
 struct linear_private_data
 {
-	struct linear_private_data *prev;	/* earlier version */
-	dev_info_t		**hash_table;
-	sector_t		spacing;
 	sector_t		array_sectors;
-	int			sector_shift;	/* shift before dividing
-						 * by spacing
-						 */
 	dev_info_t		disks[0];
+	struct rcu_head		rcu;
 };
 
 
 typedef struct linear_private_data linear_conf_t;
 
-#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
-
 #endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 20f6ac33834..09be637d52c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -440,15 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
-static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
-{
-	sector_t num_sectors = rdev->sb_start;
-
-	if (chunk_size)
-		num_sectors &= ~((sector_t)chunk_size/512 - 1);
-	return num_sectors;
-}
-
 static int alloc_disk_sb(mdk_rdev_t * rdev)
 {
 	if (rdev->sb_page)
@@ -745,6 +736,24 @@ struct super_type  {
 };
 
 /*
+ * Check that the given mddev has no bitmap.
+ *
+ * This function is called from the run method of all personalities that do not
+ * support bitmaps. It prints an error message and returns non-zero if mddev
+ * has a bitmap. Otherwise, it returns 0.
+ *
+ */
+int md_check_no_bitmap(mddev_t *mddev)
+{
+	if (!mddev->bitmap_file && !mddev->bitmap_offset)
+		return 0;
+	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
+		mdname(mddev), mddev->pers->name);
+	return 1;
+}
+EXPORT_SYMBOL(md_check_no_bitmap);
+
+/*
  * load_super for 0.90.0 
  */
 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
@@ -797,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	rdev->data_offset = 0;
 	rdev->sb_size = MD_SB_BYTES;
 
-	if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
-		if (sb->level != 1 && sb->level != 4
-		    && sb->level != 5 && sb->level != 6
-		    && sb->level != 10) {
-			/* FIXME use a better test */
-			printk(KERN_WARNING
-			       "md: bitmaps not supported for this level.\n");
-			goto abort;
-		}
-	}
-
 	if (sb->level == LEVEL_MULTIPATH)
 		rdev->desc_nr = -1;
 	else
@@ -836,7 +834,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else 
 			ret = 0;
 	}
-	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
+	rdev->sectors = rdev->sb_start;
 
 	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
@@ -866,7 +864,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->minor_version = sb->minor_version;
 		mddev->patch_version = sb->patch_version;
 		mddev->external = 0;
-		mddev->chunk_size = sb->chunk_size;
+		mddev->chunk_sectors = sb->chunk_size >> 9;
 		mddev->ctime = sb->ctime;
 		mddev->utime = sb->utime;
 		mddev->level = sb->level;
@@ -883,13 +881,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			mddev->delta_disks = sb->delta_disks;
 			mddev->new_level = sb->new_level;
 			mddev->new_layout = sb->new_layout;
-			mddev->new_chunk = sb->new_chunk;
+			mddev->new_chunk_sectors = sb->new_chunk >> 9;
 		} else {
 			mddev->reshape_position = MaxSector;
 			mddev->delta_disks = 0;
 			mddev->new_level = mddev->level;
 			mddev->new_layout = mddev->layout;
-			mddev->new_chunk = mddev->chunk_size;
+			mddev->new_chunk_sectors = mddev->chunk_sectors;
 		}
 
 		if (sb->state & (1<<MD_SB_CLEAN))
@@ -1004,7 +1002,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->new_level = mddev->new_level;
 		sb->delta_disks = mddev->delta_disks;
 		sb->new_layout = mddev->new_layout;
-		sb->new_chunk = mddev->new_chunk;
+		sb->new_chunk = mddev->new_chunk_sectors << 9;
 	}
 	mddev->minor_version = sb->minor_version;
 	if (mddev->in_sync)
@@ -1018,7 +1016,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->recovery_cp = 0;
 
 	sb->layout = mddev->layout;
-	sb->chunk_size = mddev->chunk_size;
+	sb->chunk_size = mddev->chunk_sectors << 9;
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL)
 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
@@ -1185,17 +1183,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		       bdevname(rdev->bdev,b));
 		return -EINVAL;
 	}
-	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
-		if (sb->level != cpu_to_le32(1) &&
-		    sb->level != cpu_to_le32(4) &&
-		    sb->level != cpu_to_le32(5) &&
-		    sb->level != cpu_to_le32(6) &&
-		    sb->level != cpu_to_le32(10)) {
-			printk(KERN_WARNING
-			       "md: bitmaps not supported for this level.\n");
-			return -EINVAL;
-		}
-	}
 
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
@@ -1248,9 +1235,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
 	rdev->sectors = le64_to_cpu(sb->data_size);
-	if (le32_to_cpu(sb->chunksize))
-		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
-
 	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
@@ -1271,7 +1255,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->major_version = 1;
 		mddev->patch_version = 0;
 		mddev->external = 0;
-		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
 		mddev->level = le32_to_cpu(sb->level);
@@ -1297,13 +1281,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
 			mddev->new_level = le32_to_cpu(sb->new_level);
 			mddev->new_layout = le32_to_cpu(sb->new_layout);
-			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
 		} else {
 			mddev->reshape_position = MaxSector;
 			mddev->delta_disks = 0;
 			mddev->new_level = mddev->level;
 			mddev->new_layout = mddev->layout;
-			mddev->new_chunk = mddev->chunk_size;
+			mddev->new_chunk_sectors = mddev->chunk_sectors;
 		}
 
 	} else if (mddev->pers == NULL) {
@@ -1375,7 +1359,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
 	sb->size = cpu_to_le64(mddev->dev_sectors);
-	sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9);
+	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 
@@ -1402,7 +1386,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->new_layout = cpu_to_le32(mddev->new_layout);
 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
 		sb->new_level = cpu_to_le32(mddev->new_level);
-		sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
 	}
 
 	max_dev = 0;
@@ -1897,6 +1881,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
 	int sync_req;
 	int nospares = 0;
 
+	mddev->utime = get_seconds();
 	if (mddev->external)
 		return;
 repeat:
@@ -1926,7 +1911,6 @@ repeat:
 		nospares = 0;
 
 	sync_req = mddev->in_sync;
-	mddev->utime = get_seconds();
 
 	/* If this is just a dirty<->clean transition, and the array is clean
 	 * and 'events' is odd, we can roll back to the previous clean state */
@@ -2597,15 +2581,6 @@ static void analyze_sbs(mddev_t * mddev)
 			clear_bit(In_sync, &rdev->flags);
 		}
 	}
-
-
-
-	if (mddev->recovery_cp != MaxSector &&
-	    mddev->level >= 1)
-		printk(KERN_ERR "md: %s: raid array is not clean"
-		       " -- starting background reconstruction\n",
-		       mdname(mddev));
-
 }
 
 static void md_safemode_timeout(unsigned long data);
@@ -2746,7 +2721,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	if (IS_ERR(priv)) {
 		mddev->new_level = mddev->level;
 		mddev->new_layout = mddev->layout;
-		mddev->new_chunk = mddev->chunk_size;
+		mddev->new_chunk_sectors = mddev->chunk_sectors;
 		mddev->raid_disks -= mddev->delta_disks;
 		mddev->delta_disks = 0;
 		module_put(pers->owner);
@@ -2764,7 +2739,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 	mddev->level = mddev->new_level;
 	mddev->layout = mddev->new_layout;
-	mddev->chunk_size = mddev->new_chunk;
+	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
 	pers->run(mddev);
 	mddev_resume(mddev);
@@ -2800,11 +2775,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 
 	if (mddev->pers) {
 		int err;
-		if (mddev->pers->reconfig == NULL)
+		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
-		err = mddev->pers->reconfig(mddev, n, -1);
-		if (err)
+		mddev->new_layout = n;
+		err = mddev->pers->check_reshape(mddev);
+		if (err) {
+			mddev->new_layout = mddev->layout;
 			return err;
+		}
 	} else {
 		mddev->new_layout = n;
 		if (mddev->reshape_position == MaxSector)
@@ -2857,10 +2835,11 @@ static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
 {
 	if (mddev->reshape_position != MaxSector &&
-	    mddev->chunk_size != mddev->new_chunk)
-		return sprintf(page, "%d (%d)\n", mddev->new_chunk,
-			       mddev->chunk_size);
-	return sprintf(page, "%d\n", mddev->chunk_size);
+	    mddev->chunk_sectors != mddev->new_chunk_sectors)
+		return sprintf(page, "%d (%d)\n",
+			       mddev->new_chunk_sectors << 9,
+			       mddev->chunk_sectors << 9);
+	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
 }
 
 static ssize_t
@@ -2874,15 +2853,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 
 	if (mddev->pers) {
 		int err;
-		if (mddev->pers->reconfig == NULL)
+		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
-		err = mddev->pers->reconfig(mddev, -1, n);
-		if (err)
+		mddev->new_chunk_sectors = n >> 9;
+		err = mddev->pers->check_reshape(mddev);
+		if (err) {
+			mddev->new_chunk_sectors = mddev->chunk_sectors;
 			return err;
+		}
 	} else {
-		mddev->new_chunk = n;
+		mddev->new_chunk_sectors = n >> 9;
 		if (mddev->reshape_position == MaxSector)
-			mddev->chunk_size = n;
+			mddev->chunk_sectors = n >> 9;
 	}
 	return len;
 }
@@ -3527,8 +3509,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EBUSY;
 
 	/* Must be a multiple of chunk_size */
-	if (mddev->chunk_size) {
-		if (min & (sector_t)((mddev->chunk_size>>9)-1))
+	if (mddev->chunk_sectors) {
+		sector_t temp = min;
+		if (sector_div(temp, mddev->chunk_sectors))
 			return -EINVAL;
 	}
 	mddev->resync_min = min;
@@ -3564,8 +3547,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 			return -EBUSY;
 
 		/* Must be a multiple of chunk_size */
-		if (mddev->chunk_size) {
-			if (max & (sector_t)((mddev->chunk_size>>9)-1))
+		if (mddev->chunk_sectors) {
+			sector_t temp = max;
+			if (sector_div(temp, mddev->chunk_sectors))
 				return -EINVAL;
 		}
 		mddev->resync_max = max;
@@ -3656,7 +3640,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->delta_disks = 0;
 	mddev->new_level = mddev->level;
 	mddev->new_layout = mddev->layout;
-	mddev->new_chunk = mddev->chunk_size;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	return len;
 }
 
@@ -3976,11 +3960,9 @@ static int start_dirty_degraded;
 static int do_md_run(mddev_t * mddev)
 {
 	int err;
-	int chunk_size;
 	mdk_rdev_t *rdev;
 	struct gendisk *disk;
 	struct mdk_personality *pers;
-	char b[BDEVNAME_SIZE];
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -3998,38 +3980,6 @@ static int do_md_run(mddev_t * mddev)
 		analyze_sbs(mddev);
 	}
 
-	chunk_size = mddev->chunk_size;
-
-	if (chunk_size) {
-		if (chunk_size > MAX_CHUNK_SIZE) {
-			printk(KERN_ERR "too big chunk_size: %d > %d\n",
-				chunk_size, MAX_CHUNK_SIZE);
-			return -EINVAL;
-		}
-		/*
-		 * chunk-size has to be a power of 2
-		 */
-		if ( (1 << ffz(~chunk_size)) != chunk_size) {
-			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
-			return -EINVAL;
-		}
-
-		/* devices must have minimum size of one chunk */
-		list_for_each_entry(rdev, &mddev->disks, same_set) {
-			if (test_bit(Faulty, &rdev->flags))
-				continue;
-			if (rdev->sectors < chunk_size / 512) {
-				printk(KERN_WARNING
-					"md: Dev %s smaller than chunk_size:"
-					" %llu < %d\n",
-					bdevname(rdev->bdev,b),
-					(unsigned long long)rdev->sectors,
-					chunk_size / 512);
-				return -EINVAL;
-			}
-		}
-	}
-
 	if (mddev->level != LEVEL_NONE)
 		request_module("md-level-%d", mddev->level);
 	else if (mddev->clevel[0])
@@ -4405,7 +4355,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->flags = 0;
 		mddev->ro = 0;
 		mddev->metadata_type[0] = 0;
-		mddev->chunk_size = 0;
+		mddev->chunk_sectors = 0;
 		mddev->ctime = mddev->utime = 0;
 		mddev->layout = 0;
 		mddev->max_disks = 0;
@@ -4413,7 +4363,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->delta_disks = 0;
 		mddev->new_level = LEVEL_NONE;
 		mddev->new_layout = 0;
-		mddev->new_chunk = 0;
+		mddev->new_chunk_sectors = 0;
 		mddev->curr_resync = 0;
 		mddev->resync_mismatches = 0;
 		mddev->suspend_lo = mddev->suspend_hi = 0;
@@ -4618,7 +4568,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.spare_disks   = spare;
 
 	info.layout        = mddev->layout;
-	info.chunk_size    = mddev->chunk_size;
+	info.chunk_size    = mddev->chunk_sectors << 9;
 
 	if (copy_to_user(arg, &info, sizeof(info)))
 		return -EFAULT;
@@ -4843,7 +4793,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else 
 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-		rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
+		rdev->sectors = rdev->sb_start;
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4913,7 +4863,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	else
 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
+	rdev->sectors = rdev->sb_start;
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 
@@ -5062,7 +5012,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	mddev->external	     = 0;
 
 	mddev->layout        = info->layout;
-	mddev->chunk_size    = info->chunk_size;
+	mddev->chunk_sectors = info->chunk_size >> 9;
 
 	mddev->max_disks     = MD_SB_DISKS;
 
@@ -5081,7 +5031,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	get_random_bytes(mddev->uuid, 16);
 
 	mddev->new_level = mddev->level;
-	mddev->new_chunk = mddev->chunk_size;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->new_layout = mddev->layout;
 	mddev->delta_disks = 0;
 
@@ -5191,7 +5141,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 	    mddev->level         != info->level         ||
 /*	    mddev->layout        != info->layout        || */
 	    !mddev->persistent	 != info->not_persistent||
-	    mddev->chunk_size    != info->chunk_size    ||
+	    mddev->chunk_sectors != info->chunk_size >> 9 ||
 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
 	    ((state^info->state) & 0xfffffe00)
 		)
@@ -5215,10 +5165,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		 * we don't need to do anything at the md level, the
 		 * personality will take care of it all.
 		 */
-		if (mddev->pers->reconfig == NULL)
+		if (mddev->pers->check_reshape == NULL)
 			return -EINVAL;
-		else
-			return mddev->pers->reconfig(mddev, info->layout, -1);
+		else {
+			mddev->new_layout = info->layout;
+			rv = mddev->pers->check_reshape(mddev);
+			if (rv)
+				mddev->new_layout = mddev->layout;
+			return rv;
+		}
 	}
 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
@@ -6717,7 +6672,8 @@ void md_check_recovery(mddev_t *mddev)
 		 */
 
 		if (mddev->reshape_position != MaxSector) {
-			if (mddev->pers->check_reshape(mddev) != 0)
+			if (mddev->pers->check_reshape == NULL ||
+			    mddev->pers->check_reshape(mddev) != 0)
 				/* Cannot proceed */
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8227ab909d4..9430a110db9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -30,13 +30,6 @@ typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
 /*
- * options passed in raidrun:
- */
-
-/* Currently this must fit in an 'int' */
-#define MAX_CHUNK_SIZE (1<<30)
-
-/*
  * MD's 'extended' device
  */
 struct mdk_rdev_s
@@ -145,7 +138,7 @@ struct mddev_s
 	int 				external;	/* metadata is
 							 * managed externally */
 	char				metadata_type[17]; /* externally set*/
-	int				chunk_size;
+	int				chunk_sectors;
 	time_t				ctime, utime;
 	int				level, layout;
 	char				clevel[16];
@@ -166,7 +159,8 @@ struct mddev_s
 	 * If reshape_position is MaxSector, then no reshape is happening (yet).
 	 */
 	sector_t			reshape_position;
-	int				delta_disks, new_level, new_layout, new_chunk;
+	int				delta_disks, new_level, new_layout;
+	int				new_chunk_sectors;
 
 	struct mdk_thread_s		*thread;	/* management thread */
 	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
@@ -325,7 +319,6 @@ struct mdk_personality
 	int (*check_reshape) (mddev_t *mddev);
 	int (*start_reshape) (mddev_t *mddev);
 	void (*finish_reshape) (mddev_t *mddev);
-	int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
 	/* quiesce moves between quiescence states
 	 * 0 - fully active
 	 * 1 - no new requests allowed
@@ -437,5 +430,6 @@ extern void md_new_event(mddev_t *mddev);
 extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
+extern int md_check_no_bitmap(mddev_t *mddev);
 
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4ee31aa13c4..cbe368fa659 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
 {
 	unsigned long flags;
 	mddev_t *mddev = mp_bh->mddev;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&mp_bh->retry_list, &conf->retry_list);
@@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
 static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 {
 	struct bio *bio = mp_bh->master_bio;
-	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
+	multipath_conf_t *conf = mp_bh->mddev->private;
 
 	bio_endio(bio, err);
 	mempool_free(mp_bh, conf->pool);
@@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
-	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
+	multipath_conf_t *conf = mp_bh->mddev->private;
 	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
 
 	if (uptodate)
@@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q)
 static int multipath_make_request (struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 	const int rw = bio_data_dir(bio);
@@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 
 static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	int i;
 	
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
@@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 static int multipath_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	rcu_read_lock();
@@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits)
  */
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 
 	if (conf->working_disks <= 1) {
 		/*
@@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev)
 	struct multipath_bh *mp_bh;
 	struct bio *bio;
 	unsigned long flags;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 
 	md_check_recovery(mddev);
@@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev)
 	struct multipath_info *disk;
 	mdk_rdev_t *rdev;
 
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+
 	if (mddev->level != LEVEL_MULTIPATH) {
 		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
 		       mdname(mddev), mddev->level);
@@ -531,7 +534,7 @@ out:
 
 static int multipath_stop (mddev_t *mddev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 6fa70b400cd..d1c2a8d7839 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -19,12 +19,6 @@ struct multipath_private_data {
 typedef struct multipath_private_data multipath_conf_t;
 
 /*
- * this is the only point in the RAID code where we violate
- * C type safety. mddev->private is an 'opaque' pointer.
- */
-#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
-
-/*
  * this is our 'private' 'collective' MULTIPATH buffer head.
  * it contains information about what kind of IO operations were started
  * for this MULTIPATH operation, and about their status:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 925507e7d67..ab4a489d869 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -26,8 +26,8 @@
 static void raid0_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	raid0_conf_t *conf = mddev->private;
+	mdk_rdev_t **devlist = conf->devlist;
 	int i;
 
 	for (i=0; i<mddev->raid_disks; i++) {
@@ -40,8 +40,8 @@ static void raid0_unplug(struct request_queue *q)
 static int raid0_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	raid0_conf_t *conf = mddev->private;
+	mdk_rdev_t **devlist = conf->devlist;
 	int i, ret = 0;
 
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
@@ -52,27 +52,60 @@ static int raid0_congested(void *data, int bits)
 	return ret;
 }
 
+/*
+ * inform the user of the raid configuration
+*/
+static void dump_zones(mddev_t *mddev)
+{
+	int j, k, h;
+	sector_t zone_size = 0;
+	sector_t zone_start = 0;
+	char b[BDEVNAME_SIZE];
+	raid0_conf_t *conf = mddev->private;
+	printk(KERN_INFO "******* %s configuration *********\n",
+		mdname(mddev));
+	h = 0;
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		printk(KERN_INFO "zone%d=[", j);
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			printk("%s/",
+			bdevname(conf->devlist[j*mddev->raid_disks
+						+ k]->bdev, b));
+		printk("]\n");
+
+		zone_size  = conf->strip_zone[j].zone_end - zone_start;
+		printk(KERN_INFO "        zone offset=%llukb "
+				"device offset=%llukb size=%llukb\n",
+			(unsigned long long)zone_start>>1,
+			(unsigned long long)conf->strip_zone[j].dev_start>>1,
+			(unsigned long long)zone_size>>1);
+		zone_start = conf->strip_zone[j].zone_end;
+	}
+	printk(KERN_INFO "**********************************\n\n");
+}
 
-static int create_strip_zones (mddev_t *mddev)
+static int create_strip_zones(mddev_t *mddev)
 {
-	int i, c, j;
-	sector_t current_start, curr_zone_start;
-	sector_t min_spacing;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
+	int i, c, j, err;
+	sector_t curr_zone_end, sectors;
+	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
 	struct strip_zone *zone;
 	int cnt;
 	char b[BDEVNAME_SIZE];
- 
-	/*
-	 * The number of 'same size groups'
-	 */
-	conf->nr_strip_zones = 0;
- 
+	raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+
+	if (!conf)
+		return -ENOMEM;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		printk(KERN_INFO "raid0: looking at %s\n",
 			bdevname(rdev1->bdev,b));
 		c = 0;
+
+		/* round size to chunk_size */
+		sectors = rdev1->sectors;
+		sector_div(sectors, mddev->chunk_sectors);
+		rdev1->sectors = sectors * mddev->chunk_sectors;
+
 		list_for_each_entry(rdev2, &mddev->disks, same_set) {
 			printk(KERN_INFO "raid0:   comparing %s(%llu)",
 			       bdevname(rdev1->bdev,b),
@@ -103,16 +136,16 @@ static int create_strip_zones (mddev_t *mddev)
 		}
 	}
 	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
-
+	err = -ENOMEM;
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
 	if (!conf->strip_zone)
-		return 1;
+		goto abort;
 	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
 				conf->nr_strip_zones*mddev->raid_disks,
 				GFP_KERNEL);
 	if (!conf->devlist)
-		return 1;
+		goto abort;
 
 	/* The first zone must contain all devices, so here we check that
 	 * there is a proper alignment of slots to devices and find them all
@@ -120,7 +153,8 @@ static int create_strip_zones (mddev_t *mddev)
 	zone = &conf->strip_zone[0];
 	cnt = 0;
 	smallest = NULL;
-	zone->dev = conf->devlist;
+	dev = conf->devlist;
+	err = -EINVAL;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
 
@@ -129,12 +163,12 @@ static int create_strip_zones (mddev_t *mddev)
 				"aborting!\n", j);
 			goto abort;
 		}
-		if (zone->dev[j]) {
+		if (dev[j]) {
 			printk(KERN_ERR "raid0: multiple devices for %d - "
 				"aborting!\n", j);
 			goto abort;
 		}
-		zone->dev[j] = rdev1;
+		dev[j] = rdev1;
 
 		blk_queue_stack_limits(mddev->queue,
 				       rdev1->bdev->bd_disk->queue);
@@ -157,34 +191,32 @@ static int create_strip_zones (mddev_t *mddev)
 		goto abort;
 	}
 	zone->nb_dev = cnt;
-	zone->sectors = smallest->sectors * cnt;
-	zone->zone_start = 0;
+	zone->zone_end = smallest->sectors * cnt;
 
-	current_start = smallest->sectors;
-	curr_zone_start = zone->sectors;
+	curr_zone_end = zone->zone_end;
 
 	/* now do the other zones */
 	for (i = 1; i < conf->nr_strip_zones; i++)
 	{
 		zone = conf->strip_zone + i;
-		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
+		dev = conf->devlist + i * mddev->raid_disks;
 
 		printk(KERN_INFO "raid0: zone %d\n", i);
-		zone->dev_start = current_start;
+		zone->dev_start = smallest->sectors;
 		smallest = NULL;
 		c = 0;
 
 		for (j=0; j<cnt; j++) {
 			char b[BDEVNAME_SIZE];
-			rdev = conf->strip_zone[0].dev[j];
+			rdev = conf->devlist[j];
 			printk(KERN_INFO "raid0: checking %s ...",
 				bdevname(rdev->bdev, b));
-			if (rdev->sectors <= current_start) {
+			if (rdev->sectors <= zone->dev_start) {
 				printk(KERN_INFO " nope.\n");
 				continue;
 			}
 			printk(KERN_INFO " contained as device %d\n", c);
-			zone->dev[c] = rdev;
+			dev[c] = rdev;
 			c++;
 			if (!smallest || rdev->sectors < smallest->sectors) {
 				smallest = rdev;
@@ -194,47 +226,39 @@ static int create_strip_zones (mddev_t *mddev)
 		}
 
 		zone->nb_dev = c;
-		zone->sectors = (smallest->sectors - current_start) * c;
+		sectors = (smallest->sectors - zone->dev_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
-			zone->nb_dev, (unsigned long long)zone->sectors);
+			zone->nb_dev, (unsigned long long)sectors);
 
-		zone->zone_start = curr_zone_start;
-		curr_zone_start += zone->sectors;
+		curr_zone_end += sectors;
+		zone->zone_end = curr_zone_end;
 
-		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
-			(unsigned long long)current_start);
-	}
-
-	/* Now find appropriate hash spacing.
-	 * We want a number which causes most hash entries to cover
-	 * at most two strips, but the hash table must be at most
-	 * 1 PAGE.  We choose the smallest strip, or contiguous collection
-	 * of strips, that has big enough size.  We never consider the last
-	 * strip though as it's size has no bearing on the efficacy of the hash
-	 * table.
-	 */
-	conf->spacing = curr_zone_start;
-	min_spacing = curr_zone_start;
-	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
-	for (i=0; i < conf->nr_strip_zones-1; i++) {
-		sector_t s = 0;
-		for (j = i; j < conf->nr_strip_zones - 1 &&
-				s < min_spacing; j++)
-			s += conf->strip_zone[j].sectors;
-		if (s >= min_spacing && s < conf->spacing)
-			conf->spacing = s;
+			(unsigned long long)smallest->sectors);
 	}
-
 	mddev->queue->unplug_fn = raid0_unplug;
-
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
+	/*
+	 * now since we have the hard sector sizes, we can make sure
+	 * chunk size is a multiple of that sector size
+	 */
+	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
+		printk(KERN_ERR "%s chunk_size of %d not valid\n",
+		       mdname(mddev),
+		       mddev->chunk_sectors << 9);
+		goto abort;
+	}
 	printk(KERN_INFO "raid0: done.\n");
+	mddev->private = conf;
 	return 0;
- abort:
-	return 1;
+abort:
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(conf);
+	mddev->private = NULL;
+	return err;
 }
 
 /**
@@ -252,10 +276,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 	mddev_t *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+	if (is_power_of_2(chunk_sectors))
+		max =  (chunk_sectors - ((sector & (chunk_sectors-1))
+						+ bio_sectors)) << 9;
+	else
+		max =  (chunk_sectors - (sector_div(sector, chunk_sectors)
+						+ bio_sectors)) << 9;
 	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
 	if (max <= biovec->bv_len && bio_sectors == 0)
 		return biovec->bv_len;
@@ -277,84 +306,28 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return array_sectors;
 }
 
-static int raid0_run (mddev_t *mddev)
+static int raid0_run(mddev_t *mddev)
 {
-	unsigned  cur=0, i=0, nb_zone;
-	s64 sectors;
-	raid0_conf_t *conf;
+	int ret;
 
-	if (mddev->chunk_size == 0) {
-		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+	if (mddev->chunk_sectors == 0) {
+		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
 		return -EINVAL;
 	}
-	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
-	       mdname(mddev),
-	       mddev->chunk_size >> 9,
-	       (mddev->chunk_size>>1)-1);
-	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
-	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+	blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors);
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
-	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
-	if (!conf)
-		goto out;
-	mddev->private = (void *)conf;
- 
-	conf->strip_zone = NULL;
-	conf->devlist = NULL;
-	if (create_strip_zones (mddev)) 
-		goto out_free_conf;
+	ret = create_strip_zones(mddev);
+	if (ret < 0)
+		return ret;
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
 
 	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
 		(unsigned long long)mddev->array_sectors);
-	printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
-		(unsigned long long)conf->spacing);
-	{
-		sector_t s = raid0_size(mddev, 0, 0);
-		sector_t space = conf->spacing;
-		int round;
-		conf->sector_shift = 0;
-		if (sizeof(sector_t) > sizeof(u32)) {
-			/*shift down space and s so that sector_div will work */
-			while (space > (sector_t) (~(u32)0)) {
-				s >>= 1;
-				space >>= 1;
-				s += 1; /* force round-up */
-				conf->sector_shift++;
-			}
-		}
-		round = sector_div(s, (u32)space) ? 1 : 0;
-		nb_zone = s + round;
-	}
-	printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
-
-	printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
-				nb_zone*sizeof(struct strip_zone*));
-	conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
-	if (!conf->hash_table)
-		goto out_free_conf;
-	sectors = conf->strip_zone[cur].sectors;
-
-	conf->hash_table[0] = conf->strip_zone + cur;
-	for (i=1; i< nb_zone; i++) {
-		while (sectors <= conf->spacing) {
-			cur++;
-			sectors += conf->strip_zone[cur].sectors;
-		}
-		sectors -= conf->spacing;
-		conf->hash_table[i] = conf->strip_zone + cur;
-	}
-	if (conf->sector_shift) {
-		conf->spacing >>= conf->sector_shift;
-		/* round spacing up so when we divide by it, we
-		 * err on the side of too-low, which is safest
-		 */
-		conf->spacing++;
-	}
-
 	/* calculate the max read-ahead size.
 	 * For read-ahead of large files to be effective, we need to
 	 * readahead at least twice a whole stripe. i.e. number of devices
@@ -365,48 +338,107 @@ static int raid0_run (mddev_t *mddev)
 	 * chunksize should be used in that case.
 	 */
 	{
-		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
+		int stripe = mddev->raid_disks *
+			(mddev->chunk_sectors << 9) / PAGE_SIZE;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
 
-
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
+	dump_zones(mddev);
 	return 0;
+}
 
-out_free_conf:
+static int raid0_stop(mddev_t *mddev)
+{
+	raid0_conf_t *conf = mddev->private;
+
+	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf->strip_zone);
 	kfree(conf->devlist);
 	kfree(conf);
 	mddev->private = NULL;
-out:
-	return -ENOMEM;
+	return 0;
 }
 
-static int raid0_stop (mddev_t *mddev)
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
+static struct strip_zone *find_zone(struct raid0_private_data *conf,
+				    sector_t *sectorp)
 {
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+	struct strip_zone *z = conf->strip_zone;
+	sector_t sector = *sectorp;
+
+	for (i = 0; i < conf->nr_strip_zones; i++)
+		if (sector < z[i].zone_end) {
+			if (i)
+				*sectorp = sector - z[i-1].zone_end;
+			return z + i;
+		}
+	BUG();
+}
 
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-	kfree(conf->hash_table);
-	conf->hash_table = NULL;
-	kfree(conf->strip_zone);
-	conf->strip_zone = NULL;
-	kfree(conf);
-	mddev->private = NULL;
+/*
+ * remaps the bio to the target device. we separate two flows.
+ * power 2 flow and a general flow for the sake of perfromance
+*/
+static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
+				sector_t sector, sector_t *sector_offset)
+{
+	unsigned int sect_in_chunk;
+	sector_t chunk;
+	raid0_conf_t *conf = mddev->private;
+	unsigned int chunk_sects = mddev->chunk_sectors;
+
+	if (is_power_of_2(chunk_sects)) {
+		int chunksect_bits = ffz(~chunk_sects);
+		/* find the sector offset inside the chunk */
+		sect_in_chunk  = sector & (chunk_sects - 1);
+		sector >>= chunksect_bits;
+		/* chunk in zone */
+		chunk = *sector_offset;
+		/* quotient is the chunk in real device*/
+		sector_div(chunk, zone->nb_dev << chunksect_bits);
+	} else{
+		sect_in_chunk = sector_div(sector, chunk_sects);
+		chunk = *sector_offset;
+		sector_div(chunk, chunk_sects * zone->nb_dev);
+	}
+	/*
+	*  position the bio over the real device
+	*  real sector = chunk in device + starting of zone
+	*	+ the position in the chunk
+	*/
+	*sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+	return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
+			     + sector_div(sector, zone->nb_dev)];
+}
 
-	return 0;
+/*
+ * Is io distribute over 1 or more chunks ?
+*/
+static inline int is_io_in_chunk_boundary(mddev_t *mddev,
+			unsigned int chunk_sects, struct bio *bio)
+{
+	if (likely(is_power_of_2(chunk_sects))) {
+		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
+					+ (bio->bi_size >> 9));
+	} else{
+		sector_t sector = bio->bi_sector;
+		return chunk_sects >= (sector_div(sector, chunk_sects)
+						+ (bio->bi_size >> 9));
+	}
 }
 
-static int raid0_make_request (struct request_queue *q, struct bio *bio)
+static int raid0_make_request(struct request_queue *q, struct bio *bio)
 {
 	mddev_t *mddev = q->queuedata;
-	unsigned int sect_in_chunk, chunksect_bits, chunk_sects;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	unsigned int chunk_sects;
+	sector_t sector_offset;
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
-	sector_t chunk;
-	sector_t sector, rsect;
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
@@ -421,11 +453,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		      bio_sectors(bio));
 	part_stat_unlock();
 
-	chunk_sects = mddev->chunk_size >> 9;
-	chunksect_bits = ffz(~chunk_sects);
-	sector = bio->bi_sector;
-
-	if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
+	chunk_sects = mddev->chunk_sectors;
+	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
+		sector_t sector = bio->bi_sector;
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
 		if (bio->bi_vcnt != 1 ||
@@ -434,7 +464,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
-		bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)));
+		if (likely(is_power_of_2(chunk_sects)))
+			bp = bio_split(bio, chunk_sects - (sector &
+							   (chunk_sects-1)));
+		else
+			bp = bio_split(bio, chunk_sects -
+				       sector_div(sector, chunk_sects));
 		if (raid0_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
 		if (raid0_make_request(q, &bp->bio2))
@@ -443,34 +478,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		bio_pair_release(bp);
 		return 0;
 	}
- 
-
-	{
-		sector_t x = sector >> conf->sector_shift;
-		sector_div(x, (u32)conf->spacing);
-		zone = conf->hash_table[x];
-	}
 
-	while (sector >= zone->zone_start + zone->sectors)
-		zone++;
-
-	sect_in_chunk = bio->bi_sector & (chunk_sects - 1);
-
-
-	{
-		sector_t x = (sector - zone->zone_start) >> chunksect_bits;
-
-		sector_div(x, zone->nb_dev);
-		chunk = x;
-
-		x = sector >> chunksect_bits;
-		tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
-	}
-	rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
- 
+	sector_offset = bio->bi_sector;
+	zone =  find_zone(mddev->private, &sector_offset);
+	tmp_dev = map_sector(mddev, zone, bio->bi_sector,
+			     &sector_offset);
 	bio->bi_bdev = tmp_dev->bdev;
-	bio->bi_sector = rsect + tmp_dev->data_offset;
-
+	bio->bi_sector = sector_offset + zone->dev_start +
+		tmp_dev->data_offset;
 	/*
 	 * Let the main block layer submit the IO and resolve recursion:
 	 */
@@ -485,31 +500,35 @@ bad_map:
 	return 0;
 }
 
-static void raid0_status (struct seq_file *seq, mddev_t *mddev)
+static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 {
 #undef MD_DEBUG
 #ifdef MD_DEBUG
 	int j, k, h;
 	char b[BDEVNAME_SIZE];
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	raid0_conf_t *conf = mddev->private;
 
+	sector_t zone_size;
+	sector_t zone_start = 0;
 	h = 0;
+
 	for (j = 0; j < conf->nr_strip_zones; j++) {
 		seq_printf(seq, "      z%d", j);
-		if (conf->hash_table[h] == conf->strip_zone+j)
-			seq_printf(seq, "(h%d)", h++);
 		seq_printf(seq, "=[");
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
 			seq_printf(seq, "%s/", bdevname(
-				conf->strip_zone[j].dev[k]->bdev,b));
-
-		seq_printf(seq, "] zs=%d ds=%d s=%d\n",
-				conf->strip_zone[j].zone_start,
-				conf->strip_zone[j].dev_start,
-				conf->strip_zone[j].sectors);
+				conf->devlist[j*mddev->raid_disks + k]
+						->bdev, b));
+
+		zone_size  = conf->strip_zone[j].zone_end - zone_start;
+		seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
+			(unsigned long long)zone_start>>1,
+			(unsigned long long)conf->strip_zone[j].dev_start>>1,
+			(unsigned long long)zone_size>>1);
+		zone_start = conf->strip_zone[j].zone_end;
 	}
 #endif
-	seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
+	seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
 	return;
 }
 
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 824b12eb1d4..91f8e876ee6 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -3,26 +3,18 @@
 
 struct strip_zone
 {
-	sector_t zone_start;	/* Zone offset in md_dev (in sectors) */
+	sector_t zone_end;	/* Start of the next zone (in sectors) */
 	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
-	sector_t sectors;	/* Zone size in sectors */
 	int nb_dev;		/* # of devices attached to the zone */
-	mdk_rdev_t **dev;	/* Devices attached to the zone */
 };
 
 struct raid0_private_data
 {
-	struct strip_zone **hash_table; /* Table of indexes into strip_zone */
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-
-	sector_t spacing;
-	int sector_shift; /* shift this before divide by spacing */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
 
-#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
-
 #endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e23758b4a34..89939a7aef5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 static void free_r1bio(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio)
 
 static void put_buf(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 	int i;
 
 	for (i=0; i<conf->raid_disks; i++) {
@@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 {
 	unsigned long flags;
 	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
  */
 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	conf->mirrors[disk].head_position =
 		r1_bio->sector + (r1_bio->sectors);
@@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	int mirror;
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	mirror = r1_bio->read_disk;
 	/*
@@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 	struct bio *to_put = NULL;
 
 
@@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q)
 static int raid1_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	rcu_read_lock();
@@ -772,7 +772,7 @@ do_sync_io:
 static int make_request(struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
@@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
@@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 	int mirror=0;
 
@@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error)
 
 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 	int disks = conf->raid_disks;
 	struct bio *bio, *wbio;
@@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev)
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	unsigned long flags;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	int unplug=0;
 	mdk_rdev_t *rdev;
@@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
-		conf = mddev_to_conf(mddev);
+		conf = mddev->private;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
@@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf)
 
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	sector_t max_sector, nr_sectors;
@@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev)
 		goto out_free_conf;
 	}
 
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid1: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	printk(KERN_INFO 
 		"raid1: raid set %s active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded, 
@@ -2087,7 +2091,7 @@ out:
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct bitmap *bitmap = mddev->bitmap;
 	int behind_wait = 0;
 
@@ -2155,16 +2159,16 @@ static int raid1_reshape(mddev_t *mddev)
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
 	mirror_info_t *newmirrors;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
 	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
-	if (mddev->chunk_size != mddev->new_chunk ||
+	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
 	    mddev->layout != mddev->new_layout ||
 	    mddev->level != mddev->new_level) {
-		mddev->new_chunk = mddev->chunk_size;
+		mddev->new_chunk_sectors = mddev->chunk_sectors;
 		mddev->new_layout = mddev->layout;
 		mddev->new_level = mddev->level;
 		return -EINVAL;
@@ -2252,7 +2256,7 @@ static int raid1_reshape(mddev_t *mddev)
 
 static void raid1_quiesce(mddev_t *mddev, int state)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	switch(state) {
 	case 1:
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 1620eea3d57..e87b84deff6 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -64,12 +64,6 @@ struct r1_private_data_s {
 typedef struct r1_private_data_s conf_t;
 
 /*
- * this is the only point in the RAID code where we violate
- * C type safety. mddev->private is an 'opaque' pointer.
- */
-#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-
-/*
  * this is our 'private' RAID1 bio.
  *
  * it contains information about what kind of IO operations were started
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 750550c1166..ae12ceafe10 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
 static void free_r10bio(r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio)
 
 static void put_buf(r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	mempool_free(r10_bio, conf->r10buf_pool);
 
@@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
 {
 	unsigned long flags;
 	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
@@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio)
  */
 static inline void update_head_pos(int slot, r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 		r10_bio->devs[slot].addr + (r10_bio->sectors);
@@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	int slot, dev;
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 
 	slot = r10_bio->read_slot;
@@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	int slot, dev;
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	for (slot = 0; slot < conf->copies; slot++)
 		if (r10_bio->devs[slot].bio == bio)
@@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	mddev_t *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -596,7 +596,7 @@ rb_out:
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q)
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	rcu_read_lock();
@@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf)
 static int make_request(struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r10bio_t *r10_bio;
 	struct bio *read_bio;
@@ -981,11 +981,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	if (conf->near_copies < conf->raid_disks)
-		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
+		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
 	if (conf->near_copies > 1)
 		seq_printf(seq, " %d near-copies", conf->near_copies);
 	if (conf->far_copies > 1) {
@@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1215,7 +1215,7 @@ abort:
 static void end_sync_read(struct bio *bio, int error)
 {
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 	int i,d;
 
 	for (i=0; i<conf->copies; i++)
@@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i,d;
 
 	for (i = 0; i < conf->copies; i++)
@@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error)
  */
 static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, first;
 	struct bio *tbio, *fbio;
 
@@ -1400,7 +1400,7 @@ done:
 
 static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, d;
 	struct bio *bio, *wbio;
 
@@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev)
 	r10bio_t *r10_bio;
 	struct bio *bio;
 	unsigned long flags;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	int unplug=0;
 	mdk_rdev_t *rdev;
@@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r10_bio->mddev;
-		conf = mddev_to_conf(mddev);
+		conf = mddev->private;
 		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
 			sync_request_write(mddev, r10_bio);
 			unplug = 1;
@@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf)
 
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	r10bio_t *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
@@ -2026,7 +2026,7 @@ static sector_t
 raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
 	sector_t size;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	if (!raid_disks)
 		raid_disks = mddev->raid_disks;
@@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev)
 	int nc, fc, fo;
 	sector_t stride, size;
 
-	if (mddev->chunk_size < PAGE_SIZE) {
+	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
+	    !is_power_of_2(mddev->chunk_sectors)) {
 		printk(KERN_ERR "md/raid10: chunk size must be "
-		       "at least PAGE_SIZE(%ld).\n", PAGE_SIZE);
+		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
 		return -EINVAL;
 	}
 
@@ -2095,8 +2096,8 @@ static int run(mddev_t *mddev)
 	conf->far_copies = fc;
 	conf->copies = nc*fc;
 	conf->far_offset = fo;
-	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
-	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
+	conf->chunk_mask = mddev->chunk_sectors - 1;
+	conf->chunk_shift = ffz(~mddev->chunk_sectors);
 	size = mddev->dev_sectors >> conf->chunk_shift;
 	sector_div(size, fc);
 	size = size * conf->raid_disks;
@@ -2185,6 +2186,10 @@ static int run(mddev_t *mddev)
 		goto out_free_conf;
 	}
 
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid10: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	printk(KERN_INFO
 		"raid10: raid set %s active with %d out of %d devices\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -2204,7 +2209,8 @@ static int run(mddev_t *mddev)
 	 * maybe...
 	 */
 	{
-		int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
+		int stripe = conf->raid_disks *
+			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		stripe /= conf->near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
@@ -2227,7 +2233,7 @@ out:
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	raise_barrier(conf, 0);
 	lower_barrier(conf);
@@ -2245,7 +2251,7 @@ static int stop(mddev_t *mddev)
 
 static void raid10_quiesce(mddev_t *mddev, int state)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	switch(state) {
 	case 1:
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 244dbe507a5..59cd1efb8d3 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -62,12 +62,6 @@ struct r10_private_data_s {
 typedef struct r10_private_data_s conf_t;
 
 /*
- * this is the only point in the RAID code where we violate
- * C type safety. mddev->private is an 'opaque' pointer.
- */
-#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-
-/*
  * this is our 'private' RAID10 bio.
  *
  * it contains information about what kind of IO operations were started
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bef87669823..f9f991e6e13 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1274,8 +1274,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 	sector_t new_sector;
 	int algorithm = previous ? conf->prev_algo
 				 : conf->algorithm;
-	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
-					 : (conf->chunk_size >> 9);
+	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+					 : conf->chunk_sectors;
 	int raid_disks = previous ? conf->previous_raid_disks
 				  : conf->raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -1480,8 +1480,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 	int raid_disks = sh->disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
-	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
-					 : (conf->chunk_size >> 9);
+	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+					 : conf->chunk_sectors;
 	int algorithm = previous ? conf->prev_algo
 				 : conf->algorithm;
 	sector_t stripe;
@@ -1997,8 +1997,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
 			    struct stripe_head *sh)
 {
 	int sectors_per_chunk =
-		previous ? (conf->prev_chunk >> 9)
-			 : (conf->chunk_size >> 9);
+		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
 	int dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
 	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
@@ -3284,7 +3283,7 @@ static void activate_bit_delay(raid5_conf_t *conf)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -3308,7 +3307,7 @@ static void unplug_slaves(mddev_t *mddev)
 static void raid5_unplug_device(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
@@ -3327,7 +3326,7 @@ static void raid5_unplug_device(struct request_queue *q)
 static int raid5_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	/* No difference between reads and writes.  Just check
 	 * how busy the stripe_cache is
@@ -3352,14 +3351,14 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 	mddev_t *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
 	if ((bvm->bi_rw & 1) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */
 
-	if (mddev->new_chunk < mddev->chunk_size)
-		chunk_sectors = mddev->new_chunk >> 9;
+	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+		chunk_sectors = mddev->new_chunk_sectors;
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0) max = 0;
 	if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3372,11 +3371,11 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
 {
 	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bio->bi_size >> 9;
 
-	if (mddev->new_chunk < mddev->chunk_size)
-		chunk_sectors = mddev->new_chunk >> 9;
+	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+		chunk_sectors = mddev->new_chunk_sectors;
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
@@ -3440,7 +3439,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 	bio_put(bi);
 
 	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
-	conf = mddev_to_conf(mddev);
+	conf = mddev->private;
 	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
 
@@ -3482,7 +3481,7 @@ static int bio_fits_rdev(struct bio *bi)
 static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
@@ -3599,7 +3598,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
 static int make_request(struct request_queue *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	int dd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
@@ -3696,6 +3695,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 				spin_unlock_irq(&conf->device_lock);
 				if (must_retry) {
 					release_stripe(sh);
+					schedule();
 					goto retry;
 				}
 			}
@@ -3791,10 +3791,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * If old and new chunk sizes differ, we need to process the
 	 * largest of these
 	 */
-	if (mddev->new_chunk > mddev->chunk_size)
-		reshape_sectors = mddev->new_chunk / 512;
+	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
+		reshape_sectors = mddev->new_chunk_sectors;
 	else
-		reshape_sectors = mddev->chunk_size / 512;
+		reshape_sectors = mddev->chunk_sectors;
 
 	/* we update the metadata when there is more than 3Meg
 	 * in the block range (that is rather arbitrary, should
@@ -3917,7 +3917,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 				     1, &dd_idx, NULL);
 	last_sector =
 		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
-					    *(new_data_disks) - 1),
+					    * new_data_disks - 1),
 				     1, &dd_idx, NULL);
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
@@ -3946,7 +3946,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
 		mddev->reshape_position = conf->reshape_progress;
-		mddev->curr_resync_completed = mddev->curr_resync;
+		mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
@@ -4129,7 +4129,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 static void raid5d(mddev_t *mddev)
 {
 	struct stripe_head *sh;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	int handled;
 
 	pr_debug("+++ raid5d active\n");
@@ -4185,7 +4185,7 @@ static void raid5d(mddev_t *mddev)
 static ssize_t
 raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", conf->max_nr_stripes);
 	else
@@ -4195,7 +4195,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 static ssize_t
 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned long new;
 	int err;
 
@@ -4233,7 +4233,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
 static ssize_t
 raid5_show_preread_threshold(mddev_t *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", conf->bypass_threshold);
 	else
@@ -4243,7 +4243,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page)
 static ssize_t
 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned long new;
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
@@ -4267,7 +4267,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
 static ssize_t
 stripe_cache_active_show(mddev_t *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
 	else
@@ -4291,7 +4291,7 @@ static struct attribute_group raid5_attrs_group = {
 static sector_t
 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	if (!sectors)
 		sectors = mddev->dev_sectors;
@@ -4303,8 +4303,8 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 			raid_disks = conf->previous_raid_disks;
 	}
 
-	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
@@ -4336,9 +4336,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
+	if (!mddev->new_chunk_sectors ||
+	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
+	    !is_power_of_2(mddev->new_chunk_sectors)) {
 		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
-			mddev->new_chunk, mdname(mddev));
+		       mddev->new_chunk_sectors << 9, mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -4401,7 +4403,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 			conf->fullsync = 1;
 	}
 
-	conf->chunk_size = mddev->new_chunk;
+	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->level = mddev->new_level;
 	if (conf->level == 6)
 		conf->max_degraded = 2;
@@ -4411,7 +4413,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	conf->max_nr_stripes = NR_STRIPES;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
-		conf->prev_chunk = mddev->chunk_size;
+		conf->prev_chunk_sectors = mddev->chunk_sectors;
 		conf->prev_algo = mddev->layout;
 	}
 
@@ -4453,6 +4455,10 @@ static int run(mddev_t *mddev)
 	int working_disks = 0;
 	mdk_rdev_t *rdev;
 
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid5: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
 		 * Currently only disks can change, it must
@@ -4475,7 +4481,7 @@ static int run(mddev_t *mddev)
 		 * geometry.
 		 */
 		here_new = mddev->reshape_position;
-		if (sector_div(here_new, (mddev->new_chunk>>9)*
+		if (sector_div(here_new, mddev->new_chunk_sectors *
 			       (mddev->raid_disks - max_degraded))) {
 			printk(KERN_ERR "raid5: reshape_position not "
 			       "on a stripe boundary\n");
@@ -4483,7 +4489,7 @@ static int run(mddev_t *mddev)
 		}
 		/* here_new is the stripe we will write to */
 		here_old = mddev->reshape_position;
-		sector_div(here_old, (mddev->chunk_size>>9)*
+		sector_div(here_old, mddev->chunk_sectors *
 			   (old_disks-max_degraded));
 		/* here_old is the first stripe that we might need to read
 		 * from */
@@ -4498,7 +4504,7 @@ static int run(mddev_t *mddev)
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
 		BUG_ON(mddev->layout != mddev->new_layout);
-		BUG_ON(mddev->chunk_size != mddev->new_chunk);
+		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
@@ -4532,7 +4538,7 @@ static int run(mddev_t *mddev)
 	}
 
 	/* device size must be a multiple of chunk size */
-	mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
 	if (mddev->degraded > 0 &&
@@ -4581,7 +4587,7 @@ static int run(mddev_t *mddev)
 	{
 		int data_disks = conf->previous_raid_disks - conf->max_degraded;
 		int stripe = data_disks *
-			(mddev->chunk_size / PAGE_SIZE);
+			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
@@ -4678,7 +4684,8 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	int i;
 
-	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
+	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
+		mddev->chunk_sectors / 2, mddev->layout);
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf (seq, "%s",
@@ -4826,7 +4833,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
 	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
 					       mddev->raid_disks));
 	if (mddev->array_sectors >
@@ -4843,14 +4850,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	return 0;
 }
 
-static int raid5_check_reshape(mddev_t *mddev)
+static int check_stripe_cache(mddev_t *mddev)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	/* Can only proceed if there are plenty of stripe_heads.
+	 * We need a minimum of one full stripe,, and for sensible progress
+	 * it is best to have about 4 times that.
+	 * If we require 4 times, then the default 256 4K stripe_heads will
+	 * allow for chunk sizes up to 256K, which is probably OK.
+	 * If the chunk size is greater, user-space should request more
+	 * stripe_heads first.
+	 */
+	raid5_conf_t *conf = mddev->private;
+	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+	    > conf->max_nr_stripes ||
+	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+	    > conf->max_nr_stripes) {
+		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
+		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+			/ STRIPE_SIZE)*4);
+		return 0;
+	}
+	return 1;
+}
+
+static int check_reshape(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev->private;
 
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
-	    mddev->new_chunk == mddev->chunk_size)
-		return -EINVAL; /* nothing to do */
+	    mddev->new_chunk_sectors == mddev->chunk_sectors)
+		return 0; /* nothing to do */
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
@@ -4869,28 +4899,15 @@ static int raid5_check_reshape(mddev_t *mddev)
 			return -EINVAL;
 	}
 
-	/* Can only proceed if there are plenty of stripe_heads.
-	 * We need a minimum of one full stripe,, and for sensible progress
-	 * it is best to have about 4 times that.
-	 * If we require 4 times, then the default 256 4K stripe_heads will
-	 * allow for chunk sizes up to 256K, which is probably OK.
-	 * If the chunk size is greater, user-space should request more
-	 * stripe_heads first.
-	 */
-	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
-	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
-		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (max(mddev->chunk_size, mddev->new_chunk)
-			/ STRIPE_SIZE)*4);
+	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
-	}
 
 	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
 }
 
 static int raid5_start_reshape(mddev_t *mddev)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	mdk_rdev_t *rdev;
 	int spares = 0;
 	int added_devices = 0;
@@ -4899,6 +4916,9 @@ static int raid5_start_reshape(mddev_t *mddev)
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return -EBUSY;
 
+	if (!check_stripe_cache(mddev))
+		return -ENOSPC;
+
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags))
@@ -4925,8 +4945,8 @@ static int raid5_start_reshape(mddev_t *mddev)
 	spin_lock_irq(&conf->device_lock);
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
-	conf->prev_chunk = conf->chunk_size;
-	conf->chunk_size = mddev->new_chunk;
+	conf->prev_chunk_sectors = conf->chunk_sectors;
+	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
 	if (mddev->delta_disks < 0)
@@ -5008,7 +5028,7 @@ static void end_reshape(raid5_conf_t *conf)
 		 */
 		{
 			int data_disks = conf->raid_disks - conf->max_degraded;
-			int stripe = data_disks * (conf->chunk_size
+			int stripe = data_disks * ((conf->chunk_sectors << 9)
 						   / PAGE_SIZE);
 			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
@@ -5022,7 +5042,7 @@ static void end_reshape(raid5_conf_t *conf)
 static void raid5_finish_reshape(mddev_t *mddev)
 {
 	struct block_device *bdev;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 
@@ -5053,7 +5073,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 				raid5_remove_disk(mddev, d);
 		}
 		mddev->layout = conf->algorithm;
-		mddev->chunk_size = conf->chunk_size;
+		mddev->chunk_sectors = conf->chunk_sectors;
 		mddev->reshape_position = MaxSector;
 		mddev->delta_disks = 0;
 	}
@@ -5061,7 +5081,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	switch(state) {
 	case 2: /* resume for a suspend */
@@ -5111,7 +5131,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev)
 
 	mddev->new_level = 5;
 	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
-	mddev->new_chunk = chunksect << 9;
+	mddev->new_chunk_sectors = chunksect;
 
 	return setup_conf(mddev);
 }
@@ -5150,24 +5170,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev)
 }
 
 
-static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+static int raid5_check_reshape(mddev_t *mddev)
 {
 	/* For a 2-drive array, the layout and chunk size can be changed
 	 * immediately as not restriping is needed.
 	 * For larger arrays we record the new value - after validation
 	 * to be used by a reshape pass.
 	 */
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
+	int new_chunk = mddev->new_chunk_sectors;
 
-	if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
-		if (new_chunk & (new_chunk-1))
-			/* not a power of 2 */
+		if (!is_power_of_2(new_chunk))
 			return -EINVAL;
-		if (new_chunk < PAGE_SIZE)
+		if (new_chunk < (PAGE_SIZE>>9))
 			return -EINVAL;
-		if (mddev->array_sectors & ((new_chunk>>9)-1))
+		if (mddev->array_sectors & (new_chunk-1))
 			/* not factor of array size */
 			return -EINVAL;
 	}
@@ -5175,49 +5195,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	/* They look valid */
 
 	if (mddev->raid_disks == 2) {
-
-		if (new_layout >= 0) {
-			conf->algorithm = new_layout;
-			mddev->layout = mddev->new_layout = new_layout;
+		/* can make the change immediately */
+		if (mddev->new_layout >= 0) {
+			conf->algorithm = mddev->new_layout;
+			mddev->layout = mddev->new_layout;
 		}
 		if (new_chunk > 0) {
-			conf->chunk_size = new_chunk;
-			mddev->chunk_size = mddev->new_chunk = new_chunk;
+			conf->chunk_sectors = new_chunk ;
+			mddev->chunk_sectors = new_chunk;
 		}
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
-	} else {
-		if (new_layout >= 0)
-			mddev->new_layout = new_layout;
-		if (new_chunk > 0)
-			mddev->new_chunk = new_chunk;
 	}
-	return 0;
+	return check_reshape(mddev);
 }
 
-static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+static int raid6_check_reshape(mddev_t *mddev)
 {
-	if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+	int new_chunk = mddev->new_chunk_sectors;
+
+	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
-		if (new_chunk & (new_chunk-1))
-			/* not a power of 2 */
+		if (!is_power_of_2(new_chunk))
 			return -EINVAL;
-		if (new_chunk < PAGE_SIZE)
+		if (new_chunk < (PAGE_SIZE >> 9))
 			return -EINVAL;
-		if (mddev->array_sectors & ((new_chunk>>9)-1))
+		if (mddev->array_sectors & (new_chunk-1))
 			/* not factor of array size */
 			return -EINVAL;
 	}
 
 	/* They look valid */
-
-	if (new_layout >= 0)
-		mddev->new_layout = new_layout;
-	if (new_chunk > 0)
-		mddev->new_chunk = new_chunk;
-
-	return 0;
+	return check_reshape(mddev);
 }
 
 static void *raid5_takeover(mddev_t *mddev)
@@ -5227,8 +5237,6 @@ static void *raid5_takeover(mddev_t *mddev)
 	 *  raid1 - if there are two drives.  We need to know the chunk size
 	 *  raid4 - trivial - just use a raid4 layout.
 	 *  raid6 - Providing it is a *_6 layout
-	 *
-	 * For now, just do raid1
 	 */
 
 	if (mddev->level == 1)
@@ -5310,12 +5318,11 @@ static struct mdk_personality raid6_personality =
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
 	.size		= raid5_size,
-	.check_reshape	= raid5_check_reshape,
+	.check_reshape	= raid6_check_reshape,
 	.start_reshape  = raid5_start_reshape,
 	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid6_takeover,
-	.reconfig	= raid6_reconfig,
 };
 static struct mdk_personality raid5_personality =
 {
@@ -5338,7 +5345,6 @@ static struct mdk_personality raid5_personality =
 	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
-	.reconfig	= raid5_reconfig,
 };
 
 static struct mdk_personality raid4_personality =
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 52ba99954de..9459689c4ea 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -334,7 +334,8 @@ struct raid5_private_data {
 	struct hlist_head	*stripe_hashtbl;
 	mddev_t			*mddev;
 	struct disk_info	*spare;
-	int			chunk_size, level, algorithm;
+	int			chunk_sectors;
+	int			level, algorithm;
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
@@ -350,7 +351,8 @@ struct raid5_private_data {
 	 */
 	sector_t		reshape_safe;
 	int			previous_raid_disks;
-	int			prev_chunk, prev_algo;
+	int			prev_chunk_sectors;
+	int			prev_algo;
 	short			generation; /* increments with every reshape */
 	unsigned long		reshape_checkpoint; /* Time we last updated
 						     * metadata */
@@ -408,8 +410,6 @@ struct raid5_private_data {
 
 typedef struct raid5_private_data raid5_conf_t;
 
-#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
-
 /*
  * Our supported algorithms
  */
diff --git a/drivers/misc/sgi-gru/Makefile b/drivers/misc/sgi-gru/Makefile
index bcd8136d2f9..7c4c306dfa8 100644
--- a/drivers/misc/sgi-gru/Makefile
+++ b/drivers/misc/sgi-gru/Makefile
@@ -3,5 +3,5 @@ ifdef CONFIG_SGI_GRU_DEBUG
 endif
 
 obj-$(CONFIG_SGI_GRU) := gru.o
-gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o gruhandles.o
+gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o gruhandles.o grukdump.o
 
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
index 3fde33c1e8f..3c9c06618e6 100644
--- a/drivers/misc/sgi-gru/gru_instructions.h
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -81,6 +81,8 @@ struct control_block_extended_exc_detail {
 	int		exopc;
 	long		exceptdet0;
 	int		exceptdet1;
+	int		cbrstate;
+	int		cbrexecstatus;
 };
 
 /*
@@ -107,7 +109,8 @@ struct gru_instruction_bits {
     unsigned char		reserved2: 2;
     unsigned char		istatus:   2;
     unsigned char		isubstatus:4;
-    unsigned char		reserved3: 2;
+    unsigned char		reserved3: 1;
+    unsigned char		tlb_fault_color: 1;
     /* DW 1 */
     unsigned long		idef4;		/* 42 bits: TRi1, BufSize */
     /* DW 2-6 */
@@ -250,17 +253,37 @@ struct gru_instruction {
 #define CBE_CAUSE_HA_RESPONSE_FATAL		(1 << 13)
 #define CBE_CAUSE_HA_RESPONSE_NON_FATAL		(1 << 14)
 #define CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR	(1 << 15)
-#define CBE_CAUSE_RESPONSE_DATA_ERROR		(1 << 16)
-#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR	(1 << 17)
+#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR	(1 << 16)
+#define CBE_CAUSE_RA_RESPONSE_DATA_ERROR	(1 << 17)
+#define CBE_CAUSE_HA_RESPONSE_DATA_ERROR	(1 << 18)
+
+/* CBE cbrexecstatus bits */
+#define CBR_EXS_ABORT_OCC_BIT			0
+#define CBR_EXS_INT_OCC_BIT			1
+#define CBR_EXS_PENDING_BIT			2
+#define CBR_EXS_QUEUED_BIT			3
+#define CBR_EXS_TLB_INVAL_BIT			4
+#define CBR_EXS_EXCEPTION_BIT			5
+
+#define CBR_EXS_ABORT_OCC			(1 << CBR_EXS_ABORT_OCC_BIT)
+#define CBR_EXS_INT_OCC				(1 << CBR_EXS_INT_OCC_BIT)
+#define CBR_EXS_PENDING				(1 << CBR_EXS_PENDING_BIT)
+#define CBR_EXS_QUEUED				(1 << CBR_EXS_QUEUED_BIT)
+#define CBR_TLB_INVAL				(1 << CBR_EXS_TLB_INVAL_BIT)
+#define CBR_EXS_EXCEPTION			(1 << CBR_EXS_EXCEPTION_BIT)
 
 /*
  * Exceptions are retried for the following cases. If any OTHER bits are set
  * in ecause, the exception is not retryable.
  */
-#define EXCEPTION_RETRY_BITS (CBE_CAUSE_RESPONSE_DATA_ERROR |		\
-			      CBE_CAUSE_RA_REQUEST_TIMEOUT |		\
+#define EXCEPTION_RETRY_BITS (CBE_CAUSE_EXECUTION_HW_ERROR |		\
 			      CBE_CAUSE_TLBHW_ERROR |			\
-			      CBE_CAUSE_HA_REQUEST_TIMEOUT)
+			      CBE_CAUSE_RA_REQUEST_TIMEOUT |		\
+			      CBE_CAUSE_RA_RESPONSE_NON_FATAL |		\
+			      CBE_CAUSE_HA_RESPONSE_NON_FATAL |		\
+			      CBE_CAUSE_RA_RESPONSE_DATA_ERROR |	\
+			      CBE_CAUSE_HA_RESPONSE_DATA_ERROR		\
+			      )
 
 /* Message queue head structure */
 union gru_mesqhead {
@@ -600,9 +623,11 @@ static inline int gru_get_cb_substatus(void *cb)
 	return cbs->isubstatus;
 }
 
-/* Check the status of a CB. If the CB is in UPM mode, call the
- * OS to handle the UPM status.
- * Returns the CB status field value (0 for normal completion)
+/*
+ * User interface to check an instruction status. UPM and exceptions
+ * are handled automatically. However, this function does NOT wait
+ * for an active instruction to complete.
+ *
  */
 static inline int gru_check_status(void *cb)
 {
@@ -610,34 +635,31 @@ static inline int gru_check_status(void *cb)
 	int ret;
 
 	ret = cbs->istatus;
-	if (ret == CBS_CALL_OS)
+	if (ret != CBS_ACTIVE)
 		ret = gru_check_status_proc(cb);
 	return ret;
 }
 
-/* Wait for CB to complete.
- * Returns the CB status field value (0 for normal completion)
+/*
+ * User interface (via inline function) to wait for an instruction
+ * to complete. Completion status (IDLE or EXCEPTION is returned
+ * to the user. Exception due to hardware errors are automatically
+ * retried before returning an exception.
+ *
  */
 static inline int gru_wait(void *cb)
 {
-	struct gru_control_block_status *cbs = (void *)cb;
-	int ret = cbs->istatus;
-
-	if (ret != CBS_IDLE)
-		ret = gru_wait_proc(cb);
-	return ret;
+	return gru_wait_proc(cb);
 }
 
-/* Wait for CB to complete. Aborts program if error. (Note: error does NOT
+/*
+ * Wait for CB to complete. Aborts program if error. (Note: error does NOT
  * mean TLB mis - only fatal errors such as memory parity error or user
  * bugs will cause termination.
  */
 static inline void gru_wait_abort(void *cb)
 {
-	struct gru_control_block_status *cbs = (void *)cb;
-
-	if (cbs->istatus != CBS_IDLE)
-		gru_wait_abort_proc(cb);
+	gru_wait_abort_proc(cb);
 }
 
 
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index ab118558552..679e0177828 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -166,7 +166,8 @@ static inline struct gru_state *irq_to_gru(int irq)
  * the GRU, atomic operations must be used to clear bits.
  */
 static void get_clear_fault_map(struct gru_state *gru,
-				struct gru_tlb_fault_map *map)
+				struct gru_tlb_fault_map *imap,
+				struct gru_tlb_fault_map *dmap)
 {
 	unsigned long i, k;
 	struct gru_tlb_fault_map *tfm;
@@ -177,7 +178,11 @@ static void get_clear_fault_map(struct gru_state *gru,
 		k = tfm->fault_bits[i];
 		if (k)
 			k = xchg(&tfm->fault_bits[i], 0UL);
-		map->fault_bits[i] = k;
+		imap->fault_bits[i] = k;
+		k = tfm->done_bits[i];
+		if (k)
+			k = xchg(&tfm->done_bits[i], 0UL);
+		dmap->fault_bits[i] = k;
 	}
 
 	/*
@@ -334,6 +339,12 @@ static int gru_try_dropin(struct gru_thread_state *gts,
 	 * Might be a hardware race OR a stupid user. Ignore FMM because FMM
 	 * is a transient state.
 	 */
+	if (tfh->status != TFHSTATUS_EXCEPTION) {
+		gru_flush_cache(tfh);
+		if (tfh->status != TFHSTATUS_EXCEPTION)
+			goto failnoexception;
+		STAT(tfh_stale_on_fault);
+	}
 	if (tfh->state == TFHSTATE_IDLE)
 		goto failidle;
 	if (tfh->state == TFHSTATE_MISS_FMM && cb)
@@ -401,8 +412,17 @@ failfmm:
 	gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
 	return 0;
 
+failnoexception:
+	/* TFH status did not show exception pending */
+	gru_flush_cache(tfh);
+	if (cb)
+		gru_flush_cache(cb);
+	STAT(tlb_dropin_fail_no_exception);
+	gru_dbg(grudev, "FAILED non-exception tfh: 0x%p, status %d, state %d\n", tfh, tfh->status, tfh->state);
+	return 0;
+
 failidle:
-	/* TFH was idle  - no miss pending */
+	/* TFH state was idle  - no miss pending */
 	gru_flush_cache(tfh);
 	if (cb)
 		gru_flush_cache(cb);
@@ -438,7 +458,7 @@ failactive:
 irqreturn_t gru_intr(int irq, void *dev_id)
 {
 	struct gru_state *gru;
-	struct gru_tlb_fault_map map;
+	struct gru_tlb_fault_map imap, dmap;
 	struct gru_thread_state *gts;
 	struct gru_tlb_fault_handle *tfh = NULL;
 	int cbrnum, ctxnum;
@@ -451,11 +471,15 @@ irqreturn_t gru_intr(int irq, void *dev_id)
 			raw_smp_processor_id(), irq);
 		return IRQ_NONE;
 	}
-	get_clear_fault_map(gru, &map);
-	gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
-		map.fault_bits[0]);
+	get_clear_fault_map(gru, &imap, &dmap);
+
+	for_each_cbr_in_tfm(cbrnum, dmap.fault_bits) {
+		complete(gru->gs_blade->bs_async_wq);
+		gru_dbg(grudev, "gid %d, cbr_done %d, done %d\n",
+			gru->gs_gid, cbrnum, gru->gs_blade->bs_async_wq->done);
+	}
 
-	for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
+	for_each_cbr_in_tfm(cbrnum, imap.fault_bits) {
 		tfh = get_tfh_by_index(gru, cbrnum);
 		prefetchw(tfh);	/* Helps on hdw, required for emulator */
 
@@ -472,7 +496,9 @@ irqreturn_t gru_intr(int irq, void *dev_id)
 		 * This is running in interrupt context. Trylock the mmap_sem.
 		 * If it fails, retry the fault in user context.
 		 */
-		if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
+		if (!gts->ts_force_cch_reload &&
+					down_read_trylock(&gts->ts_mm->mmap_sem)) {
+			gts->ustats.fmm_tlbdropin++;
 			gru_try_dropin(gts, tfh, NULL);
 			up_read(&gts->ts_mm->mmap_sem);
 		} else {
@@ -491,6 +517,7 @@ static int gru_user_dropin(struct gru_thread_state *gts,
 	struct gru_mm_struct *gms = gts->ts_gms;
 	int ret;
 
+	gts->ustats.upm_tlbdropin++;
 	while (1) {
 		wait_event(gms->ms_wait_queue,
 			   atomic_read(&gms->ms_range_active) == 0);
@@ -546,8 +573,8 @@ int gru_handle_user_call_os(unsigned long cb)
 	 * CCH may contain stale data if ts_force_cch_reload is set.
 	 */
 	if (gts->ts_gru && gts->ts_force_cch_reload) {
-		gru_update_cch(gts, 0);
 		gts->ts_force_cch_reload = 0;
+		gru_update_cch(gts, 0);
 	}
 
 	ret = -EAGAIN;
@@ -589,20 +616,26 @@ int gru_get_exception_detail(unsigned long arg)
 	} else if (gts->ts_gru) {
 		cbrnum = thread_cbr_number(gts, ucbnum);
 		cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
-		prefetchw(cbe);/* Harmless on hardware, required for emulator */
+		gru_flush_cache(cbe);	/* CBE not coherent */
 		excdet.opc = cbe->opccpy;
 		excdet.exopc = cbe->exopccpy;
 		excdet.ecause = cbe->ecause;
 		excdet.exceptdet0 = cbe->idef1upd;
 		excdet.exceptdet1 = cbe->idef3upd;
+		excdet.cbrstate = cbe->cbrstate;
+		excdet.cbrexecstatus = cbe->cbrexecstatus;
+		gru_flush_cache(cbe);
 		ret = 0;
 	} else {
 		ret = -EAGAIN;
 	}
 	gru_unlock_gts(gts);
 
-	gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
-		excdet.ecause);
+	gru_dbg(grudev,
+		"cb 0x%lx, op %d, exopc %d, cbrstate %d, cbrexecstatus 0x%x, ecause 0x%x, "
+		"exdet0 0x%lx, exdet1 0x%x\n",
+		excdet.cb, excdet.opc, excdet.exopc, excdet.cbrstate, excdet.cbrexecstatus,
+		excdet.ecause, excdet.exceptdet0, excdet.exceptdet1);
 	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
 		ret = -EFAULT;
 	return ret;
@@ -627,7 +660,7 @@ static int gru_unload_all_contexts(void)
 			if (gts && mutex_trylock(&gts->ts_ctxlock)) {
 				spin_unlock(&gru->gs_lock);
 				gru_unload_context(gts, 1);
-				gru_unlock_gts(gts);
+				mutex_unlock(&gts->ts_ctxlock);
 				spin_lock(&gru->gs_lock);
 			}
 		}
@@ -669,6 +702,7 @@ int gru_user_flush_tlb(unsigned long arg)
 {
 	struct gru_thread_state *gts;
 	struct gru_flush_tlb_req req;
+	struct gru_mm_struct *gms;
 
 	STAT(user_flush_tlb);
 	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
@@ -681,8 +715,34 @@ int gru_user_flush_tlb(unsigned long arg)
 	if (!gts)
 		return -EINVAL;
 
-	gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.len);
+	gms = gts->ts_gms;
 	gru_unlock_gts(gts);
+	gru_flush_tlb_range(gms, req.vaddr, req.len);
+
+	return 0;
+}
+
+/*
+ * Fetch GSEG statisticss
+ */
+long gru_get_gseg_statistics(unsigned long arg)
+{
+	struct gru_thread_state *gts;
+	struct gru_get_gseg_statistics_req req;
+
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	gts = gru_find_lock_gts(req.gseg);
+	if (gts) {
+		memcpy(&req.stats, &gts->ustats, sizeof(gts->ustats));
+		gru_unlock_gts(gts);
+	} else {
+		memset(&req.stats, 0, sizeof(gts->ustats));
+	}
+
+	if (copy_to_user((void __user *)arg, &req, sizeof(req)))
+		return -EFAULT;
 
 	return 0;
 }
@@ -691,18 +751,34 @@ int gru_user_flush_tlb(unsigned long arg)
  * Register the current task as the user of the GSEG slice.
  * Needed for TLB fault interrupt targeting.
  */
-int gru_set_task_slice(long address)
+int gru_set_context_option(unsigned long arg)
 {
 	struct gru_thread_state *gts;
+	struct gru_set_context_option_req req;
+	int ret = 0;
+
+	STAT(set_context_option);
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+	gru_dbg(grudev, "op %d, gseg 0x%lx, value1 0x%lx\n", req.op, req.gseg, req.val1);
 
-	STAT(set_task_slice);
-	gru_dbg(grudev, "address 0x%lx\n", address);
-	gts = gru_alloc_locked_gts(address);
+	gts = gru_alloc_locked_gts(req.gseg);
 	if (!gts)
 		return -EINVAL;
 
-	gts->ts_tgid_owner = current->tgid;
+	switch (req.op) {
+	case sco_gseg_owner:
+ 		/* Register the current task as the GSEG owner */
+		gts->ts_tgid_owner = current->tgid;
+		break;
+	case sco_cch_req_slice:
+ 		/* Set the CCH slice option */
+		gts->ts_cch_req_slice = req.val1 & 3;
+		break;
+	default:
+		ret = -EINVAL;
+	}
 	gru_unlock_gts(gts);
 
-	return 0;
+	return ret;
 }
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 3ce2920e2bf..fa2d93a9fb8 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -46,6 +46,7 @@
 
 struct gru_blade_state *gru_base[GRU_MAX_BLADES] __read_mostly;
 unsigned long gru_start_paddr __read_mostly;
+void *gru_start_vaddr __read_mostly;
 unsigned long gru_end_paddr __read_mostly;
 unsigned int gru_max_gids __read_mostly;
 struct gru_stats_s gru_stats;
@@ -135,11 +136,9 @@ static int gru_create_new_context(unsigned long arg)
 	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
 		return -EFAULT;
 
-	if (req.data_segment_bytes == 0 ||
-				req.data_segment_bytes > max_user_dsr_bytes)
+	if (req.data_segment_bytes > max_user_dsr_bytes)
 		return -EINVAL;
-	if (!req.control_blocks || !req.maximum_thread_count ||
-				req.control_blocks > max_user_cbrs)
+	if (req.control_blocks > max_user_cbrs || !req.maximum_thread_count)
 		return -EINVAL;
 
 	if (!(req.options & GRU_OPT_MISS_MASK))
@@ -184,41 +183,6 @@ static long gru_get_config_info(unsigned long arg)
 }
 
 /*
- * Get GRU chiplet status
- */
-static long gru_get_chiplet_status(unsigned long arg)
-{
-	struct gru_state *gru;
-	struct gru_chiplet_info info;
-
-	if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
-		return -EFAULT;
-
-	if (info.node == -1)
-		info.node = numa_node_id();
-	if (info.node >= num_possible_nodes() ||
-			info.chiplet >= GRU_CHIPLETS_PER_HUB ||
-			info.node < 0 || info.chiplet < 0)
-		return -EINVAL;
-
-	info.blade = uv_node_to_blade_id(info.node);
-	gru = get_gru(info.blade, info.chiplet);
-
-	info.total_dsr_bytes = GRU_NUM_DSR_BYTES;
-	info.total_cbr = GRU_NUM_CB;
-	info.total_user_dsr_bytes = GRU_NUM_DSR_BYTES -
-		gru->gs_reserved_dsr_bytes;
-	info.total_user_cbr = GRU_NUM_CB - gru->gs_reserved_cbrs;
-	info.free_user_dsr_bytes = hweight64(gru->gs_dsr_map) *
-			GRU_DSR_AU_BYTES;
-	info.free_user_cbr = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
-
-	if (copy_to_user((void __user *)arg, &info, sizeof(info)))
-		return -EFAULT;
-	return 0;
-}
-
-/*
  * gru_file_unlocked_ioctl
  *
  * Called to update file attributes via IOCTL calls.
@@ -234,8 +198,8 @@ static long gru_file_unlocked_ioctl(struct file *file, unsigned int req,
 	case GRU_CREATE_CONTEXT:
 		err = gru_create_new_context(arg);
 		break;
-	case GRU_SET_TASK_SLICE:
-		err = gru_set_task_slice(arg);
+	case GRU_SET_CONTEXT_OPTION:
+		err = gru_set_context_option(arg);
 		break;
 	case GRU_USER_GET_EXCEPTION_DETAIL:
 		err = gru_get_exception_detail(arg);
@@ -243,18 +207,24 @@ static long gru_file_unlocked_ioctl(struct file *file, unsigned int req,
 	case GRU_USER_UNLOAD_CONTEXT:
 		err = gru_user_unload_context(arg);
 		break;
-	case GRU_GET_CHIPLET_STATUS:
-		err = gru_get_chiplet_status(arg);
-		break;
 	case GRU_USER_FLUSH_TLB:
 		err = gru_user_flush_tlb(arg);
 		break;
 	case GRU_USER_CALL_OS:
 		err = gru_handle_user_call_os(arg);
 		break;
+	case GRU_GET_GSEG_STATISTICS:
+		err = gru_get_gseg_statistics(arg);
+		break;
+	case GRU_KTEST:
+		err = gru_ktest(arg);
+		break;
 	case GRU_GET_CONFIG_INFO:
 		err = gru_get_config_info(arg);
 		break;
+	case GRU_DUMP_CHIPLET_STATE:
+		err = gru_dump_chiplet_request(arg);
+		break;
 	}
 	return err;
 }
@@ -282,7 +252,6 @@ static void gru_init_chiplet(struct gru_state *gru, unsigned long paddr,
 	gru_dbg(grudev, "bid %d, nid %d, gid %d, vaddr %p (0x%lx)\n",
 		bid, nid, gru->gs_gid, gru->gs_gru_base_vaddr,
 		gru->gs_gru_base_paddr);
-	gru_kservices_init(gru);
 }
 
 static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
@@ -309,6 +278,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
 		memset(gru_base[bid], 0, sizeof(struct gru_blade_state));
 		gru_base[bid]->bs_lru_gru = &gru_base[bid]->bs_grus[0];
 		spin_lock_init(&gru_base[bid]->bs_lock);
+		init_rwsem(&gru_base[bid]->bs_kgts_sema);
 
 		dsrbytes = 0;
 		cbrs = 0;
@@ -372,7 +342,6 @@ static int __init gru_init(void)
 {
 	int ret, irq, chip;
 	char id[10];
-	void *gru_start_vaddr;
 
 	if (!is_uv_system())
 		return 0;
@@ -422,6 +391,7 @@ static int __init gru_init(void)
 		printk(KERN_ERR "%s: init tables failed\n", GRU_DRIVER_ID_STR);
 		goto exit3;
 	}
+	gru_kservices_init();
 
 	printk(KERN_INFO "%s: v%s\n", GRU_DRIVER_ID_STR,
 	       GRU_DRIVER_VERSION_STR);
@@ -440,7 +410,7 @@ exit1:
 
 static void __exit gru_exit(void)
 {
-	int i, bid, gid;
+	int i, bid;
 	int order = get_order(sizeof(struct gru_state) *
 			      GRU_CHIPLETS_PER_BLADE);
 
@@ -449,10 +419,7 @@ static void __exit gru_exit(void)
 
 	for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++)
 		free_irq(IRQ_GRU + i, NULL);
-
-	foreach_gid(gid)
-		gru_kservices_exit(GID_TO_GRU(gid));
-
+	gru_kservices_exit();
 	for (bid = 0; bid < GRU_MAX_BLADES; bid++)
 		free_pages((unsigned long)gru_base[bid], order);
 
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 9b7ccb32869..37e7cfc53b9 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -57,7 +57,7 @@ static void start_instruction(void *h)
 static int wait_instruction_complete(void *h, enum mcs_op opc)
 {
 	int status;
-	cycles_t start_time = get_cycles();
+	unsigned long start_time = get_cycles();
 
 	while (1) {
 		cpu_relax();
@@ -65,25 +65,16 @@ static int wait_instruction_complete(void *h, enum mcs_op opc)
 		if (status != CCHSTATUS_ACTIVE)
 			break;
 		if (GRU_OPERATION_TIMEOUT < (get_cycles() - start_time))
-			panic("GRU %p is malfunctioning\n", h);
+			panic("GRU %p is malfunctioning: start %ld, end %ld\n",
+			      h, start_time, (unsigned long)get_cycles());
 	}
 	if (gru_options & OPT_STATS)
 		update_mcs_stats(opc, get_cycles() - start_time);
 	return status;
 }
 
-int cch_allocate(struct gru_context_configuration_handle *cch,
-		int asidval, int sizeavail, unsigned long cbrmap,
-		unsigned long dsrmap)
+int cch_allocate(struct gru_context_configuration_handle *cch)
 {
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		cch->asid[i] = (asidval++);
-		cch->sizeavail[i] = sizeavail;
-	}
-	cch->dsr_allocation_map = dsrmap;
-	cch->cbr_allocation_map = cbrmap;
 	cch->opc = CCHOP_ALLOCATE;
 	start_instruction(cch);
 	return wait_instruction_complete(cch, cchop_allocate);
diff --git a/drivers/misc/sgi-gru/gruhandles.h b/drivers/misc/sgi-gru/gruhandles.h
index 1ed74d7508c..f44112242d0 100644
--- a/drivers/misc/sgi-gru/gruhandles.h
+++ b/drivers/misc/sgi-gru/gruhandles.h
@@ -39,7 +39,6 @@
 #define GRU_NUM_CBE		128
 #define GRU_NUM_TFH		128
 #define GRU_NUM_CCH		16
-#define GRU_NUM_GSH		1
 
 /* Maximum resource counts that can be reserved by user programs */
 #define GRU_NUM_USER_CBR	GRU_NUM_CBE
@@ -56,7 +55,6 @@
 #define GRU_CBE_BASE		(GRU_MCS_BASE + 0x10000)
 #define GRU_TFH_BASE		(GRU_MCS_BASE + 0x18000)
 #define GRU_CCH_BASE		(GRU_MCS_BASE + 0x20000)
-#define GRU_GSH_BASE		(GRU_MCS_BASE + 0x30000)
 
 /* User gseg constants */
 #define GRU_GSEG_STRIDE		(4 * 1024 * 1024)
@@ -251,15 +249,15 @@ struct gru_tlb_fault_handle {
 	unsigned int fill1:9;
 
 	unsigned int status:2;
-	unsigned int fill2:1;
-	unsigned int color:1;
+	unsigned int fill2:2;
 	unsigned int state:3;
 	unsigned int fill3:1;
 
-	unsigned int cause:7;		/* DW 0 - high 32 */
+	unsigned int cause:6;
+	unsigned int cb_int:1;
 	unsigned int fill4:1;
 
-	unsigned int indexway:12;
+	unsigned int indexway:12;	/* DW 0 - high 32 */
 	unsigned int fill5:4;
 
 	unsigned int ctxnum:4;
@@ -457,21 +455,7 @@ enum gru_cbr_state {
 	CBRSTATE_BUSY_INTERRUPT,
 };
 
-/* CBE cbrexecstatus bits */
-#define CBR_EXS_ABORT_OCC_BIT			0
-#define CBR_EXS_INT_OCC_BIT			1
-#define CBR_EXS_PENDING_BIT			2
-#define CBR_EXS_QUEUED_BIT			3
-#define CBR_EXS_TLBHW_BIT			4
-#define CBR_EXS_EXCEPTION_BIT			5
-
-#define CBR_EXS_ABORT_OCC			(1 << CBR_EXS_ABORT_OCC_BIT)
-#define CBR_EXS_INT_OCC				(1 << CBR_EXS_INT_OCC_BIT)
-#define CBR_EXS_PENDING				(1 << CBR_EXS_PENDING_BIT)
-#define CBR_EXS_QUEUED				(1 << CBR_EXS_QUEUED_BIT)
-#define CBR_EXS_TLBHW				(1 << CBR_EXS_TLBHW_BIT)
-#define CBR_EXS_EXCEPTION			(1 << CBR_EXS_EXCEPTION_BIT)
-
+/* CBE cbrexecstatus bits  - defined in gru_instructions.h*/
 /* CBE ecause bits  - defined in gru_instructions.h */
 
 /*
@@ -495,9 +479,7 @@ enum gru_cbr_state {
 /* minimum TLB purge count to ensure a full purge */
 #define GRUMAXINVAL		1024UL
 
-int cch_allocate(struct gru_context_configuration_handle *cch,
-       int asidval, int sizeavail, unsigned long cbrmap, unsigned long dsrmap);
-
+int cch_allocate(struct gru_context_configuration_handle *cch);
 int cch_start(struct gru_context_configuration_handle *cch);
 int cch_interrupt(struct gru_context_configuration_handle *cch);
 int cch_deallocate(struct gru_context_configuration_handle *cch);
diff --git a/drivers/misc/sgi-gru/grukdump.c b/drivers/misc/sgi-gru/grukdump.c
new file mode 100644
index 00000000000..55eabfa8558
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukdump.c
@@ -0,0 +1,232 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *            Dump GRU State
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <asm/uv/uv_hub.h>
+#include "gru.h"
+#include "grutables.h"
+#include "gruhandles.h"
+#include "grulib.h"
+
+#define CCH_LOCK_ATTEMPTS	10
+
+static int gru_user_copy_handle(void __user **dp, void *s)
+{
+	if (copy_to_user(*dp, s, GRU_HANDLE_BYTES))
+		return -1;
+	*dp += GRU_HANDLE_BYTES;
+	return 0;
+}
+
+static int gru_dump_context_data(void *grubase,
+			struct gru_context_configuration_handle *cch,
+			void __user *ubuf, int ctxnum, int dsrcnt)
+{
+	void *cb, *cbe, *tfh, *gseg;
+	int i, scr;
+
+	gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+	cb = gseg + GRU_CB_BASE;
+	cbe = grubase + GRU_CBE_BASE;
+	tfh = grubase + GRU_TFH_BASE;
+
+	for_each_cbr_in_allocation_map(i, &cch->cbr_allocation_map, scr) {
+		if (gru_user_copy_handle(&ubuf, cb))
+			goto fail;
+		if (gru_user_copy_handle(&ubuf, tfh + i * GRU_HANDLE_STRIDE))
+			goto fail;
+		if (gru_user_copy_handle(&ubuf, cbe + i * GRU_HANDLE_STRIDE))
+			goto fail;
+		cb += GRU_HANDLE_STRIDE;
+	}
+	if (dsrcnt)
+		memcpy(ubuf, gseg + GRU_DS_BASE, dsrcnt * GRU_HANDLE_STRIDE);
+	return 0;
+
+fail:
+	return -EFAULT;
+}
+
+static int gru_dump_tfm(struct gru_state *gru,
+		void __user *ubuf, void __user *ubufend)
+{
+	struct gru_tlb_fault_map *tfm;
+	int i, ret, bytes;
+
+	bytes = GRU_NUM_TFM * GRU_CACHE_LINE_BYTES;
+	if (bytes > ubufend - ubuf)
+		ret = -EFBIG;
+
+	for (i = 0; i < GRU_NUM_TFM; i++) {
+		tfm = get_tfm(gru->gs_gru_base_vaddr, i);
+		if (gru_user_copy_handle(&ubuf, tfm))
+			goto fail;
+	}
+	return GRU_NUM_TFM * GRU_CACHE_LINE_BYTES;
+
+fail:
+	return -EFAULT;
+}
+
+static int gru_dump_tgh(struct gru_state *gru,
+		void __user *ubuf, void __user *ubufend)
+{
+	struct gru_tlb_global_handle *tgh;
+	int i, ret, bytes;
+
+	bytes = GRU_NUM_TGH * GRU_CACHE_LINE_BYTES;
+	if (bytes > ubufend - ubuf)
+		ret = -EFBIG;
+
+	for (i = 0; i < GRU_NUM_TGH; i++) {
+		tgh = get_tgh(gru->gs_gru_base_vaddr, i);
+		if (gru_user_copy_handle(&ubuf, tgh))
+			goto fail;
+	}
+	return GRU_NUM_TGH * GRU_CACHE_LINE_BYTES;
+
+fail:
+	return -EFAULT;
+}
+
+static int gru_dump_context(struct gru_state *gru, int ctxnum,
+		void __user *ubuf, void __user *ubufend, char data_opt,
+		char lock_cch)
+{
+	struct gru_dump_context_header hdr;
+	struct gru_dump_context_header __user *uhdr = ubuf;
+	struct gru_context_configuration_handle *cch, *ubufcch;
+	struct gru_thread_state *gts;
+	int try, cch_locked, cbrcnt = 0, dsrcnt = 0, bytes = 0, ret = 0;
+	void *grubase;
+
+	memset(&hdr, 0, sizeof(hdr));
+	grubase = gru->gs_gru_base_vaddr;
+	cch = get_cch(grubase, ctxnum);
+	for (try = 0; try < CCH_LOCK_ATTEMPTS; try++) {
+		cch_locked =  trylock_cch_handle(cch);
+		if (cch_locked)
+			break;
+		msleep(1);
+	}
+
+	ubuf += sizeof(hdr);
+	ubufcch = ubuf;
+	if (gru_user_copy_handle(&ubuf, cch))
+		goto fail;
+	if (cch_locked)
+		ubufcch->delresp = 0;
+	bytes = sizeof(hdr) + GRU_CACHE_LINE_BYTES;
+
+	if (cch_locked || !lock_cch) {
+		gts = gru->gs_gts[ctxnum];
+		if (gts && gts->ts_vma) {
+			hdr.pid = gts->ts_tgid_owner;
+			hdr.vaddr = gts->ts_vma->vm_start;
+		}
+		if (cch->state != CCHSTATE_INACTIVE) {
+			cbrcnt = hweight64(cch->cbr_allocation_map) *
+						GRU_CBR_AU_SIZE;
+			dsrcnt = data_opt ? hweight32(cch->dsr_allocation_map) *
+						GRU_DSR_AU_CL : 0;
+		}
+		bytes += (3 * cbrcnt + dsrcnt) * GRU_CACHE_LINE_BYTES;
+		if (bytes > ubufend - ubuf)
+			ret = -EFBIG;
+		else
+			ret = gru_dump_context_data(grubase, cch, ubuf, ctxnum,
+							dsrcnt);
+
+	}
+	if (cch_locked)
+		unlock_cch_handle(cch);
+	if (ret)
+		return ret;
+
+	hdr.magic = GRU_DUMP_MAGIC;
+	hdr.gid = gru->gs_gid;
+	hdr.ctxnum = ctxnum;
+	hdr.cbrcnt = cbrcnt;
+	hdr.dsrcnt = dsrcnt;
+	hdr.cch_locked = cch_locked;
+	if (!ret && copy_to_user((void __user *)uhdr, &hdr, sizeof(hdr)))
+		ret = -EFAULT;
+
+	return ret ? ret : bytes;
+
+fail:
+	unlock_cch_handle(cch);
+	return -EFAULT;
+}
+
+int gru_dump_chiplet_request(unsigned long arg)
+{
+	struct gru_state *gru;
+	struct gru_dump_chiplet_state_req req;
+	void __user *ubuf;
+	void __user *ubufend;
+	int ctxnum, ret, cnt = 0;
+
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	/* Currently, only dump by gid is implemented */
+	if (req.gid >= gru_max_gids || req.gid < 0)
+		return -EINVAL;
+
+	gru = GID_TO_GRU(req.gid);
+	ubuf = req.buf;
+	ubufend = req.buf + req.buflen;
+
+	ret = gru_dump_tfm(gru, ubuf, ubufend);
+	if (ret < 0)
+		goto fail;
+	ubuf += ret;
+
+	ret = gru_dump_tgh(gru, ubuf, ubufend);
+	if (ret < 0)
+		goto fail;
+	ubuf += ret;
+
+	for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++) {
+		if (req.ctxnum == ctxnum || req.ctxnum < 0) {
+			ret = gru_dump_context(gru, ctxnum, ubuf, ubufend,
+						req.data_opt, req.lock_cch);
+			if (ret < 0)
+				goto fail;
+			ubuf += ret;
+			cnt++;
+		}
+	}
+
+	if (copy_to_user((void __user *)arg, &req, sizeof(req)))
+		return -EFAULT;
+	return cnt;
+
+fail:
+	return ret;
+}
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index d8bd7d84a7c..eedbf9c3276 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -31,6 +31,7 @@
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
 #include <linux/uaccess.h>
+#include <linux/delay.h>
 #include "gru.h"
 #include "grulib.h"
 #include "grutables.h"
@@ -45,18 +46,66 @@
  * resources. This will likely be replaced when we better understand the
  * kernel/user requirements.
  *
- * At boot time, the kernel permanently reserves a fixed number of
- * CBRs/DSRs for each cpu to use. The resources are all taken from
- * the GRU chiplet 1 on the blade. This leaves the full set of resources
- * of chiplet 0 available to be allocated to a single user.
+ * Blade percpu resources reserved for kernel use. These resources are
+ * reserved whenever the the kernel context for the blade is loaded. Note
+ * that the kernel context is not guaranteed to be always available. It is
+ * loaded on demand & can be stolen by a user if the user demand exceeds the
+ * kernel demand. The kernel can always reload the kernel context but
+ * a SLEEP may be required!!!.
+ *
+ * Async Overview:
+ *
+ * 	Each blade has one "kernel context" that owns GRU kernel resources
+ * 	located on the blade. Kernel drivers use GRU resources in this context
+ * 	for sending messages, zeroing memory, etc.
+ *
+ * 	The kernel context is dynamically loaded on demand. If it is not in
+ * 	use by the kernel, the kernel context can be unloaded & given to a user.
+ * 	The kernel context will be reloaded when needed. This may require that
+ * 	a context be stolen from a user.
+ * 		NOTE: frequent unloading/reloading of the kernel context is
+ * 		expensive. We are depending on batch schedulers, cpusets, sane
+ * 		drivers or some other mechanism to prevent the need for frequent
+ *	 	stealing/reloading.
+ *
+ * 	The kernel context consists of two parts:
+ * 		- 1 CB & a few DSRs that are reserved for each cpu on the blade.
+ * 		  Each cpu has it's own private resources & does not share them
+ * 		  with other cpus. These resources are used serially, ie,
+ * 		  locked, used & unlocked  on each call to a function in
+ * 		  grukservices.
+ * 		  	(Now that we have dynamic loading of kernel contexts, I
+ * 		  	 may rethink this & allow sharing between cpus....)
+ *
+ *		- Additional resources can be reserved long term & used directly
+ *		  by UV drivers located in the kernel. Drivers using these GRU
+ *		  resources can use asynchronous GRU instructions that send
+ *		  interrupts on completion.
+ *		  	- these resources must be explicitly locked/unlocked
+ *		  	- locked resources prevent (obviously) the kernel
+ *		  	  context from being unloaded.
+ *			- drivers using these resource directly issue their own
+ *			  GRU instruction and must wait/check completion.
+ *
+ * 		  When these resources are reserved, the caller can optionally
+ * 		  associate a wait_queue with the resources and use asynchronous
+ * 		  GRU instructions. When an async GRU instruction completes, the
+ * 		  driver will do a wakeup on the event.
+ *
  */
 
-/* Blade percpu resources PERMANENTLY reserved for kernel use */
+
+#define ASYNC_HAN_TO_BID(h)	((h) - 1)
+#define ASYNC_BID_TO_HAN(b)	((b) + 1)
+#define ASYNC_HAN_TO_BS(h)	gru_base[ASYNC_HAN_TO_BID(h)]
+#define KCB_TO_GID(cb)		((cb - gru_start_vaddr) /		\
+					(GRU_SIZE * GRU_CHIPLETS_PER_BLADE))
+#define KCB_TO_BS(cb)		gru_base[KCB_TO_GID(cb)]
+
 #define GRU_NUM_KERNEL_CBR	1
 #define GRU_NUM_KERNEL_DSR_BYTES 256
 #define GRU_NUM_KERNEL_DSR_CL	(GRU_NUM_KERNEL_DSR_BYTES /		\
 					GRU_CACHE_LINE_BYTES)
-#define KERNEL_CTXNUM           15
 
 /* GRU instruction attributes for all instructions */
 #define IMA			IMA_CB_DELAY
@@ -98,6 +147,108 @@ struct message_header {
 
 #define HSTATUS(mq, h)	((mq) + offsetof(struct message_queue, hstatus[h]))
 
+/*
+ * Reload the blade's kernel context into a GRU chiplet. Called holding
+ * the bs_kgts_sema for READ. Will steal user contexts if necessary.
+ */
+static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
+{
+	struct gru_state *gru;
+	struct gru_thread_state *kgts;
+	void *vaddr;
+	int ctxnum, ncpus;
+
+	up_read(&bs->bs_kgts_sema);
+	down_write(&bs->bs_kgts_sema);
+
+	if (!bs->bs_kgts)
+		bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0);
+	kgts = bs->bs_kgts;
+
+	if (!kgts->ts_gru) {
+		STAT(load_kernel_context);
+		ncpus = uv_blade_nr_possible_cpus(blade_id);
+		kgts->ts_cbr_au_count = GRU_CB_COUNT_TO_AU(
+			GRU_NUM_KERNEL_CBR * ncpus + bs->bs_async_cbrs);
+		kgts->ts_dsr_au_count = GRU_DS_BYTES_TO_AU(
+			GRU_NUM_KERNEL_DSR_BYTES * ncpus +
+				bs->bs_async_dsr_bytes);
+		while (!gru_assign_gru_context(kgts, blade_id)) {
+			msleep(1);
+			gru_steal_context(kgts, blade_id);
+		}
+		gru_load_context(kgts);
+		gru = bs->bs_kgts->ts_gru;
+		vaddr = gru->gs_gru_base_vaddr;
+		ctxnum = kgts->ts_ctxnum;
+		bs->kernel_cb = get_gseg_base_address_cb(vaddr, ctxnum, 0);
+		bs->kernel_dsr = get_gseg_base_address_ds(vaddr, ctxnum, 0);
+	}
+	downgrade_write(&bs->bs_kgts_sema);
+}
+
+/*
+ * Free all kernel contexts that are not currently in use.
+ *   Returns 0 if all freed, else number of inuse context.
+ */
+static int gru_free_kernel_contexts(void)
+{
+	struct gru_blade_state *bs;
+	struct gru_thread_state *kgts;
+	int bid, ret = 0;
+
+	for (bid = 0; bid < GRU_MAX_BLADES; bid++) {
+		bs = gru_base[bid];
+		if (!bs)
+			continue;
+		if (down_write_trylock(&bs->bs_kgts_sema)) {
+			kgts = bs->bs_kgts;
+			if (kgts && kgts->ts_gru)
+				gru_unload_context(kgts, 0);
+			kfree(kgts);
+			bs->bs_kgts = NULL;
+			up_write(&bs->bs_kgts_sema);
+		} else {
+			ret++;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Lock & load the kernel context for the specified blade.
+ */
+static struct gru_blade_state *gru_lock_kernel_context(int blade_id)
+{
+	struct gru_blade_state *bs;
+
+	STAT(lock_kernel_context);
+	bs = gru_base[blade_id];
+
+	down_read(&bs->bs_kgts_sema);
+	if (!bs->bs_kgts || !bs->bs_kgts->ts_gru)
+		gru_load_kernel_context(bs, blade_id);
+	return bs;
+
+}
+
+/*
+ * Unlock the kernel context for the specified blade. Context is not
+ * unloaded but may be stolen before next use.
+ */
+static void gru_unlock_kernel_context(int blade_id)
+{
+	struct gru_blade_state *bs;
+
+	bs = gru_base[blade_id];
+	up_read(&bs->bs_kgts_sema);
+	STAT(unlock_kernel_context);
+}
+
+/*
+ * Reserve & get pointers to the DSR/CBRs reserved for the current cpu.
+ * 	- returns with preemption disabled
+ */
 static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
 {
 	struct gru_blade_state *bs;
@@ -105,30 +256,148 @@ static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
 
 	BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
 	preempt_disable();
-	bs = gru_base[uv_numa_blade_id()];
+	bs = gru_lock_kernel_context(uv_numa_blade_id());
 	lcpu = uv_blade_processor_id();
 	*cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
 	*dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
 	return 0;
 }
 
+/*
+ * Free the current cpus reserved DSR/CBR resources.
+ */
 static void gru_free_cpu_resources(void *cb, void *dsr)
 {
+	gru_unlock_kernel_context(uv_numa_blade_id());
 	preempt_enable();
 }
 
+/*
+ * Reserve GRU resources to be used asynchronously.
+ *   Note: currently supports only 1 reservation per blade.
+ *
+ * 	input:
+ * 		blade_id  - blade on which resources should be reserved
+ * 		cbrs	  - number of CBRs
+ * 		dsr_bytes - number of DSR bytes needed
+ *	output:
+ *		handle to identify resource
+ *		(0 = async resources already reserved)
+ */
+unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
+			struct completion *cmp)
+{
+	struct gru_blade_state *bs;
+	struct gru_thread_state *kgts;
+	int ret = 0;
+
+	bs = gru_base[blade_id];
+
+	down_write(&bs->bs_kgts_sema);
+
+	/* Verify no resources already reserved */
+	if (bs->bs_async_dsr_bytes + bs->bs_async_cbrs)
+		goto done;
+	bs->bs_async_dsr_bytes = dsr_bytes;
+	bs->bs_async_cbrs = cbrs;
+	bs->bs_async_wq = cmp;
+	kgts = bs->bs_kgts;
+
+	/* Resources changed. Unload context if already loaded */
+	if (kgts && kgts->ts_gru)
+		gru_unload_context(kgts, 0);
+	ret = ASYNC_BID_TO_HAN(blade_id);
+
+done:
+	up_write(&bs->bs_kgts_sema);
+	return ret;
+}
+
+/*
+ * Release async resources previously reserved.
+ *
+ *	input:
+ *		han - handle to identify resources
+ */
+void gru_release_async_resources(unsigned long han)
+{
+	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
+
+	down_write(&bs->bs_kgts_sema);
+	bs->bs_async_dsr_bytes = 0;
+	bs->bs_async_cbrs = 0;
+	bs->bs_async_wq = NULL;
+	up_write(&bs->bs_kgts_sema);
+}
+
+/*
+ * Wait for async GRU instructions to complete.
+ *
+ *	input:
+ *		han - handle to identify resources
+ */
+void gru_wait_async_cbr(unsigned long han)
+{
+	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
+
+	wait_for_completion(bs->bs_async_wq);
+	mb();
+}
+
+/*
+ * Lock previous reserved async GRU resources
+ *
+ *	input:
+ *		han - handle to identify resources
+ *	output:
+ *		cb  - pointer to first CBR
+ *		dsr - pointer to first DSR
+ */
+void gru_lock_async_resource(unsigned long han,  void **cb, void **dsr)
+{
+	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
+	int blade_id = ASYNC_HAN_TO_BID(han);
+	int ncpus;
+
+	gru_lock_kernel_context(blade_id);
+	ncpus = uv_blade_nr_possible_cpus(blade_id);
+	if (cb)
+		*cb = bs->kernel_cb + ncpus * GRU_HANDLE_STRIDE;
+	if (dsr)
+		*dsr = bs->kernel_dsr + ncpus * GRU_NUM_KERNEL_DSR_BYTES;
+}
+
+/*
+ * Unlock previous reserved async GRU resources
+ *
+ *	input:
+ *		han - handle to identify resources
+ */
+void gru_unlock_async_resource(unsigned long han)
+{
+	int blade_id = ASYNC_HAN_TO_BID(han);
+
+	gru_unlock_kernel_context(blade_id);
+}
+
+/*----------------------------------------------------------------------*/
 int gru_get_cb_exception_detail(void *cb,
 		struct control_block_extended_exc_detail *excdet)
 {
 	struct gru_control_block_extended *cbe;
+	struct gru_blade_state *bs;
+	int cbrnum;
 
-	cbe = get_cbe(GRUBASE(cb), get_cb_number(cb));
-	prefetchw(cbe);	/* Harmless on hardware, required for emulator */
+	bs = KCB_TO_BS(cb);
+	cbrnum = thread_cbr_number(bs->bs_kgts, get_cb_number(cb));
+	cbe = get_cbe(GRUBASE(cb), cbrnum);
+	gru_flush_cache(cbe);	/* CBE not coherent */
 	excdet->opc = cbe->opccpy;
 	excdet->exopc = cbe->exopccpy;
 	excdet->ecause = cbe->ecause;
 	excdet->exceptdet0 = cbe->idef1upd;
 	excdet->exceptdet1 = cbe->idef3upd;
+	gru_flush_cache(cbe);
 	return 0;
 }
 
@@ -167,13 +436,13 @@ static int gru_retry_exception(void *cb)
 	int retry = EXCEPTION_RETRY_LIMIT;
 
 	while (1)  {
-		if (gru_get_cb_message_queue_substatus(cb))
-			break;
 		if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
 			return CBS_IDLE;
-
+		if (gru_get_cb_message_queue_substatus(cb))
+			return CBS_EXCEPTION;
 		gru_get_cb_exception_detail(cb, &excdet);
-		if (excdet.ecause & ~EXCEPTION_RETRY_BITS)
+		if ((excdet.ecause & ~EXCEPTION_RETRY_BITS) ||
+				(excdet.cbrexecstatus & CBR_EXS_ABORT_OCC))
 			break;
 		if (retry-- == 0)
 			break;
@@ -416,6 +685,29 @@ static void send_message_queue_interrupt(struct gru_message_queue_desc *mqd)
 				mqd->interrupt_vector);
 }
 
+/*
+ * Handle a PUT failure. Note: if message was a 2-line message, one of the
+ * lines might have successfully have been written. Before sending the
+ * message, "present" must be cleared in BOTH lines to prevent the receiver
+ * from prematurely seeing the full message.
+ */
+static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd,
+			void *mesg, int lines)
+{
+	unsigned long m;
+
+	m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
+	if (lines == 2) {
+		gru_vset(cb, m, 0, XTYPE_CL, lines, 1, IMA);
+		if (gru_wait(cb) != CBS_IDLE)
+			return MQE_UNEXPECTED_CB_ERR;
+	}
+	gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
+	if (gru_wait(cb) != CBS_IDLE)
+		return MQE_UNEXPECTED_CB_ERR;
+	send_message_queue_interrupt(mqd);
+	return MQE_OK;
+}
 
 /*
  * Handle a gru_mesq failure. Some of these failures are software recoverable
@@ -425,7 +717,6 @@ static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
 				void *mesg, int lines)
 {
 	int substatus, ret = 0;
-	unsigned long m;
 
 	substatus = gru_get_cb_message_queue_substatus(cb);
 	switch (substatus) {
@@ -447,14 +738,7 @@ static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
 		break;
 	case CBSS_PUT_NACKED:
 		STAT(mesq_send_put_nacked);
-		m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
-		gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
-		if (gru_wait(cb) == CBS_IDLE) {
-			ret = MQE_OK;
-			send_message_queue_interrupt(mqd);
-		} else {
-			ret = MQE_UNEXPECTED_CB_ERR;
-		}
+		ret = send_message_put_nacked(cb, mqd, mesg, lines);
 		break;
 	default:
 		BUG();
@@ -597,115 +881,177 @@ EXPORT_SYMBOL_GPL(gru_copy_gpa);
 
 /* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
 /* 	Temp - will delete after we gain confidence in the GRU		*/
-static __cacheline_aligned unsigned long word0;
-static __cacheline_aligned unsigned long word1;
 
-static int quicktest(struct gru_state *gru)
+static int quicktest0(unsigned long arg)
 {
+	unsigned long word0;
+	unsigned long word1;
 	void *cb;
-	void *ds;
+	void *dsr;
 	unsigned long *p;
+	int ret = -EIO;
 
-	cb = get_gseg_base_address_cb(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
-	ds = get_gseg_base_address_ds(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
-	p = ds;
+	if (gru_get_cpu_resources(GRU_CACHE_LINE_BYTES, &cb, &dsr))
+		return MQE_BUG_NO_RESOURCES;
+	p = dsr;
 	word0 = MAGIC;
+	word1 = 0;
 
-	gru_vload(cb, uv_gpa(&word0), 0, XTYPE_DW, 1, 1, IMA);
-	if (gru_wait(cb) != CBS_IDLE)
-		BUG();
+	gru_vload(cb, uv_gpa(&word0), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
+	if (gru_wait(cb) != CBS_IDLE) {
+		printk(KERN_DEBUG "GRU quicktest0: CBR failure 1\n");
+		goto done;
+	}
 
-	if (*(unsigned long *)ds != MAGIC)
-		BUG();
-	gru_vstore(cb, uv_gpa(&word1), 0, XTYPE_DW, 1, 1, IMA);
-	if (gru_wait(cb) != CBS_IDLE)
-		BUG();
+	if (*p != MAGIC) {
+		printk(KERN_DEBUG "GRU: quicktest0 bad magic 0x%lx\n", *p);
+		goto done;
+	}
+	gru_vstore(cb, uv_gpa(&word1), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
+	if (gru_wait(cb) != CBS_IDLE) {
+		printk(KERN_DEBUG "GRU quicktest0: CBR failure 2\n");
+		goto done;
+	}
 
-	if (word0 != word1 || word0 != MAGIC) {
-		printk
-		    ("GRU quicktest err: gid %d, found 0x%lx, expected 0x%lx\n",
-		     gru->gs_gid, word1, MAGIC);
-		BUG();		/* ZZZ should not be fatal */
+	if (word0 != word1 || word1 != MAGIC) {
+		printk(KERN_DEBUG
+		       "GRU quicktest0 err: found 0x%lx, expected 0x%lx\n",
+		     word1, MAGIC);
+		goto done;
 	}
+	ret = 0;
 
-	return 0;
+done:
+	gru_free_cpu_resources(cb, dsr);
+	return ret;
 }
 
+#define ALIGNUP(p, q)	((void *)(((unsigned long)(p) + (q) - 1) & ~(q - 1)))
 
-int gru_kservices_init(struct gru_state *gru)
+static int quicktest1(unsigned long arg)
 {
-	struct gru_blade_state *bs;
-	struct gru_context_configuration_handle *cch;
-	unsigned long cbr_map, dsr_map;
-	int err, num, cpus_possible;
-
-	/*
-	 * Currently, resources are reserved ONLY on the second chiplet
-	 * on each blade. This leaves ALL resources on chiplet 0 available
-	 * for user code.
-	 */
-	bs = gru->gs_blade;
-	if (gru != &bs->bs_grus[1])
-		return 0;
-
-	cpus_possible = uv_blade_nr_possible_cpus(gru->gs_blade_id);
-
-	num = GRU_NUM_KERNEL_CBR * cpus_possible;
-	cbr_map = gru_reserve_cb_resources(gru, GRU_CB_COUNT_TO_AU(num), NULL);
-	gru->gs_reserved_cbrs += num;
-
-	num = GRU_NUM_KERNEL_DSR_BYTES * cpus_possible;
-	dsr_map = gru_reserve_ds_resources(gru, GRU_DS_BYTES_TO_AU(num), NULL);
-	gru->gs_reserved_dsr_bytes += num;
-
-	gru->gs_active_contexts++;
-	__set_bit(KERNEL_CTXNUM, &gru->gs_context_map);
-	cch = get_cch(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
-
-	bs->kernel_cb = get_gseg_base_address_cb(gru->gs_gru_base_vaddr,
-					KERNEL_CTXNUM, 0);
-	bs->kernel_dsr = get_gseg_base_address_ds(gru->gs_gru_base_vaddr,
-					KERNEL_CTXNUM, 0);
-
-	lock_cch_handle(cch);
-	cch->tfm_fault_bit_enable = 0;
-	cch->tlb_int_enable = 0;
-	cch->tfm_done_bit_enable = 0;
-	cch->unmap_enable = 1;
-	err = cch_allocate(cch, 0, 0, cbr_map, dsr_map);
-	if (err) {
-		gru_dbg(grudev,
-			"Unable to allocate kernel CCH: gid %d, err %d\n",
-			gru->gs_gid, err);
-		BUG();
+	struct gru_message_queue_desc mqd;
+	void *p, *mq;
+	unsigned long *dw;
+	int i, ret = -EIO;
+	char mes[GRU_CACHE_LINE_BYTES], *m;
+
+	/* Need  1K cacheline aligned that does not cross page boundary */
+	p = kmalloc(4096, 0);
+	mq = ALIGNUP(p, 1024);
+	memset(mes, 0xee, sizeof(mes));
+	dw = mq;
+
+	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
+	for (i = 0; i < 6; i++) {
+		mes[8] = i;
+		do {
+			ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
+		} while (ret == MQE_CONGESTION);
+		if (ret)
+			break;
 	}
-	if (cch_start(cch)) {
-		gru_dbg(grudev, "Unable to start kernel CCH: gid %d, err %d\n",
-			gru->gs_gid, err);
-		BUG();
+	if (ret != MQE_QUEUE_FULL || i != 4)
+		goto done;
+
+	for (i = 0; i < 6; i++) {
+		m = gru_get_next_message(&mqd);
+		if (!m || m[8] != i)
+			break;
+		gru_free_message(&mqd, m);
 	}
-	unlock_cch_handle(cch);
+	ret = (i == 4) ? 0 : -EIO;
 
-	if (gru_options & GRU_QUICKLOOK)
-		quicktest(gru);
-	return 0;
+done:
+	kfree(p);
+	return ret;
 }
 
-void gru_kservices_exit(struct gru_state *gru)
+static int quicktest2(unsigned long arg)
 {
-	struct gru_context_configuration_handle *cch;
-	struct gru_blade_state *bs;
+	static DECLARE_COMPLETION(cmp);
+	unsigned long han;
+	int blade_id = 0;
+	int numcb = 4;
+	int ret = 0;
+	unsigned long *buf;
+	void *cb0, *cb;
+	int i, k, istatus, bytes;
+
+	bytes = numcb * 4 * 8;
+	buf = kmalloc(bytes, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ret = -EBUSY;
+	han = gru_reserve_async_resources(blade_id, numcb, 0, &cmp);
+	if (!han)
+		goto done;
+
+	gru_lock_async_resource(han, &cb0, NULL);
+	memset(buf, 0xee, bytes);
+	for (i = 0; i < numcb; i++)
+		gru_vset(cb0 + i * GRU_HANDLE_STRIDE, uv_gpa(&buf[i * 4]), 0,
+				XTYPE_DW, 4, 1, IMA_INTERRUPT);
+
+	ret = 0;
+	for (k = 0; k < numcb; k++) {
+		gru_wait_async_cbr(han);
+		for (i = 0; i < numcb; i++) {
+			cb = cb0 + i * GRU_HANDLE_STRIDE;
+			istatus = gru_check_status(cb);
+			if (istatus == CBS_ACTIVE)
+				continue;
+			if (istatus == CBS_EXCEPTION)
+				ret = -EFAULT;
+			else if (buf[i] || buf[i + 1] || buf[i + 2] ||
+					buf[i + 3])
+				ret = -EIO;
+		}
+	}
+	BUG_ON(cmp.done);
 
-	bs = gru->gs_blade;
-	if (gru != &bs->bs_grus[1])
-		return;
+	gru_unlock_async_resource(han);
+	gru_release_async_resources(han);
+done:
+	kfree(buf);
+	return ret;
+}
 
-	cch = get_cch(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
-	lock_cch_handle(cch);
-	if (cch_interrupt_sync(cch))
-		BUG();
-	if (cch_deallocate(cch))
+/*
+ * Debugging only. User hook for various kernel tests
+ * of driver & gru.
+ */
+int gru_ktest(unsigned long arg)
+{
+	int ret = -EINVAL;
+
+	switch (arg & 0xff) {
+	case 0:
+		ret = quicktest0(arg);
+		break;
+	case 1:
+		ret = quicktest1(arg);
+		break;
+	case 2:
+		ret = quicktest2(arg);
+		break;
+	case 99:
+		ret = gru_free_kernel_contexts();
+		break;
+	}
+	return ret;
+
+}
+
+int gru_kservices_init(void)
+{
+	return 0;
+}
+
+void gru_kservices_exit(void)
+{
+	if (gru_free_kernel_contexts())
 		BUG();
-	unlock_cch_handle(cch);
 }
 
diff --git a/drivers/misc/sgi-gru/grukservices.h b/drivers/misc/sgi-gru/grukservices.h
index 747ed315d56..d60d34bca44 100644
--- a/drivers/misc/sgi-gru/grukservices.h
+++ b/drivers/misc/sgi-gru/grukservices.h
@@ -146,4 +146,55 @@ extern void *gru_get_next_message(struct gru_message_queue_desc *mqd);
 extern int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
 							unsigned int bytes);
 
+/*
+ * Reserve GRU resources to be used asynchronously.
+ *
+ * 	input:
+ * 		blade_id  - blade on which resources should be reserved
+ * 		cbrs	  - number of CBRs
+ * 		dsr_bytes - number of DSR bytes needed
+ * 		cmp	  - completion structure for waiting for
+ * 			    async completions
+ *	output:
+ *		handle to identify resource
+ *		(0 = no resources)
+ */
+extern unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
+				struct completion *cmp);
+
+/*
+ * Release async resources previously reserved.
+ *
+ *	input:
+ *		han - handle to identify resources
+ */
+extern void gru_release_async_resources(unsigned long han);
+
+/*
+ * Wait for async GRU instructions to complete.
+ *
+ *	input:
+ *		han - handle to identify resources
+ */
+extern void gru_wait_async_cbr(unsigned long han);
+
+/*
+ * Lock previous reserved async GRU resources
+ *
+ *	input:
+ *		han - handle to identify resources
+ *	output:
+ *		cb  - pointer to first CBR
+ *		dsr - pointer to first DSR
+ */
+extern void gru_lock_async_resource(unsigned long han,  void **cb, void **dsr);
+
+/*
+ * Unlock previous reserved async GRU resources
+ *
+ *	input:
+ *		han - handle to identify resources
+ */
+extern void gru_unlock_async_resource(unsigned long han);
+
 #endif 		/* __GRU_KSERVICES_H_ */
diff --git a/drivers/misc/sgi-gru/grulib.h b/drivers/misc/sgi-gru/grulib.h
index e56e196a699..889bc442a3e 100644
--- a/drivers/misc/sgi-gru/grulib.h
+++ b/drivers/misc/sgi-gru/grulib.h
@@ -32,8 +32,8 @@
 /* Set Number of Request Blocks */
 #define GRU_CREATE_CONTEXT		_IOWR(GRU_IOCTL_NUM, 1, void *)
 
-/* Register task as using the slice */
-#define GRU_SET_TASK_SLICE		_IOWR(GRU_IOCTL_NUM, 5, void *)
+/*  Set Context Options */
+#define GRU_SET_CONTEXT_OPTION		_IOWR(GRU_IOCTL_NUM, 4, void *)
 
 /* Fetch exception detail */
 #define GRU_USER_GET_EXCEPTION_DETAIL	_IOWR(GRU_IOCTL_NUM, 6, void *)
@@ -44,8 +44,11 @@
 /* For user unload context */
 #define GRU_USER_UNLOAD_CONTEXT		_IOWR(GRU_IOCTL_NUM, 9, void *)
 
-/* For fetching GRU chiplet status */
-#define GRU_GET_CHIPLET_STATUS		_IOWR(GRU_IOCTL_NUM, 10, void *)
+/* For dumpping GRU chiplet state */
+#define GRU_DUMP_CHIPLET_STATE		_IOWR(GRU_IOCTL_NUM, 11, void *)
+
+/* For getting gseg statistics */
+#define GRU_GET_GSEG_STATISTICS		_IOWR(GRU_IOCTL_NUM, 12, void *)
 
 /* For user TLB flushing (primarily for tests) */
 #define GRU_USER_FLUSH_TLB		_IOWR(GRU_IOCTL_NUM, 50, void *)
@@ -53,8 +56,26 @@
 /* Get some config options (primarily for tests & emulator) */
 #define GRU_GET_CONFIG_INFO		_IOWR(GRU_IOCTL_NUM, 51, void *)
 
+/* Various kernel self-tests */
+#define GRU_KTEST			_IOWR(GRU_IOCTL_NUM, 52, void *)
+
 #define CONTEXT_WINDOW_BYTES(th)        (GRU_GSEG_PAGESIZE * (th))
 #define THREAD_POINTER(p, th)		(p + GRU_GSEG_PAGESIZE * (th))
+#define GSEG_START(cb)			((void *)((unsigned long)(cb) & ~(GRU_GSEG_PAGESIZE - 1)))
+
+/*
+ * Statictics kept on a per-GTS basis.
+ */
+struct gts_statistics {
+	unsigned long	fmm_tlbdropin;
+	unsigned long	upm_tlbdropin;
+	unsigned long	context_stolen;
+};
+
+struct gru_get_gseg_statistics_req {
+	unsigned long		gseg;
+	struct gts_statistics	stats;
+};
 
 /*
  * Structure used to pass TLB flush parameters to the driver
@@ -75,6 +96,16 @@ struct gru_unload_context_req {
 };
 
 /*
+ * Structure used to set context options
+ */
+enum {sco_gseg_owner, sco_cch_req_slice};
+struct gru_set_context_option_req {
+	unsigned long	gseg;
+	int		op;
+	unsigned long	val1;
+};
+
+/*
  * Structure used to pass TLB flush parameters to the driver
  */
 struct gru_flush_tlb_req {
@@ -84,6 +115,36 @@ struct gru_flush_tlb_req {
 };
 
 /*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+enum {dcs_pid, dcs_gid};
+struct gru_dump_chiplet_state_req {
+	unsigned int	op;
+	unsigned int	gid;
+	int		ctxnum;
+	char		data_opt;
+	char		lock_cch;
+	pid_t		pid;
+	void		*buf;
+	size_t		buflen;
+	/* ---- output --- */
+	unsigned int	num_contexts;
+};
+
+#define GRU_DUMP_MAGIC	0x3474ab6c
+struct gru_dump_context_header {
+	unsigned int	magic;
+	unsigned int	gid;
+	unsigned char	ctxnum;
+	unsigned char	cbrcnt;
+	unsigned char	dsrcnt;
+	pid_t		pid;
+	unsigned long	vaddr;
+	int		cch_locked;
+	unsigned long	data[0];
+};
+
+/*
  * GRU configuration info (temp - for testing)
  */
 struct gru_config_info {
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index ec3f7a17d22..3bc643dad60 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -3,11 +3,21 @@
  *
  *            DRIVER TABLE MANAGER + GRU CONTEXT LOAD/UNLOAD
  *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
  *
- * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
 #include <linux/kernel.h>
@@ -96,7 +106,7 @@ static int gru_reset_asid_limit(struct gru_state *gru, int asid)
 	gid = gru->gs_gid;
 again:
 	for (i = 0; i < GRU_NUM_CCH; i++) {
-		if (!gru->gs_gts[i])
+		if (!gru->gs_gts[i] || is_kernel_context(gru->gs_gts[i]))
 			continue;
 		inuse_asid = gru->gs_gts[i]->ts_gms->ms_asids[gid].mt_asid;
 		gru_dbg(grudev, "gid %d, gts %p, gms %p, inuse 0x%x, cxt %d\n",
@@ -150,7 +160,7 @@ static unsigned long reserve_resources(unsigned long *p, int n, int mmax,
 	unsigned long bits = 0;
 	int i;
 
-	do {
+	while (n--) {
 		i = find_first_bit(p, mmax);
 		if (i == mmax)
 			BUG();
@@ -158,7 +168,7 @@ static unsigned long reserve_resources(unsigned long *p, int n, int mmax,
 		__set_bit(i, &bits);
 		if (idx)
 			*idx++ = i;
-	} while (--n);
+	}
 	return bits;
 }
 
@@ -299,38 +309,39 @@ static struct gru_thread_state *gru_find_current_gts_nolock(struct gru_vma_data
 /*
  * Allocate a thread state structure.
  */
-static struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
-					      struct gru_vma_data *vdata,
-					      int tsid)
+struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
+		int cbr_au_count, int dsr_au_count, int options, int tsid)
 {
 	struct gru_thread_state *gts;
 	int bytes;
 
-	bytes = DSR_BYTES(vdata->vd_dsr_au_count) +
-				CBR_BYTES(vdata->vd_cbr_au_count);
+	bytes = DSR_BYTES(dsr_au_count) + CBR_BYTES(cbr_au_count);
 	bytes += sizeof(struct gru_thread_state);
-	gts = kzalloc(bytes, GFP_KERNEL);
+	gts = kmalloc(bytes, GFP_KERNEL);
 	if (!gts)
 		return NULL;
 
 	STAT(gts_alloc);
+	memset(gts, 0, sizeof(struct gru_thread_state)); /* zero out header */
 	atomic_set(&gts->ts_refcnt, 1);
 	mutex_init(&gts->ts_ctxlock);
-	gts->ts_cbr_au_count = vdata->vd_cbr_au_count;
-	gts->ts_dsr_au_count = vdata->vd_dsr_au_count;
-	gts->ts_user_options = vdata->vd_user_options;
+	gts->ts_cbr_au_count = cbr_au_count;
+	gts->ts_dsr_au_count = dsr_au_count;
+	gts->ts_user_options = options;
 	gts->ts_tsid = tsid;
-	gts->ts_user_options = vdata->vd_user_options;
 	gts->ts_ctxnum = NULLCTX;
-	gts->ts_mm = current->mm;
-	gts->ts_vma = vma;
 	gts->ts_tlb_int_select = -1;
-	gts->ts_gms = gru_register_mmu_notifier();
+	gts->ts_cch_req_slice = -1;
 	gts->ts_sizeavail = GRU_SIZEAVAIL(PAGE_SHIFT);
-	if (!gts->ts_gms)
-		goto err;
+	if (vma) {
+		gts->ts_mm = current->mm;
+		gts->ts_vma = vma;
+		gts->ts_gms = gru_register_mmu_notifier();
+		if (!gts->ts_gms)
+			goto err;
+	}
 
-	gru_dbg(grudev, "alloc vdata %p, new gts %p\n", vdata, gts);
+	gru_dbg(grudev, "alloc gts %p\n", gts);
 	return gts;
 
 err:
@@ -381,7 +392,8 @@ struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct *vma,
 	struct gru_vma_data *vdata = vma->vm_private_data;
 	struct gru_thread_state *gts, *ngts;
 
-	gts = gru_alloc_gts(vma, vdata, tsid);
+	gts = gru_alloc_gts(vma, vdata->vd_cbr_au_count, vdata->vd_dsr_au_count,
+			    vdata->vd_user_options, tsid);
 	if (!gts)
 		return NULL;
 
@@ -458,7 +470,8 @@ static void gru_prefetch_context(void *gseg, void *cb, void *cbe,
 }
 
 static void gru_load_context_data(void *save, void *grubase, int ctxnum,
-				  unsigned long cbrmap, unsigned long dsrmap)
+				  unsigned long cbrmap, unsigned long dsrmap,
+				  int data_valid)
 {
 	void *gseg, *cb, *cbe;
 	unsigned long length;
@@ -471,12 +484,22 @@ static void gru_load_context_data(void *save, void *grubase, int ctxnum,
 	gru_prefetch_context(gseg, cb, cbe, cbrmap, length);
 
 	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
-		save += gru_copy_handle(cb, save);
-		save += gru_copy_handle(cbe + i * GRU_HANDLE_STRIDE, save);
+		if (data_valid) {
+			save += gru_copy_handle(cb, save);
+			save += gru_copy_handle(cbe + i * GRU_HANDLE_STRIDE,
+						save);
+		} else {
+			memset(cb, 0, GRU_CACHE_LINE_BYTES);
+			memset(cbe + i * GRU_HANDLE_STRIDE, 0,
+						GRU_CACHE_LINE_BYTES);
+		}
 		cb += GRU_HANDLE_STRIDE;
 	}
 
-	memcpy(gseg + GRU_DS_BASE, save, length);
+	if (data_valid)
+		memcpy(gseg + GRU_DS_BASE, save, length);
+	else
+		memset(gseg + GRU_DS_BASE, 0, length);
 }
 
 static void gru_unload_context_data(void *save, void *grubase, int ctxnum,
@@ -506,7 +529,8 @@ void gru_unload_context(struct gru_thread_state *gts, int savestate)
 	struct gru_context_configuration_handle *cch;
 	int ctxnum = gts->ts_ctxnum;
 
-	zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE);
+	if (!is_kernel_context(gts))
+		zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE);
 	cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
 
 	gru_dbg(grudev, "gts %p\n", gts);
@@ -514,11 +538,14 @@ void gru_unload_context(struct gru_thread_state *gts, int savestate)
 	if (cch_interrupt_sync(cch))
 		BUG();
 
-	gru_unload_mm_tracker(gru, gts);
-	if (savestate)
+	if (!is_kernel_context(gts))
+		gru_unload_mm_tracker(gru, gts);
+	if (savestate) {
 		gru_unload_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr,
 					ctxnum, gts->ts_cbr_map,
 					gts->ts_dsr_map);
+		gts->ts_data_valid = 1;
+	}
 
 	if (cch_deallocate(cch))
 		BUG();
@@ -526,24 +553,22 @@ void gru_unload_context(struct gru_thread_state *gts, int savestate)
 	unlock_cch_handle(cch);
 
 	gru_free_gru_context(gts);
-	STAT(unload_context);
 }
 
 /*
  * Load a GRU context by copying it from the thread data structure in memory
  * to the GRU.
  */
-static void gru_load_context(struct gru_thread_state *gts)
+void gru_load_context(struct gru_thread_state *gts)
 {
 	struct gru_state *gru = gts->ts_gru;
 	struct gru_context_configuration_handle *cch;
-	int err, asid, ctxnum = gts->ts_ctxnum;
+	int i, err, asid, ctxnum = gts->ts_ctxnum;
 
 	gru_dbg(grudev, "gts %p\n", gts);
 	cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
 
 	lock_cch_handle(cch);
-	asid = gru_load_mm_tracker(gru, gts);
 	cch->tfm_fault_bit_enable =
 	    (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
 	     || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
@@ -552,9 +577,32 @@ static void gru_load_context(struct gru_thread_state *gts)
 		gts->ts_tlb_int_select = gru_cpu_fault_map_id();
 		cch->tlb_int_select = gts->ts_tlb_int_select;
 	}
+	if (gts->ts_cch_req_slice >= 0) {
+		cch->req_slice_set_enable = 1;
+		cch->req_slice = gts->ts_cch_req_slice;
+	} else {
+		cch->req_slice_set_enable =0;
+	}
 	cch->tfm_done_bit_enable = 0;
-	err = cch_allocate(cch, asid, gts->ts_sizeavail, gts->ts_cbr_map,
-				gts->ts_dsr_map);
+	cch->dsr_allocation_map = gts->ts_dsr_map;
+	cch->cbr_allocation_map = gts->ts_cbr_map;
+
+	if (is_kernel_context(gts)) {
+		cch->unmap_enable = 1;
+		cch->tfm_done_bit_enable = 1;
+		cch->cb_int_enable = 1;
+	} else {
+		cch->unmap_enable = 0;
+		cch->tfm_done_bit_enable = 0;
+		cch->cb_int_enable = 0;
+		asid = gru_load_mm_tracker(gru, gts);
+		for (i = 0; i < 8; i++) {
+			cch->asid[i] = asid + i;
+			cch->sizeavail[i] = gts->ts_sizeavail;
+		}
+	}
+
+	err = cch_allocate(cch);
 	if (err) {
 		gru_dbg(grudev,
 			"err %d: cch %p, gts %p, cbr 0x%lx, dsr 0x%lx\n",
@@ -563,13 +611,11 @@ static void gru_load_context(struct gru_thread_state *gts)
 	}
 
 	gru_load_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr, ctxnum,
-			      gts->ts_cbr_map, gts->ts_dsr_map);
+			gts->ts_cbr_map, gts->ts_dsr_map, gts->ts_data_valid);
 
 	if (cch_start(cch))
 		BUG();
 	unlock_cch_handle(cch);
-
-	STAT(load_context);
 }
 
 /*
@@ -599,6 +645,9 @@ int gru_update_cch(struct gru_thread_state *gts, int force_unload)
 				cch->sizeavail[i] = gts->ts_sizeavail;
 			gts->ts_tlb_int_select = gru_cpu_fault_map_id();
 			cch->tlb_int_select = gru_cpu_fault_map_id();
+			cch->tfm_fault_bit_enable =
+			  (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
+			    || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
 		} else {
 			for (i = 0; i < 8; i++)
 				cch->asid[i] = 0;
@@ -642,7 +691,28 @@ static int gru_retarget_intr(struct gru_thread_state *gts)
 #define next_gru(b, g)	(((g) < &(b)->bs_grus[GRU_CHIPLETS_PER_BLADE - 1]) ?  \
 				 ((g)+1) : &(b)->bs_grus[0])
 
-static void gru_steal_context(struct gru_thread_state *gts)
+static int is_gts_stealable(struct gru_thread_state *gts,
+		struct gru_blade_state *bs)
+{
+	if (is_kernel_context(gts))
+		return down_write_trylock(&bs->bs_kgts_sema);
+	else
+		return mutex_trylock(&gts->ts_ctxlock);
+}
+
+static void gts_stolen(struct gru_thread_state *gts,
+		struct gru_blade_state *bs)
+{
+	if (is_kernel_context(gts)) {
+		up_write(&bs->bs_kgts_sema);
+		STAT(steal_kernel_context);
+	} else {
+		mutex_unlock(&gts->ts_ctxlock);
+		STAT(steal_user_context);
+	}
+}
+
+void gru_steal_context(struct gru_thread_state *gts, int blade_id)
 {
 	struct gru_blade_state *blade;
 	struct gru_state *gru, *gru0;
@@ -652,8 +722,7 @@ static void gru_steal_context(struct gru_thread_state *gts)
 	cbr = gts->ts_cbr_au_count;
 	dsr = gts->ts_dsr_au_count;
 
-	preempt_disable();
-	blade = gru_base[uv_numa_blade_id()];
+	blade = gru_base[blade_id];
 	spin_lock(&blade->bs_lock);
 
 	ctxnum = next_ctxnum(blade->bs_lru_ctxnum);
@@ -676,7 +745,7 @@ static void gru_steal_context(struct gru_thread_state *gts)
 			 * success are high. If trylock fails, try to steal a
 			 * different GSEG.
 			 */
-			if (ngts && mutex_trylock(&ngts->ts_ctxlock))
+			if (ngts && is_gts_stealable(ngts, blade))
 				break;
 			ngts = NULL;
 			flag = 1;
@@ -690,13 +759,12 @@ static void gru_steal_context(struct gru_thread_state *gts)
 	blade->bs_lru_gru = gru;
 	blade->bs_lru_ctxnum = ctxnum;
 	spin_unlock(&blade->bs_lock);
-	preempt_enable();
 
 	if (ngts) {
-		STAT(steal_context);
+		gts->ustats.context_stolen++;
 		ngts->ts_steal_jiffies = jiffies;
-		gru_unload_context(ngts, 1);
-		mutex_unlock(&ngts->ts_ctxlock);
+		gru_unload_context(ngts, is_kernel_context(ngts) ? 0 : 1);
+		gts_stolen(ngts, blade);
 	} else {
 		STAT(steal_context_failed);
 	}
@@ -710,17 +778,17 @@ static void gru_steal_context(struct gru_thread_state *gts)
 /*
  * Scan the GRUs on the local blade & assign a GRU context.
  */
-static struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts)
+struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts,
+						int blade)
 {
 	struct gru_state *gru, *grux;
 	int i, max_active_contexts;
 
-	preempt_disable();
 
 again:
 	gru = NULL;
 	max_active_contexts = GRU_NUM_CCH;
-	for_each_gru_on_blade(grux, uv_numa_blade_id(), i) {
+	for_each_gru_on_blade(grux, blade, i) {
 		if (check_gru_resources(grux, gts->ts_cbr_au_count,
 					gts->ts_dsr_au_count,
 					max_active_contexts)) {
@@ -760,7 +828,6 @@ again:
 		STAT(assign_context_failed);
 	}
 
-	preempt_enable();
 	return gru;
 }
 
@@ -775,6 +842,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct gru_thread_state *gts;
 	unsigned long paddr, vaddr;
+	int blade_id;
 
 	vaddr = (unsigned long)vmf->virtual_address;
 	gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
@@ -789,8 +857,10 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 again:
 	mutex_lock(&gts->ts_ctxlock);
 	preempt_disable();
+	blade_id = uv_numa_blade_id();
+
 	if (gts->ts_gru) {
-		if (gts->ts_gru->gs_blade_id != uv_numa_blade_id()) {
+		if (gts->ts_gru->gs_blade_id != blade_id) {
 			STAT(migrated_nopfn_unload);
 			gru_unload_context(gts, 1);
 		} else {
@@ -800,12 +870,15 @@ again:
 	}
 
 	if (!gts->ts_gru) {
-		if (!gru_assign_gru_context(gts)) {
-			mutex_unlock(&gts->ts_ctxlock);
+		STAT(load_user_context);
+		if (!gru_assign_gru_context(gts, blade_id)) {
 			preempt_enable();
+			mutex_unlock(&gts->ts_ctxlock);
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(GRU_ASSIGN_DELAY);  /* true hack ZZZ */
+			blade_id = uv_numa_blade_id();
 			if (gts->ts_steal_jiffies + GRU_STEAL_DELAY < jiffies)
-				gru_steal_context(gts);
+				gru_steal_context(gts, blade_id);
 			goto again;
 		}
 		gru_load_context(gts);
@@ -815,8 +888,8 @@ again:
 				vma->vm_page_prot);
 	}
 
-	mutex_unlock(&gts->ts_ctxlock);
 	preempt_enable();
+	mutex_unlock(&gts->ts_ctxlock);
 
 	return VM_FAULT_NOPAGE;
 }
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
index ee74821b171..9cbf95bedce 100644
--- a/drivers/misc/sgi-gru/gruprocfs.c
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -51,9 +51,12 @@ static int statistics_show(struct seq_file *s, void *p)
 	printstat(s, assign_context);
 	printstat(s, assign_context_failed);
 	printstat(s, free_context);
-	printstat(s, load_context);
-	printstat(s, unload_context);
-	printstat(s, steal_context);
+	printstat(s, load_user_context);
+	printstat(s, load_kernel_context);
+	printstat(s, lock_kernel_context);
+	printstat(s, unlock_kernel_context);
+	printstat(s, steal_user_context);
+	printstat(s, steal_kernel_context);
 	printstat(s, steal_context_failed);
 	printstat(s, nopfn);
 	printstat(s, break_cow);
@@ -70,7 +73,7 @@ static int statistics_show(struct seq_file *s, void *p)
 	printstat(s, user_flush_tlb);
 	printstat(s, user_unload_context);
 	printstat(s, user_exception);
-	printstat(s, set_task_slice);
+	printstat(s, set_context_option);
 	printstat(s, migrate_check);
 	printstat(s, migrated_retarget);
 	printstat(s, migrated_unload);
@@ -84,6 +87,9 @@ static int statistics_show(struct seq_file *s, void *p)
 	printstat(s, tlb_dropin_fail_range_active);
 	printstat(s, tlb_dropin_fail_idle);
 	printstat(s, tlb_dropin_fail_fmm);
+	printstat(s, tlb_dropin_fail_no_exception);
+	printstat(s, tlb_dropin_fail_no_exception_war);
+	printstat(s, tfh_stale_on_fault);
 	printstat(s, mmu_invalidate_range);
 	printstat(s, mmu_invalidate_page);
 	printstat(s, mmu_clear_flush_young);
@@ -158,8 +164,7 @@ static ssize_t options_write(struct file *file, const char __user *userbuf,
 	unsigned long val;
 	char buf[80];
 
-	if (copy_from_user
-	    (buf, userbuf, count < sizeof(buf) ? count : sizeof(buf)))
+	if (strncpy_from_user(buf, userbuf, sizeof(buf) - 1) < 0)
 		return -EFAULT;
 	buf[count - 1] = '\0';
 	if (!strict_strtoul(buf, 10, &val))
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h
index bf1eeb7553e..34ab3d45391 100644
--- a/drivers/misc/sgi-gru/grutables.h
+++ b/drivers/misc/sgi-gru/grutables.h
@@ -148,11 +148,13 @@
 #include <linux/wait.h>
 #include <linux/mmu_notifier.h>
 #include "gru.h"
+#include "grulib.h"
 #include "gruhandles.h"
 
 extern struct gru_stats_s gru_stats;
 extern struct gru_blade_state *gru_base[];
 extern unsigned long gru_start_paddr, gru_end_paddr;
+extern void *gru_start_vaddr;
 extern unsigned int gru_max_gids;
 
 #define GRU_MAX_BLADES		MAX_NUMNODES
@@ -174,9 +176,12 @@ struct gru_stats_s {
 	atomic_long_t assign_context;
 	atomic_long_t assign_context_failed;
 	atomic_long_t free_context;
-	atomic_long_t load_context;
-	atomic_long_t unload_context;
-	atomic_long_t steal_context;
+	atomic_long_t load_user_context;
+	atomic_long_t load_kernel_context;
+	atomic_long_t lock_kernel_context;
+	atomic_long_t unlock_kernel_context;
+	atomic_long_t steal_user_context;
+	atomic_long_t steal_kernel_context;
 	atomic_long_t steal_context_failed;
 	atomic_long_t nopfn;
 	atomic_long_t break_cow;
@@ -193,7 +198,7 @@ struct gru_stats_s {
 	atomic_long_t user_flush_tlb;
 	atomic_long_t user_unload_context;
 	atomic_long_t user_exception;
-	atomic_long_t set_task_slice;
+	atomic_long_t set_context_option;
 	atomic_long_t migrate_check;
 	atomic_long_t migrated_retarget;
 	atomic_long_t migrated_unload;
@@ -207,6 +212,9 @@ struct gru_stats_s {
 	atomic_long_t tlb_dropin_fail_range_active;
 	atomic_long_t tlb_dropin_fail_idle;
 	atomic_long_t tlb_dropin_fail_fmm;
+	atomic_long_t tlb_dropin_fail_no_exception;
+	atomic_long_t tlb_dropin_fail_no_exception_war;
+	atomic_long_t tfh_stale_on_fault;
 	atomic_long_t mmu_invalidate_range;
 	atomic_long_t mmu_invalidate_page;
 	atomic_long_t mmu_clear_flush_young;
@@ -253,7 +261,6 @@ extern struct mcs_op_statistic mcs_op_statistics[mcsop_last];
 
 #define OPT_DPRINT	1
 #define OPT_STATS	2
-#define GRU_QUICKLOOK	4
 
 
 #define IRQ_GRU			110	/* Starting IRQ number for interrupts */
@@ -373,6 +380,7 @@ struct gru_thread_state {
 						   required for contest */
 	unsigned char		ts_cbr_au_count;/* Number of CBR resources
 						   required for contest */
+	char			ts_cch_req_slice;/* CCH packet slice */
 	char			ts_blade;	/* If >= 0, migrate context if
 						   ref from diferent blade */
 	char			ts_force_cch_reload;
@@ -380,6 +388,9 @@ struct gru_thread_state {
 						   after migration */
 	char			ts_cbr_idx[GRU_CBR_AU];/* CBR numbers of each
 							  allocated CB */
+	int			ts_data_valid;	/* Indicates if ts_gdata has
+						   valid data */
+	struct gts_statistics	ustats;		/* User statistics */
 	unsigned long		ts_gdata[0];	/* save area for GRU data (CB,
 						   DS, CBE) */
 };
@@ -452,6 +463,14 @@ struct gru_blade_state {
 							   reserved cb */
 	void			*kernel_dsr;		/* First kernel
 							   reserved DSR */
+	struct rw_semaphore	bs_kgts_sema;		/* lock for kgts */
+	struct gru_thread_state *bs_kgts;		/* GTS for kernel use */
+
+	/* ---- the following are used for managing kernel async GRU CBRs --- */
+	int			bs_async_dsr_bytes;	/* DSRs for async */
+	int			bs_async_cbrs;		/* CBRs AU for async */
+	struct completion	*bs_async_wq;
+
 	/* ---- the following are protected by the bs_lock spinlock ---- */
 	spinlock_t		bs_lock;		/* lock used for
 							   stealing contexts */
@@ -552,6 +571,12 @@ struct gru_blade_state {
 
 /* Lock hierarchy checking enabled only in emulator */
 
+/* 0 = lock failed, 1 = locked */
+static inline int __trylock_handle(void *h)
+{
+	return !test_and_set_bit(1, h);
+}
+
 static inline void __lock_handle(void *h)
 {
 	while (test_and_set_bit(1, h))
@@ -563,6 +588,11 @@ static inline void __unlock_handle(void *h)
 	clear_bit(1, h);
 }
 
+static inline int trylock_cch_handle(struct gru_context_configuration_handle *cch)
+{
+	return __trylock_handle(cch);
+}
+
 static inline void lock_cch_handle(struct gru_context_configuration_handle *cch)
 {
 	__lock_handle(cch);
@@ -584,6 +614,11 @@ static inline void unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
 	__unlock_handle(tgh);
 }
 
+static inline int is_kernel_context(struct gru_thread_state *gts)
+{
+	return !gts->ts_mm;
+}
+
 /*-----------------------------------------------------------------------------
  * Function prototypes & externs
  */
@@ -598,24 +633,32 @@ extern struct gru_thread_state *gru_find_thread_state(struct vm_area_struct
 				*vma, int tsid);
 extern struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct
 				*vma, int tsid);
+extern struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts,
+		int blade);
+extern void gru_load_context(struct gru_thread_state *gts);
+extern void gru_steal_context(struct gru_thread_state *gts, int blade_id);
 extern void gru_unload_context(struct gru_thread_state *gts, int savestate);
 extern int gru_update_cch(struct gru_thread_state *gts, int force_unload);
 extern void gts_drop(struct gru_thread_state *gts);
 extern void gru_tgh_flush_init(struct gru_state *gru);
-extern int gru_kservices_init(struct gru_state *gru);
-extern void gru_kservices_exit(struct gru_state *gru);
+extern int gru_kservices_init(void);
+extern void gru_kservices_exit(void);
+extern int gru_dump_chiplet_request(unsigned long arg);
+extern long gru_get_gseg_statistics(unsigned long arg);
 extern irqreturn_t gru_intr(int irq, void *dev_id);
 extern int gru_handle_user_call_os(unsigned long address);
 extern int gru_user_flush_tlb(unsigned long arg);
 extern int gru_user_unload_context(unsigned long arg);
 extern int gru_get_exception_detail(unsigned long arg);
-extern int gru_set_task_slice(long address);
+extern int gru_set_context_option(unsigned long address);
 extern int gru_cpu_fault_map_id(void);
 extern struct vm_area_struct *gru_find_vma(unsigned long vaddr);
 extern void gru_flush_all_tlb(struct gru_state *gru);
 extern int gru_proc_init(void);
 extern void gru_proc_exit(void);
 
+extern struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
+		int cbr_au_count, int dsr_au_count, int options, int tsid);
 extern unsigned long gru_reserve_cb_resources(struct gru_state *gru,
 		int cbr_au_count, char *cbmap);
 extern unsigned long gru_reserve_ds_resources(struct gru_state *gru,
@@ -624,6 +667,7 @@ extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf);
 extern struct gru_mm_struct *gru_register_mmu_notifier(void);
 extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms);
 
+extern int gru_ktest(unsigned long arg);
 extern void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
 					unsigned long len);
 
diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig
new file mode 100644
index 00000000000..cc2eb8edb51
--- /dev/null
+++ b/drivers/pps/Kconfig
@@ -0,0 +1,33 @@
+#
+# PPS support configuration
+#
+
+menu "PPS support"
+
+config PPS
+	tristate "PPS support"
+	depends on EXPERIMENTAL
+	---help---
+	  PPS (Pulse Per Second) is a special pulse provided by some GPS
+	  antennae. Userland can use it to get a high-precision time
+	  reference.
+
+	  Some antennae's PPS signals are connected with the CD (Carrier
+	  Detect) pin of the serial line they use to communicate with the
+	  host. In this case use the SERIAL_LINE client support.
+
+	  Some antennae's PPS signals are connected with some special host
+	  inputs so you have to enable the corresponding client support.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called pps_core.ko.
+
+config PPS_DEBUG
+	bool "PPS debugging messages"
+	depends on PPS
+	help
+	  Say Y here if you want the PPS support to produce a bunch of debug
+	  messages to the system log.  Select this if you are having a
+	  problem with PPS support and want to see more of what is going on.
+
+endmenu
diff --git a/drivers/pps/Makefile b/drivers/pps/Makefile
new file mode 100644
index 00000000000..19ea582f431
--- /dev/null
+++ b/drivers/pps/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the PPS core.
+#
+
+pps_core-y			:= pps.o kapi.o sysfs.o
+obj-$(CONFIG_PPS)		:= pps_core.o
+
+ccflags-$(CONFIG_PPS_DEBUG) := -DDEBUG
diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c
new file mode 100644
index 00000000000..35a0b192d76
--- /dev/null
+++ b/drivers/pps/kapi.c
@@ -0,0 +1,329 @@
+/*
+ * kernel API
+ *
+ *
+ * Copyright (C) 2005-2009   Rodolfo Giometti <giometti@linux.it>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/pps_kernel.h>
+
+/*
+ * Global variables
+ */
+
+DEFINE_SPINLOCK(pps_idr_lock);
+DEFINE_IDR(pps_idr);
+
+/*
+ * Local functions
+ */
+
+static void pps_add_offset(struct pps_ktime *ts, struct pps_ktime *offset)
+{
+	ts->nsec += offset->nsec;
+	while (ts->nsec >= NSEC_PER_SEC) {
+		ts->nsec -= NSEC_PER_SEC;
+		ts->sec++;
+	}
+	while (ts->nsec < 0) {
+		ts->nsec += NSEC_PER_SEC;
+		ts->sec--;
+	}
+	ts->sec += offset->sec;
+}
+
+/*
+ * Exported functions
+ */
+
+/* pps_get_source - find a PPS source
+ * @source: the PPS source ID.
+ *
+ * This function is used to find an already registered PPS source into the
+ * system.
+ *
+ * The function returns NULL if found nothing, otherwise it returns a pointer
+ * to the PPS source data struct (the refcounter is incremented by 1).
+ */
+
+struct pps_device *pps_get_source(int source)
+{
+	struct pps_device *pps;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pps_idr_lock, flags);
+
+	pps = idr_find(&pps_idr, source);
+	if (pps != NULL)
+		atomic_inc(&pps->usage);
+
+	spin_unlock_irqrestore(&pps_idr_lock, flags);
+
+	return pps;
+}
+
+/* pps_put_source - free the PPS source data
+ * @pps: a pointer to the PPS source.
+ *
+ * This function is used to free a PPS data struct if its refcount is 0.
+ */
+
+void pps_put_source(struct pps_device *pps)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pps_idr_lock, flags);
+	BUG_ON(atomic_read(&pps->usage) == 0);
+
+	if (!atomic_dec_and_test(&pps->usage)) {
+		pps = NULL;
+		goto exit;
+	}
+
+	/* No more reference to the PPS source. We can safely remove the
+	 * PPS data struct.
+	 */
+	idr_remove(&pps_idr, pps->id);
+
+exit:
+	spin_unlock_irqrestore(&pps_idr_lock, flags);
+	kfree(pps);
+}
+
+/* pps_register_source - add a PPS source in the system
+ * @info: the PPS info struct
+ * @default_params: the default PPS parameters of the new source
+ *
+ * This function is used to add a new PPS source in the system. The new
+ * source is described by info's fields and it will have, as default PPS
+ * parameters, the ones specified into default_params.
+ *
+ * The function returns, in case of success, the PPS source ID.
+ */
+
+int pps_register_source(struct pps_source_info *info, int default_params)
+{
+	struct pps_device *pps;
+	int id;
+	int err;
+
+	/* Sanity checks */
+	if ((info->mode & default_params) != default_params) {
+		printk(KERN_ERR "pps: %s: unsupported default parameters\n",
+					info->name);
+		err = -EINVAL;
+		goto pps_register_source_exit;
+	}
+	if ((info->mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) != 0 &&
+			info->echo == NULL) {
+		printk(KERN_ERR "pps: %s: echo function is not defined\n",
+					info->name);
+		err = -EINVAL;
+		goto pps_register_source_exit;
+	}
+	if ((info->mode & (PPS_TSFMT_TSPEC | PPS_TSFMT_NTPFP)) == 0) {
+		printk(KERN_ERR "pps: %s: unspecified time format\n",
+					info->name);
+		err = -EINVAL;
+		goto pps_register_source_exit;
+	}
+
+	/* Allocate memory for the new PPS source struct */
+	pps = kzalloc(sizeof(struct pps_device), GFP_KERNEL);
+	if (pps == NULL) {
+		err = -ENOMEM;
+		goto pps_register_source_exit;
+	}
+
+	/* These initializations must be done before calling idr_get_new()
+	 * in order to avoid reces into pps_event().
+	 */
+	pps->params.api_version = PPS_API_VERS;
+	pps->params.mode = default_params;
+	pps->info = *info;
+
+	init_waitqueue_head(&pps->queue);
+	spin_lock_init(&pps->lock);
+	atomic_set(&pps->usage, 1);
+
+	/* Get new ID for the new PPS source */
+	if (idr_pre_get(&pps_idr, GFP_KERNEL) == 0) {
+		err = -ENOMEM;
+		goto kfree_pps;
+	}
+
+	spin_lock_irq(&pps_idr_lock);
+
+	/* Now really allocate the PPS source.
+	 * After idr_get_new() calling the new source will be freely available
+	 * into the kernel.
+	 */
+	err = idr_get_new(&pps_idr, pps, &id);
+	if (err < 0) {
+		spin_unlock_irq(&pps_idr_lock);
+		goto kfree_pps;
+	}
+
+	id = id & MAX_ID_MASK;
+	if (id >= PPS_MAX_SOURCES) {
+		spin_unlock_irq(&pps_idr_lock);
+
+		printk(KERN_ERR "pps: %s: too many PPS sources in the system\n",
+					info->name);
+		err = -EBUSY;
+		goto free_idr;
+	}
+	pps->id = id;
+
+	spin_unlock_irq(&pps_idr_lock);
+
+	/* Create the char device */
+	err = pps_register_cdev(pps);
+	if (err < 0) {
+		printk(KERN_ERR "pps: %s: unable to create char device\n",
+					info->name);
+		goto free_idr;
+	}
+
+	pr_info("new PPS source %s at ID %d\n", info->name, id);
+
+	return id;
+
+free_idr:
+	spin_lock_irq(&pps_idr_lock);
+	idr_remove(&pps_idr, id);
+	spin_unlock_irq(&pps_idr_lock);
+
+kfree_pps:
+	kfree(pps);
+
+pps_register_source_exit:
+	printk(KERN_ERR "pps: %s: unable to register source\n", info->name);
+
+	return err;
+}
+EXPORT_SYMBOL(pps_register_source);
+
+/* pps_unregister_source - remove a PPS source from the system
+ * @source: the PPS source ID
+ *
+ * This function is used to remove a previously registered PPS source from
+ * the system.
+ */
+
+void pps_unregister_source(int source)
+{
+	struct pps_device *pps;
+
+	spin_lock_irq(&pps_idr_lock);
+	pps = idr_find(&pps_idr, source);
+
+	if (!pps) {
+		BUG();
+		spin_unlock_irq(&pps_idr_lock);
+		return;
+	}
+	spin_unlock_irq(&pps_idr_lock);
+
+	pps_unregister_cdev(pps);
+	pps_put_source(pps);
+}
+EXPORT_SYMBOL(pps_unregister_source);
+
+/* pps_event - register a PPS event into the system
+ * @source: the PPS source ID
+ * @ts: the event timestamp
+ * @event: the event type
+ * @data: userdef pointer
+ *
+ * This function is used by each PPS client in order to register a new
+ * PPS event into the system (it's usually called inside an IRQ handler).
+ *
+ * If an echo function is associated with the PPS source it will be called
+ * as:
+ *	pps->info.echo(source, event, data);
+ */
+
+void pps_event(int source, struct pps_ktime *ts, int event, void *data)
+{
+	struct pps_device *pps;
+	unsigned long flags;
+
+	if ((event & (PPS_CAPTUREASSERT | PPS_CAPTURECLEAR)) == 0) {
+		printk(KERN_ERR "pps: unknown event (%x) for source %d\n",
+			event, source);
+		return;
+	}
+
+	pps = pps_get_source(source);
+	if (!pps)
+		return;
+
+	pr_debug("PPS event on source %d at %llu.%06u\n",
+			pps->id, (unsigned long long) ts->sec, ts->nsec);
+
+	spin_lock_irqsave(&pps->lock, flags);
+
+	/* Must call the echo function? */
+	if ((pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)))
+		pps->info.echo(source, event, data);
+
+	/* Check the event */
+	pps->current_mode = pps->params.mode;
+	if (event & PPS_CAPTUREASSERT) {
+		/* We have to add an offset? */
+		if (pps->params.mode & PPS_OFFSETASSERT)
+			pps_add_offset(ts, &pps->params.assert_off_tu);
+
+		/* Save the time stamp */
+		pps->assert_tu = *ts;
+		pps->assert_sequence++;
+		pr_debug("capture assert seq #%u for source %d\n",
+			pps->assert_sequence, source);
+	}
+	if (event & PPS_CAPTURECLEAR) {
+		/* We have to add an offset? */
+		if (pps->params.mode & PPS_OFFSETCLEAR)
+			pps_add_offset(ts, &pps->params.clear_off_tu);
+
+		/* Save the time stamp */
+		pps->clear_tu = *ts;
+		pps->clear_sequence++;
+		pr_debug("capture clear seq #%u for source %d\n",
+			pps->clear_sequence, source);
+	}
+
+	pps->go = ~0;
+	wake_up_interruptible(&pps->queue);
+
+	kill_fasync(&pps->async_queue, SIGIO, POLL_IN);
+
+	spin_unlock_irqrestore(&pps->lock, flags);
+
+	/* Now we can release the PPS source for (possible) deregistration */
+	pps_put_source(pps);
+}
+EXPORT_SYMBOL(pps_event);
diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c
new file mode 100644
index 00000000000..ac8cc8cea1e
--- /dev/null
+++ b/drivers/pps/pps.c
@@ -0,0 +1,312 @@
+/*
+ * PPS core file
+ *
+ *
+ * Copyright (C) 2005-2009   Rodolfo Giometti <giometti@linux.it>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/idr.h>
+#include <linux/cdev.h>
+#include <linux/poll.h>
+#include <linux/pps_kernel.h>
+
+/*
+ * Local variables
+ */
+
+static dev_t pps_devt;
+static struct class *pps_class;
+
+/*
+ * Char device methods
+ */
+
+static unsigned int pps_cdev_poll(struct file *file, poll_table *wait)
+{
+	struct pps_device *pps = file->private_data;
+
+	poll_wait(file, &pps->queue, wait);
+
+	return POLLIN | POLLRDNORM;
+}
+
+static int pps_cdev_fasync(int fd, struct file *file, int on)
+{
+	struct pps_device *pps = file->private_data;
+	return fasync_helper(fd, file, on, &pps->async_queue);
+}
+
+static long pps_cdev_ioctl(struct file *file,
+		unsigned int cmd, unsigned long arg)
+{
+	struct pps_device *pps = file->private_data;
+	struct pps_kparams params;
+	struct pps_fdata fdata;
+	unsigned long ticks;
+	void __user *uarg = (void __user *) arg;
+	int __user *iuarg = (int __user *) arg;
+	int err;
+
+	switch (cmd) {
+	case PPS_GETPARAMS:
+		pr_debug("PPS_GETPARAMS: source %d\n", pps->id);
+
+		/* Return current parameters */
+		err = copy_to_user(uarg, &pps->params,
+						sizeof(struct pps_kparams));
+		if (err)
+			return -EFAULT;
+
+		break;
+
+	case PPS_SETPARAMS:
+		pr_debug("PPS_SETPARAMS: source %d\n", pps->id);
+
+		/* Check the capabilities */
+		if (!capable(CAP_SYS_TIME))
+			return -EPERM;
+
+		err = copy_from_user(&params, uarg, sizeof(struct pps_kparams));
+		if (err)
+			return -EFAULT;
+		if (!(params.mode & (PPS_CAPTUREASSERT | PPS_CAPTURECLEAR))) {
+			pr_debug("capture mode unspecified (%x)\n",
+								params.mode);
+			return -EINVAL;
+		}
+
+		/* Check for supported capabilities */
+		if ((params.mode & ~pps->info.mode) != 0) {
+			pr_debug("unsupported capabilities (%x)\n",
+								params.mode);
+			return -EINVAL;
+		}
+
+		spin_lock_irq(&pps->lock);
+
+		/* Save the new parameters */
+		pps->params = params;
+
+		/* Restore the read only parameters */
+		if ((params.mode & (PPS_TSFMT_TSPEC | PPS_TSFMT_NTPFP)) == 0) {
+			/* section 3.3 of RFC 2783 interpreted */
+			pr_debug("time format unspecified (%x)\n",
+								params.mode);
+			pps->params.mode |= PPS_TSFMT_TSPEC;
+		}
+		if (pps->info.mode & PPS_CANWAIT)
+			pps->params.mode |= PPS_CANWAIT;
+		pps->params.api_version = PPS_API_VERS;
+
+		spin_unlock_irq(&pps->lock);
+
+		break;
+
+	case PPS_GETCAP:
+		pr_debug("PPS_GETCAP: source %d\n", pps->id);
+
+		err = put_user(pps->info.mode, iuarg);
+		if (err)
+			return -EFAULT;
+
+		break;
+
+	case PPS_FETCH:
+		pr_debug("PPS_FETCH: source %d\n", pps->id);
+
+		err = copy_from_user(&fdata, uarg, sizeof(struct pps_fdata));
+		if (err)
+			return -EFAULT;
+
+		pps->go = 0;
+
+		/* Manage the timeout */
+		if (fdata.timeout.flags & PPS_TIME_INVALID)
+			err = wait_event_interruptible(pps->queue, pps->go);
+		else {
+			pr_debug("timeout %lld.%09d\n",
+					(long long) fdata.timeout.sec,
+					fdata.timeout.nsec);
+			ticks = fdata.timeout.sec * HZ;
+			ticks += fdata.timeout.nsec / (NSEC_PER_SEC / HZ);
+
+			if (ticks != 0) {
+				err = wait_event_interruptible_timeout(
+						pps->queue, pps->go, ticks);
+				if (err == 0)
+					return -ETIMEDOUT;
+			}
+		}
+
+		/* Check for pending signals */
+		if (err == -ERESTARTSYS) {
+			pr_debug("pending signal caught\n");
+			return -EINTR;
+		}
+
+		/* Return the fetched timestamp */
+		spin_lock_irq(&pps->lock);
+
+		fdata.info.assert_sequence = pps->assert_sequence;
+		fdata.info.clear_sequence = pps->clear_sequence;
+		fdata.info.assert_tu = pps->assert_tu;
+		fdata.info.clear_tu = pps->clear_tu;
+		fdata.info.current_mode = pps->current_mode;
+
+		spin_unlock_irq(&pps->lock);
+
+		err = copy_to_user(uarg, &fdata, sizeof(struct pps_fdata));
+		if (err)
+			return -EFAULT;
+
+		break;
+
+	default:
+		return -ENOTTY;
+		break;
+	}
+
+	return 0;
+}
+
+static int pps_cdev_open(struct inode *inode, struct file *file)
+{
+	struct pps_device *pps = container_of(inode->i_cdev,
+						struct pps_device, cdev);
+	int found;
+
+	found = pps_get_source(pps->id) != 0;
+	if (!found)
+		return -ENODEV;
+
+	file->private_data = pps;
+
+	return 0;
+}
+
+static int pps_cdev_release(struct inode *inode, struct file *file)
+{
+	struct pps_device *pps = file->private_data;
+
+	/* Free the PPS source and wake up (possible) deregistration */
+	pps_put_source(pps);
+
+	return 0;
+}
+
+/*
+ * Char device stuff
+ */
+
+static const struct file_operations pps_cdev_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.poll		= pps_cdev_poll,
+	.fasync		= pps_cdev_fasync,
+	.unlocked_ioctl	= pps_cdev_ioctl,
+	.open		= pps_cdev_open,
+	.release	= pps_cdev_release,
+};
+
+int pps_register_cdev(struct pps_device *pps)
+{
+	int err;
+
+	pps->devno = MKDEV(MAJOR(pps_devt), pps->id);
+	cdev_init(&pps->cdev, &pps_cdev_fops);
+	pps->cdev.owner = pps->info.owner;
+
+	err = cdev_add(&pps->cdev, pps->devno, 1);
+	if (err) {
+		printk(KERN_ERR "pps: %s: failed to add char device %d:%d\n",
+				pps->info.name, MAJOR(pps_devt), pps->id);
+		return err;
+	}
+	pps->dev = device_create(pps_class, pps->info.dev, pps->devno, NULL,
+							"pps%d", pps->id);
+	if (err)
+		goto del_cdev;
+	dev_set_drvdata(pps->dev, pps);
+
+	pr_debug("source %s got cdev (%d:%d)\n", pps->info.name,
+			MAJOR(pps_devt), pps->id);
+
+	return 0;
+
+del_cdev:
+	cdev_del(&pps->cdev);
+
+	return err;
+}
+
+void pps_unregister_cdev(struct pps_device *pps)
+{
+	device_destroy(pps_class, pps->devno);
+	cdev_del(&pps->cdev);
+}
+
+/*
+ * Module stuff
+ */
+
+static void __exit pps_exit(void)
+{
+	class_destroy(pps_class);
+	unregister_chrdev_region(pps_devt, PPS_MAX_SOURCES);
+}
+
+static int __init pps_init(void)
+{
+	int err;
+
+	pps_class = class_create(THIS_MODULE, "pps");
+	if (!pps_class) {
+		printk(KERN_ERR "pps: failed to allocate class\n");
+		return -ENOMEM;
+	}
+	pps_class->dev_attrs = pps_attrs;
+
+	err = alloc_chrdev_region(&pps_devt, 0, PPS_MAX_SOURCES, "pps");
+	if (err < 0) {
+		printk(KERN_ERR "pps: failed to allocate char device region\n");
+		goto remove_class;
+	}
+
+	pr_info("LinuxPPS API ver. %d registered\n", PPS_API_VERS);
+	pr_info("Software ver. %s - Copyright 2005-2007 Rodolfo Giometti "
+		"<giometti@linux.it>\n", PPS_VERSION);
+
+	return 0;
+
+remove_class:
+	class_destroy(pps_class);
+
+	return err;
+}
+
+subsys_initcall(pps_init);
+module_exit(pps_exit);
+
+MODULE_AUTHOR("Rodolfo Giometti <giometti@linux.it>");
+MODULE_DESCRIPTION("LinuxPPS support (RFC 2783) - ver. " PPS_VERSION);
+MODULE_LICENSE("GPL");
diff --git a/drivers/pps/sysfs.c b/drivers/pps/sysfs.c
new file mode 100644
index 00000000000..ef0978c71ee
--- /dev/null
+++ b/drivers/pps/sysfs.c
@@ -0,0 +1,98 @@
+/*
+ * PPS sysfs support
+ *
+ *
+ * Copyright (C) 2007-2009   Rodolfo Giometti <giometti@linux.it>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/pps_kernel.h>
+
+/*
+ * Attribute functions
+ */
+
+static ssize_t pps_show_assert(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pps_device *pps = dev_get_drvdata(dev);
+
+	if (!(pps->info.mode & PPS_CAPTUREASSERT))
+		return 0;
+
+	return sprintf(buf, "%lld.%09d#%d\n",
+			(long long) pps->assert_tu.sec, pps->assert_tu.nsec,
+			pps->assert_sequence);
+}
+
+static ssize_t pps_show_clear(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pps_device *pps = dev_get_drvdata(dev);
+
+	if (!(pps->info.mode & PPS_CAPTURECLEAR))
+		return 0;
+
+	return sprintf(buf, "%lld.%09d#%d\n",
+			(long long) pps->clear_tu.sec, pps->clear_tu.nsec,
+			pps->clear_sequence);
+}
+
+static ssize_t pps_show_mode(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pps_device *pps = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%4x\n", pps->info.mode);
+}
+
+static ssize_t pps_show_echo(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pps_device *pps = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%d\n", !!pps->info.echo);
+}
+
+static ssize_t pps_show_name(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pps_device *pps = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%s\n", pps->info.name);
+}
+
+static ssize_t pps_show_path(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pps_device *pps = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%s\n", pps->info.path);
+}
+
+struct device_attribute pps_attrs[] = {
+	__ATTR(assert, S_IRUGO, pps_show_assert, NULL),
+	__ATTR(clear, S_IRUGO, pps_show_clear, NULL),
+	__ATTR(mode, S_IRUGO, pps_show_mode, NULL),
+	__ATTR(echo, S_IRUGO, pps_show_echo, NULL),
+	__ATTR(name, S_IRUGO, pps_show_name, NULL),
+	__ATTR(path, S_IRUGO, pps_show_path, NULL),
+	__ATTR_NULL,
+};
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 277d35d232f..81adbdbd504 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -296,6 +296,15 @@ config RTC_DRV_RX8581
 	  This driver can also be built as a module. If so the module
 	  will be called rtc-rx8581.
 
+config RTC_DRV_RX8025
+	tristate "Epson RX-8025SA/NB"
+	help
+	  If you say yes here you get support for the Epson
+	  RX-8025SA/NB RTC chips.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-rx8025.
+
 endif # I2C
 
 comment "SPI RTC drivers"
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 6c0639a14f0..3c0f2b2ac92 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_RTC_DRV_R9701)	+= rtc-r9701.o
 obj-$(CONFIG_RTC_DRV_RS5C313)	+= rtc-rs5c313.o
 obj-$(CONFIG_RTC_DRV_RS5C348)	+= rtc-rs5c348.o
 obj-$(CONFIG_RTC_DRV_RS5C372)	+= rtc-rs5c372.o
+obj-$(CONFIG_RTC_DRV_RX8025)	+= rtc-rx8025.o
 obj-$(CONFIG_RTC_DRV_RX8581)	+= rtc-rx8581.o
 obj-$(CONFIG_RTC_DRV_S35390A)	+= rtc-s35390a.o
 obj-$(CONFIG_RTC_DRV_S3C)	+= rtc-s3c.o
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 2c4a65302a9..8a6f9a9f9cb 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -31,6 +31,8 @@ enum ds_type {
 	ds_1338,
 	ds_1339,
 	ds_1340,
+	ds_1388,
+	ds_3231,
 	m41t00,
 	rx_8025,
 	// rs5c372 too?  different address...
@@ -66,6 +68,7 @@ enum ds_type {
 #define DS1337_REG_CONTROL	0x0e
 #	define DS1337_BIT_nEOSC		0x80
 #	define DS1339_BIT_BBSQI		0x20
+#	define DS3231_BIT_BBSQW		0x40 /* same as BBSQI */
 #	define DS1337_BIT_RS2		0x10
 #	define DS1337_BIT_RS1		0x08
 #	define DS1337_BIT_INTCN		0x04
@@ -94,6 +97,7 @@ enum ds_type {
 
 
 struct ds1307 {
+	u8			offset; /* register's offset */
 	u8			regs[11];
 	enum ds_type		type;
 	unsigned long		flags;
@@ -128,6 +132,9 @@ static const struct chip_desc chips[] = {
 },
 [ds_1340] = {
 },
+[ds_3231] = {
+	.alarm		= 1,
+},
 [m41t00] = {
 },
 [rx_8025] = {
@@ -138,7 +145,9 @@ static const struct i2c_device_id ds1307_id[] = {
 	{ "ds1337", ds_1337 },
 	{ "ds1338", ds_1338 },
 	{ "ds1339", ds_1339 },
+	{ "ds1388", ds_1388 },
 	{ "ds1340", ds_1340 },
+	{ "ds3231", ds_3231 },
 	{ "m41t00", m41t00 },
 	{ "rx8025", rx_8025 },
 	{ }
@@ -291,7 +300,7 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t)
 
 	/* read the RTC date and time registers all at once */
 	tmp = ds1307->read_block_data(ds1307->client,
-		DS1307_REG_SECS, 7, ds1307->regs);
+		ds1307->offset, 7, ds1307->regs);
 	if (tmp != 7) {
 		dev_err(dev, "%s error %d\n", "read", tmp);
 		return -EIO;
@@ -353,6 +362,7 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
 	switch (ds1307->type) {
 	case ds_1337:
 	case ds_1339:
+	case ds_3231:
 		buf[DS1307_REG_MONTH] |= DS1337_BIT_CENTURY;
 		break;
 	case ds_1340:
@@ -367,7 +377,8 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
 		"write", buf[0], buf[1], buf[2], buf[3],
 		buf[4], buf[5], buf[6]);
 
-	result = ds1307->write_block_data(ds1307->client, 0, 7, buf);
+	result = ds1307->write_block_data(ds1307->client,
+		ds1307->offset, 7, buf);
 	if (result < 0) {
 		dev_err(dev, "%s error %d\n", "write", result);
 		return result;
@@ -624,6 +635,11 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 	struct i2c_adapter	*adapter = to_i2c_adapter(client->dev.parent);
 	int			want_irq = false;
 	unsigned char		*buf;
+	static const int	bbsqi_bitpos[] = {
+		[ds_1337] = 0,
+		[ds_1339] = DS1339_BIT_BBSQI,
+		[ds_3231] = DS3231_BIT_BBSQW,
+	};
 
 	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)
 	    && !i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK))
@@ -632,9 +648,12 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 	if (!(ds1307 = kzalloc(sizeof(struct ds1307), GFP_KERNEL)))
 		return -ENOMEM;
 
-	ds1307->client = client;
 	i2c_set_clientdata(client, ds1307);
-	ds1307->type = id->driver_data;
+
+	ds1307->client	= client;
+	ds1307->type	= id->driver_data;
+	ds1307->offset	= 0;
+
 	buf = ds1307->regs;
 	if (i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) {
 		ds1307->read_block_data = i2c_smbus_read_i2c_block_data;
@@ -647,6 +666,7 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 	switch (ds1307->type) {
 	case ds_1337:
 	case ds_1339:
+	case ds_3231:
 		/* has IRQ? */
 		if (ds1307->client->irq > 0 && chip->alarm) {
 			INIT_WORK(&ds1307->work, ds1307_work);
@@ -666,12 +686,12 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 			ds1307->regs[0] &= ~DS1337_BIT_nEOSC;
 
 		/* Using IRQ?  Disable the square wave and both alarms.
-		 * For ds1339, be sure alarms can trigger when we're
-		 * running on Vbackup (BBSQI); we assume ds1337 will
-		 * ignore that bit
+		 * For some variants, be sure alarms can trigger when we're
+		 * running on Vbackup (BBSQI/BBSQW)
 		 */
 		if (want_irq) {
-			ds1307->regs[0] |= DS1337_BIT_INTCN | DS1339_BIT_BBSQI;
+			ds1307->regs[0] |= DS1337_BIT_INTCN
+					| bbsqi_bitpos[ds1307->type];
 			ds1307->regs[0] &= ~(DS1337_BIT_A2IE | DS1337_BIT_A1IE);
 		}
 
@@ -751,6 +771,9 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 						  hour);
 		}
 		break;
+	case ds_1388:
+		ds1307->offset = 1; /* Seconds starts at 1 */
+		break;
 	default:
 		break;
 	}
@@ -814,6 +837,8 @@ read_rtc:
 	case rx_8025:
 	case ds_1337:
 	case ds_1339:
+	case ds_1388:
+	case ds_3231:
 		break;
 	}
 
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index 38d472b6340..717288527c6 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -329,8 +329,7 @@ static int __devinit ds1553_rtc_probe(struct platform_device *pdev)
 	if (pdata->irq > 0) {
 		writeb(0, ioaddr + RTC_INTERRUPTS);
 		if (request_irq(pdata->irq, ds1553_rtc_interrupt,
-				IRQF_DISABLED | IRQF_SHARED,
-				pdev->name, pdev) < 0) {
+				IRQF_DISABLED, pdev->name, pdev) < 0) {
 			dev_warn(&pdev->dev, "interrupt not available.\n");
 			pdata->irq = 0;
 		}
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index 8bc8501bffc..09249459e9a 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -57,6 +57,7 @@ struct rtc_plat_data {
 	size_t size;
 	resource_size_t baseaddr;
 	unsigned long last_jiffies;
+	struct bin_attribute nvram_attr;
 };
 
 static int ds1742_rtc_set_time(struct device *dev, struct rtc_time *tm)
@@ -157,18 +158,6 @@ static ssize_t ds1742_nvram_write(struct kobject *kobj,
 	return count;
 }
 
-static struct bin_attribute ds1742_nvram_attr = {
-	.attr = {
-		.name = "nvram",
-		.mode = S_IRUGO | S_IWUSR,
-	},
-	.read = ds1742_nvram_read,
-	.write = ds1742_nvram_write,
-	/* REVISIT: size in sysfs won't match actual size... if it's
-	 * not a constant, each RTC should have its own attribute.
-	 */
-};
-
 static int __devinit ds1742_rtc_probe(struct platform_device *pdev)
 {
 	struct rtc_device *rtc;
@@ -199,6 +188,12 @@ static int __devinit ds1742_rtc_probe(struct platform_device *pdev)
 	pdata->size_nvram = pdata->size - RTC_SIZE;
 	pdata->ioaddr_rtc = ioaddr + pdata->size_nvram;
 
+	pdata->nvram_attr.attr.name = "nvram";
+	pdata->nvram_attr.attr.mode = S_IRUGO | S_IWUSR;
+	pdata->nvram_attr.read = ds1742_nvram_read;
+	pdata->nvram_attr.write = ds1742_nvram_write;
+	pdata->nvram_attr.size = pdata->size_nvram;
+
 	/* turn RTC on if it was not on */
 	ioaddr = pdata->ioaddr_rtc;
 	sec = readb(ioaddr + RTC_SECONDS);
@@ -221,11 +216,13 @@ static int __devinit ds1742_rtc_probe(struct platform_device *pdev)
 	pdata->rtc = rtc;
 	pdata->last_jiffies = jiffies;
 	platform_set_drvdata(pdev, pdata);
-	ds1742_nvram_attr.size = max(ds1742_nvram_attr.size,
-				     pdata->size_nvram);
-	ret = sysfs_create_bin_file(&pdev->dev.kobj, &ds1742_nvram_attr);
-	if (ret)
+
+	ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr);
+	if (ret) {
+		dev_err(&pdev->dev, "creating nvram file in sysfs failed\n");
 		goto out;
+	}
+
 	return 0;
  out:
 	if (pdata->rtc)
@@ -242,7 +239,7 @@ static int __devexit ds1742_rtc_remove(struct platform_device *pdev)
 {
 	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
 
-	sysfs_remove_bin_file(&pdev->dev.kobj, &ds1742_nvram_attr);
+	sysfs_remove_bin_file(&pdev->dev.kobj, &pdata->nvram_attr);
 	rtc_device_unregister(pdata->rtc);
 	iounmap(pdata->ioaddr_nvram);
 	release_mem_region(pdata->baseaddr, pdata->size);
diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c
new file mode 100644
index 00000000000..b1a29bcfdf1
--- /dev/null
+++ b/drivers/rtc/rtc-rx8025.c
@@ -0,0 +1,688 @@
+/*
+ * Driver for Epson's RTC module RX-8025 SA/NB
+ *
+ * Copyright (C) 2009 Wolfgang Grandegger <wg@grandegger.com>
+ *
+ * Copyright (C) 2005 by Digi International Inc.
+ * All rights reserved.
+ *
+ * Modified by fengjh at rising.com.cn
+ * <http://lists.lm-sensors.org/mailman/listinfo/lm-sensors>
+ * 2006.11
+ *
+ * Code cleanup by Sergei Poselenov, <sposelenov@emcraft.com>
+ * Converted to new style by Wolfgang Grandegger <wg@grandegger.com>
+ * Alarm and periodic interrupt added by Dmitry Rakhchev <rda@emcraft.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bcd.h>
+#include <linux/i2c.h>
+#include <linux/list.h>
+#include <linux/rtc.h>
+
+/* Register definitions */
+#define RX8025_REG_SEC		0x00
+#define RX8025_REG_MIN		0x01
+#define RX8025_REG_HOUR		0x02
+#define RX8025_REG_WDAY		0x03
+#define RX8025_REG_MDAY		0x04
+#define RX8025_REG_MONTH	0x05
+#define RX8025_REG_YEAR		0x06
+#define RX8025_REG_DIGOFF	0x07
+#define RX8025_REG_ALWMIN	0x08
+#define RX8025_REG_ALWHOUR	0x09
+#define RX8025_REG_ALWWDAY	0x0a
+#define RX8025_REG_ALDMIN	0x0b
+#define RX8025_REG_ALDHOUR	0x0c
+/* 0x0d is reserved */
+#define RX8025_REG_CTRL1	0x0e
+#define RX8025_REG_CTRL2	0x0f
+
+#define RX8025_BIT_CTRL1_CT	(7 << 0)
+/* 1 Hz periodic level irq */
+#define RX8025_BIT_CTRL1_CT_1HZ	4
+#define RX8025_BIT_CTRL1_TEST	(1 << 3)
+#define RX8025_BIT_CTRL1_1224	(1 << 5)
+#define RX8025_BIT_CTRL1_DALE	(1 << 6)
+#define RX8025_BIT_CTRL1_WALE	(1 << 7)
+
+#define RX8025_BIT_CTRL2_DAFG	(1 << 0)
+#define RX8025_BIT_CTRL2_WAFG	(1 << 1)
+#define RX8025_BIT_CTRL2_CTFG	(1 << 2)
+#define RX8025_BIT_CTRL2_PON	(1 << 4)
+#define RX8025_BIT_CTRL2_XST	(1 << 5)
+#define RX8025_BIT_CTRL2_VDET	(1 << 6)
+
+/* Clock precision adjustment */
+#define RX8025_ADJ_RESOLUTION	3050 /* in ppb */
+#define RX8025_ADJ_DATA_MAX	62
+#define RX8025_ADJ_DATA_MIN	-62
+
+static const struct i2c_device_id rx8025_id[] = {
+	{ "rx8025", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, rx8025_id);
+
+struct rx8025_data {
+	struct i2c_client *client;
+	struct rtc_device *rtc;
+	struct work_struct work;
+	u8 ctrl1;
+	unsigned exiting:1;
+};
+
+static int rx8025_read_reg(struct i2c_client *client, int number, u8 *value)
+{
+	int ret = i2c_smbus_read_byte_data(client, (number << 4) | 0x08);
+
+	if (ret < 0) {
+		dev_err(&client->dev, "Unable to read register #%d\n", number);
+		return ret;
+	}
+
+	*value = ret;
+	return 0;
+}
+
+static int rx8025_read_regs(struct i2c_client *client,
+			    int number, u8 length, u8 *values)
+{
+	int ret = i2c_smbus_read_i2c_block_data(client, (number << 4) | 0x08,
+						length, values);
+
+	if (ret != length) {
+		dev_err(&client->dev, "Unable to read registers #%d..#%d\n",
+			number, number + length - 1);
+		return ret < 0 ? ret : -EIO;
+	}
+
+	return 0;
+}
+
+static int rx8025_write_reg(struct i2c_client *client, int number, u8 value)
+{
+	int ret = i2c_smbus_write_byte_data(client, number << 4, value);
+
+	if (ret)
+		dev_err(&client->dev, "Unable to write register #%d\n",
+			number);
+
+	return ret;
+}
+
+static int rx8025_write_regs(struct i2c_client *client,
+			     int number, u8 length, u8 *values)
+{
+	int ret = i2c_smbus_write_i2c_block_data(client, (number << 4) | 0x08,
+						 length, values);
+
+	if (ret)
+		dev_err(&client->dev, "Unable to write registers #%d..#%d\n",
+			number, number + length - 1);
+
+	return ret;
+}
+
+static irqreturn_t rx8025_irq(int irq, void *dev_id)
+{
+	struct i2c_client *client = dev_id;
+	struct rx8025_data *rx8025 = i2c_get_clientdata(client);
+
+	disable_irq_nosync(irq);
+	schedule_work(&rx8025->work);
+	return IRQ_HANDLED;
+}
+
+static void rx8025_work(struct work_struct *work)
+{
+	struct rx8025_data *rx8025 = container_of(work, struct rx8025_data,
+						  work);
+	struct i2c_client *client = rx8025->client;
+	struct mutex *lock = &rx8025->rtc->ops_lock;
+	u8 status;
+
+	mutex_lock(lock);
+
+	if (rx8025_read_reg(client, RX8025_REG_CTRL2, &status))
+		goto out;
+
+	if (!(status & RX8025_BIT_CTRL2_XST))
+		dev_warn(&client->dev, "Oscillation stop was detected,"
+			 "you may have to readjust the clock\n");
+
+	if (status & RX8025_BIT_CTRL2_CTFG) {
+		/* periodic */
+		status &= ~RX8025_BIT_CTRL2_CTFG;
+		local_irq_disable();
+		rtc_update_irq(rx8025->rtc, 1, RTC_PF | RTC_IRQF);
+		local_irq_enable();
+	}
+
+	if (status & RX8025_BIT_CTRL2_DAFG) {
+		/* alarm */
+		status &= RX8025_BIT_CTRL2_DAFG;
+		if (rx8025_write_reg(client, RX8025_REG_CTRL1,
+				     rx8025->ctrl1 & ~RX8025_BIT_CTRL1_DALE))
+			goto out;
+		local_irq_disable();
+		rtc_update_irq(rx8025->rtc, 1, RTC_AF | RTC_IRQF);
+		local_irq_enable();
+	}
+
+	/* acknowledge IRQ */
+	rx8025_write_reg(client, RX8025_REG_CTRL2,
+			 status | RX8025_BIT_CTRL2_XST);
+
+out:
+	if (!rx8025->exiting)
+		enable_irq(client->irq);
+
+	mutex_unlock(lock);
+}
+
+static int rx8025_get_time(struct device *dev, struct rtc_time *dt)
+{
+	struct rx8025_data *rx8025 = dev_get_drvdata(dev);
+	u8 date[7];
+	int err;
+
+	err = rx8025_read_regs(rx8025->client, RX8025_REG_SEC, 7, date);
+	if (err)
+		return err;
+
+	dev_dbg(dev, "%s: read 0x%02x 0x%02x "
+		"0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n", __func__,
+		date[0], date[1], date[2], date[3], date[4],
+		date[5], date[6]);
+
+	dt->tm_sec = bcd2bin(date[RX8025_REG_SEC] & 0x7f);
+	dt->tm_min = bcd2bin(date[RX8025_REG_MIN] & 0x7f);
+	if (rx8025->ctrl1 & RX8025_BIT_CTRL1_1224)
+		dt->tm_hour = bcd2bin(date[RX8025_REG_HOUR] & 0x3f);
+	else
+		dt->tm_hour = bcd2bin(date[RX8025_REG_HOUR] & 0x1f) % 12
+			+ (date[RX8025_REG_HOUR] & 0x20 ? 12 : 0);
+
+	dt->tm_mday = bcd2bin(date[RX8025_REG_MDAY] & 0x3f);
+	dt->tm_mon = bcd2bin(date[RX8025_REG_MONTH] & 0x1f) - 1;
+	dt->tm_year = bcd2bin(date[RX8025_REG_YEAR]);
+
+	if (dt->tm_year < 70)
+		dt->tm_year += 100;
+
+	dev_dbg(dev, "%s: date %ds %dm %dh %dmd %dm %dy\n", __func__,
+		dt->tm_sec, dt->tm_min, dt->tm_hour,
+		dt->tm_mday, dt->tm_mon, dt->tm_year);
+
+	return rtc_valid_tm(dt);
+}
+
+static int rx8025_set_time(struct device *dev, struct rtc_time *dt)
+{
+	struct rx8025_data *rx8025 = dev_get_drvdata(dev);
+	u8 date[7];
+
+	/*
+	 * BUG: The HW assumes every year that is a multiple of 4 to be a leap
+	 * year.  Next time this is wrong is 2100, which will not be a leap
+	 * year.
+	 */
+
+	/*
+	 * Here the read-only bits are written as "0".  I'm not sure if that
+	 * is sound.
+	 */
+	date[RX8025_REG_SEC] = bin2bcd(dt->tm_sec);
+	date[RX8025_REG_MIN] = bin2bcd(dt->tm_min);
+	if (rx8025->ctrl1 & RX8025_BIT_CTRL1_1224)
+		date[RX8025_REG_HOUR] = bin2bcd(dt->tm_hour);
+	else
+		date[RX8025_REG_HOUR] = (dt->tm_hour >= 12 ? 0x20 : 0)
+			| bin2bcd((dt->tm_hour + 11) % 12 + 1);
+
+	date[RX8025_REG_WDAY] = bin2bcd(dt->tm_wday);
+	date[RX8025_REG_MDAY] = bin2bcd(dt->tm_mday);
+	date[RX8025_REG_MONTH] = bin2bcd(dt->tm_mon + 1);
+	date[RX8025_REG_YEAR] = bin2bcd(dt->tm_year % 100);
+
+	dev_dbg(dev,
+		"%s: write 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
+		__func__,
+		date[0], date[1], date[2], date[3], date[4], date[5], date[6]);
+
+	return rx8025_write_regs(rx8025->client, RX8025_REG_SEC, 7, date);
+}
+
+static int rx8025_init_client(struct i2c_client *client, int *need_reset)
+{
+	struct rx8025_data *rx8025 = i2c_get_clientdata(client);
+	u8 ctrl[2], ctrl2;
+	int need_clear = 0;
+	int err;
+
+	err = rx8025_read_regs(rx8025->client, RX8025_REG_CTRL1, 2, ctrl);
+	if (err)
+		goto out;
+
+	/* Keep test bit zero ! */
+	rx8025->ctrl1 = ctrl[0] & ~RX8025_BIT_CTRL1_TEST;
+
+	if (ctrl[1] & RX8025_BIT_CTRL2_PON) {
+		dev_warn(&client->dev, "power-on reset was detected, "
+			 "you may have to readjust the clock\n");
+		*need_reset = 1;
+	}
+
+	if (ctrl[1] & RX8025_BIT_CTRL2_VDET) {
+		dev_warn(&client->dev, "a power voltage drop was detected, "
+			 "you may have to readjust the clock\n");
+		*need_reset = 1;
+	}
+
+	if (!(ctrl[1] & RX8025_BIT_CTRL2_XST)) {
+		dev_warn(&client->dev, "Oscillation stop was detected,"
+			 "you may have to readjust the clock\n");
+		*need_reset = 1;
+	}
+
+	if (ctrl[1] & (RX8025_BIT_CTRL2_DAFG | RX8025_BIT_CTRL2_WAFG)) {
+		dev_warn(&client->dev, "Alarm was detected\n");
+		need_clear = 1;
+	}
+
+	if (!(ctrl[1] & RX8025_BIT_CTRL2_CTFG))
+		need_clear = 1;
+
+	if (*need_reset || need_clear) {
+		ctrl2 = ctrl[0];
+		ctrl2 &= ~(RX8025_BIT_CTRL2_PON | RX8025_BIT_CTRL2_VDET |
+			   RX8025_BIT_CTRL2_CTFG | RX8025_BIT_CTRL2_WAFG |
+			   RX8025_BIT_CTRL2_DAFG);
+		ctrl2 |= RX8025_BIT_CTRL2_XST;
+
+		err = rx8025_write_reg(client, RX8025_REG_CTRL2, ctrl2);
+	}
+out:
+	return err;
+}
+
+/* Alarm support */
+static int rx8025_read_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+	struct rx8025_data *rx8025 = dev_get_drvdata(dev);
+	struct i2c_client *client = rx8025->client;
+	u8 ctrl2, ald[2];
+	int err;
+
+	if (client->irq <= 0)
+		return -EINVAL;
+
+	err = rx8025_read_regs(client, RX8025_REG_ALDMIN, 2, ald);
+	if (err)
+		return err;
+
+	err = rx8025_read_reg(client, RX8025_REG_CTRL2, &ctrl2);
+	if (err)
+		return err;
+
+	dev_dbg(dev, "%s: read alarm 0x%02x 0x%02x ctrl2 %02x\n",
+		__func__, ald[0], ald[1], ctrl2);
+
+	/* Hardware alarms precision is 1 minute! */
+	t->time.tm_sec = 0;
+	t->time.tm_min = bcd2bin(ald[0] & 0x7f);
+	if (rx8025->ctrl1 & RX8025_BIT_CTRL1_1224)
+		t->time.tm_hour = bcd2bin(ald[1] & 0x3f);
+	else
+		t->time.tm_hour = bcd2bin(ald[1] & 0x1f) % 12
+			+ (ald[1] & 0x20 ? 12 : 0);
+
+	t->time.tm_wday = -1;
+	t->time.tm_mday = -1;
+	t->time.tm_mon = -1;
+	t->time.tm_year = -1;
+
+	dev_dbg(dev, "%s: date: %ds %dm %dh %dmd %dm %dy\n",
+		__func__,
+		t->time.tm_sec, t->time.tm_min, t->time.tm_hour,
+		t->time.tm_mday, t->time.tm_mon, t->time.tm_year);
+	t->enabled = !!(rx8025->ctrl1 & RX8025_BIT_CTRL1_DALE);
+	t->pending = (ctrl2 & RX8025_BIT_CTRL2_DAFG) && t->enabled;
+
+	return err;
+}
+
+static int rx8025_set_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct rx8025_data *rx8025 = dev_get_drvdata(dev);
+	u8 ald[2];
+	int err;
+
+	if (client->irq <= 0)
+		return -EINVAL;
+
+	/* Hardware alarm precision is 1 minute! */
+	ald[0] = bin2bcd(t->time.tm_min);
+	if (rx8025->ctrl1 & RX8025_BIT_CTRL1_1224)
+		ald[1] = bin2bcd(t->time.tm_hour);
+	else
+		ald[1] = (t->time.tm_hour >= 12 ? 0x20 : 0)
+			| bin2bcd((t->time.tm_hour + 11) % 12 + 1);
+
+	dev_dbg(dev, "%s: write 0x%02x 0x%02x\n", __func__, ald[0], ald[1]);
+
+	if (rx8025->ctrl1 & RX8025_BIT_CTRL1_DALE) {
+		rx8025->ctrl1 &= ~RX8025_BIT_CTRL1_DALE;
+		err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1,
+				       rx8025->ctrl1);
+		if (err)
+			return err;
+	}
+	err = rx8025_write_regs(rx8025->client, RX8025_REG_ALDMIN, 2, ald);
+	if (err)
+		return err;
+
+	if (t->enabled) {
+		rx8025->ctrl1 |= RX8025_BIT_CTRL1_DALE;
+		err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1,
+				       rx8025->ctrl1);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int rx8025_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+	struct rx8025_data *rx8025 = dev_get_drvdata(dev);
+	u8 ctrl1;
+	int err;
+
+	ctrl1 = rx8025->ctrl1;
+	if (enabled)
+		ctrl1 |= RX8025_BIT_CTRL1_DALE;
+	else
+		ctrl1 &= ~RX8025_BIT_CTRL1_DALE;
+
+	if (ctrl1 != rx8025->ctrl1) {
+		rx8025->ctrl1 = ctrl1;
+		err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1,
+				       rx8025->ctrl1);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int rx8025_irq_set_state(struct device *dev, int enabled)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct rx8025_data *rx8025 = i2c_get_clientdata(client);
+	int ctrl1;
+	int err;
+
+	if (client->irq <= 0)
+		return -ENXIO;
+
+	ctrl1 = rx8025->ctrl1 & ~RX8025_BIT_CTRL1_CT;
+	if (enabled)
+		ctrl1 |= RX8025_BIT_CTRL1_CT_1HZ;
+	if (ctrl1 != rx8025->ctrl1) {
+		rx8025->ctrl1 = ctrl1;
+		err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1,
+				       rx8025->ctrl1);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static struct rtc_class_ops rx8025_rtc_ops = {
+	.read_time = rx8025_get_time,
+	.set_time = rx8025_set_time,
+	.read_alarm = rx8025_read_alarm,
+	.set_alarm = rx8025_set_alarm,
+	.alarm_irq_enable = rx8025_alarm_irq_enable,
+	.irq_set_state  = rx8025_irq_set_state,
+};
+
+/*
+ * Clock precision adjustment support
+ *
+ * According to the RX8025 SA/NB application manual the frequency and
+ * temperature charateristics can be approximated using the following
+ * equation:
+ *
+ *   df = a * (ut - t)**2
+ *
+ *   df: Frequency deviation in any temperature
+ *   a : Coefficient = (-35 +-5) * 10**-9
+ *   ut: Ultimate temperature in degree = +25 +-5 degree
+ *   t : Any temperature in degree
+ *
+ * Note that the clock adjustment in ppb must be entered (which is
+ * the negative value of the deviation).
+ */
+static int rx8025_get_clock_adjust(struct device *dev, int *adj)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	u8 digoff;
+	int err;
+
+	err = rx8025_read_reg(client, RX8025_REG_DIGOFF, &digoff);
+	if (err)
+		return err;
+
+	*adj = digoff >= 64 ? digoff - 128 : digoff;
+	if (*adj > 0)
+		(*adj)--;
+	*adj *= -RX8025_ADJ_RESOLUTION;
+
+	return 0;
+}
+
+static int rx8025_set_clock_adjust(struct device *dev, int adj)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	u8 digoff;
+	int err;
+
+	adj /= -RX8025_ADJ_RESOLUTION;
+	if (adj > RX8025_ADJ_DATA_MAX)
+		adj = RX8025_ADJ_DATA_MAX;
+	else if (adj < RX8025_ADJ_DATA_MIN)
+		adj = RX8025_ADJ_DATA_MIN;
+	else if (adj > 0)
+		adj++;
+	else if (adj < 0)
+		adj += 128;
+	digoff = adj;
+
+	err = rx8025_write_reg(client, RX8025_REG_DIGOFF, digoff);
+	if (err)
+		return err;
+
+	dev_dbg(dev, "%s: write 0x%02x\n", __func__, digoff);
+
+	return 0;
+}
+
+static ssize_t rx8025_sysfs_show_clock_adjust(struct device *dev,
+					      struct device_attribute *attr,
+					      char *buf)
+{
+	int err, adj;
+
+	err = rx8025_get_clock_adjust(dev, &adj);
+	if (err)
+		return err;
+
+	return sprintf(buf, "%d\n", adj);
+}
+
+static ssize_t rx8025_sysfs_store_clock_adjust(struct device *dev,
+					       struct device_attribute *attr,
+					       const char *buf, size_t count)
+{
+	int adj, err;
+
+	if (sscanf(buf, "%i", &adj) != 1)
+		return -EINVAL;
+
+	err = rx8025_set_clock_adjust(dev, adj);
+
+	return err ? err : count;
+}
+
+static DEVICE_ATTR(clock_adjust_ppb, S_IRUGO | S_IWUSR,
+		   rx8025_sysfs_show_clock_adjust,
+		   rx8025_sysfs_store_clock_adjust);
+
+static int rx8025_sysfs_register(struct device *dev)
+{
+	return device_create_file(dev, &dev_attr_clock_adjust_ppb);
+}
+
+static void rx8025_sysfs_unregister(struct device *dev)
+{
+	device_remove_file(dev, &dev_attr_clock_adjust_ppb);
+}
+
+static int __devinit rx8025_probe(struct i2c_client *client,
+				  const struct i2c_device_id *id)
+{
+	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
+	struct rx8025_data *rx8025;
+	int err, need_reset = 0;
+
+	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA
+				     | I2C_FUNC_SMBUS_I2C_BLOCK)) {
+		dev_err(&adapter->dev,
+			"doesn't support required functionality\n");
+		err = -EIO;
+		goto errout;
+	}
+
+	rx8025 = kzalloc(sizeof(*rx8025), GFP_KERNEL);
+	if (!rx8025) {
+		dev_err(&adapter->dev, "failed to alloc memory\n");
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	rx8025->client = client;
+	i2c_set_clientdata(client, rx8025);
+	INIT_WORK(&rx8025->work, rx8025_work);
+
+	err = rx8025_init_client(client, &need_reset);
+	if (err)
+		goto errout_free;
+
+	if (need_reset) {
+		struct rtc_time tm;
+		dev_info(&client->dev,
+			 "bad conditions detected, resetting date\n");
+		rtc_time_to_tm(0, &tm);	/* 1970/1/1 */
+		rx8025_set_time(&client->dev, &tm);
+	}
+
+	rx8025->rtc = rtc_device_register(client->name, &client->dev,
+					  &rx8025_rtc_ops, THIS_MODULE);
+	if (IS_ERR(rx8025->rtc)) {
+		err = PTR_ERR(rx8025->rtc);
+		dev_err(&client->dev, "unable to register the class device\n");
+		goto errout_free;
+	}
+
+	if (client->irq > 0) {
+		dev_info(&client->dev, "IRQ %d supplied\n", client->irq);
+		err = request_irq(client->irq, rx8025_irq,
+				  0, "rx8025", client);
+		if (err) {
+			dev_err(&client->dev, "unable to request IRQ\n");
+			goto errout_reg;
+		}
+	}
+
+	rx8025->rtc->irq_freq = 1;
+	rx8025->rtc->max_user_freq = 1;
+
+	err = rx8025_sysfs_register(&client->dev);
+	if (err)
+		goto errout_irq;
+
+	return 0;
+
+errout_irq:
+	if (client->irq > 0)
+		free_irq(client->irq, client);
+
+errout_reg:
+	rtc_device_unregister(rx8025->rtc);
+
+errout_free:
+	i2c_set_clientdata(client, NULL);
+	kfree(rx8025);
+
+errout:
+	dev_err(&adapter->dev, "probing for rx8025 failed\n");
+	return err;
+}
+
+static int __devexit rx8025_remove(struct i2c_client *client)
+{
+	struct rx8025_data *rx8025 = i2c_get_clientdata(client);
+	struct mutex *lock = &rx8025->rtc->ops_lock;
+
+	if (client->irq > 0) {
+		mutex_lock(lock);
+		rx8025->exiting = 1;
+		mutex_unlock(lock);
+
+		free_irq(client->irq, client);
+		flush_scheduled_work();
+	}
+
+	rx8025_sysfs_unregister(&client->dev);
+	rtc_device_unregister(rx8025->rtc);
+	i2c_set_clientdata(client, NULL);
+	kfree(rx8025);
+	return 0;
+}
+
+static struct i2c_driver rx8025_driver = {
+	.driver = {
+		.name = "rtc-rx8025",
+		.owner = THIS_MODULE,
+	},
+	.probe		= rx8025_probe,
+	.remove		= __devexit_p(rx8025_remove),
+	.id_table	= rx8025_id,
+};
+
+static int __init rx8025_init(void)
+{
+	return i2c_add_driver(&rx8025_driver);
+}
+
+static void __exit rx8025_exit(void)
+{
+	i2c_del_driver(&rx8025_driver);
+}
+
+MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>");
+MODULE_DESCRIPTION("RX-8025 SA/NB RTC driver");
+MODULE_LICENSE("GPL");
+
+module_init(rx8025_init);
+module_exit(rx8025_exit);
diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c
index 4ee4857ff20..4a6ed1104fb 100644
--- a/drivers/rtc/rtc-tx4939.c
+++ b/drivers/rtc/rtc-tx4939.c
@@ -261,10 +261,8 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev)
 
 	tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP);
 	if (devm_request_irq(&pdev->dev, irq, tx4939_rtc_interrupt,
-			     IRQF_DISABLED | IRQF_SHARED,
-			     pdev->name, &pdev->dev) < 0) {
+			     IRQF_DISABLED, pdev->name, &pdev->dev) < 0)
 		return -EBUSY;
-	}
 	rtc = rtc_device_register(pdev->name, &pdev->dev,
 				  &tx4939_rtc_ops, THIS_MODULE);
 	if (IS_ERR(rtc))
diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c
index 12e443cc4ac..f5b3fdbb1e2 100644
--- a/drivers/spi/atmel_spi.c
+++ b/drivers/spi/atmel_spi.c
@@ -530,9 +530,6 @@ atmel_spi_interrupt(int irq, void *dev_id)
 	return ret;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
-
 static int atmel_spi_setup(struct spi_device *spi)
 {
 	struct atmel_spi	*as;
@@ -555,8 +552,6 @@ static int atmel_spi_setup(struct spi_device *spi)
 		return -EINVAL;
 	}
 
-	if (bits == 0)
-		bits = 8;
 	if (bits < 8 || bits > 16) {
 		dev_dbg(&spi->dev,
 				"setup: invalid bits_per_word %u (8 to 16)\n",
@@ -564,12 +559,6 @@ static int atmel_spi_setup(struct spi_device *spi)
 		return -EINVAL;
 	}
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	/* see notes above re chipselect */
 	if (!atmel_spi_is_v2()
 			&& spi->chip_select == 0
@@ -775,6 +764,9 @@ static int __init atmel_spi_probe(struct platform_device *pdev)
 	if (!master)
 		goto out_free;
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
+
 	master->bus_num = pdev->id;
 	master->num_chipselect = 4;
 	master->setup = atmel_spi_setup;
diff --git a/drivers/spi/au1550_spi.c b/drivers/spi/au1550_spi.c
index b02f25c702f..76cbc1a6659 100644
--- a/drivers/spi/au1550_spi.c
+++ b/drivers/spi/au1550_spi.c
@@ -284,27 +284,16 @@ static int au1550_spi_setupxfer(struct spi_device *spi, struct spi_transfer *t)
 	return 0;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST)
-
 static int au1550_spi_setup(struct spi_device *spi)
 {
 	struct au1550_spi *hw = spi_master_get_devdata(spi->master);
 
-	if (spi->bits_per_word == 0)
-		spi->bits_per_word = 8;
 	if (spi->bits_per_word < 4 || spi->bits_per_word > 24) {
 		dev_err(&spi->dev, "setup: invalid bits_per_word=%d\n",
 			spi->bits_per_word);
 		return -EINVAL;
 	}
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	if (spi->max_speed_hz == 0)
 		spi->max_speed_hz = hw->freq_max;
 	if (spi->max_speed_hz > hw->freq_max
@@ -781,6 +770,9 @@ static int __init au1550_spi_probe(struct platform_device *pdev)
 		goto err_nomem;
 	}
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST;
+
 	hw = spi_master_get_devdata(master);
 
 	hw->master = spi_master_get(master);
diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c
index 68c77a91159..1b74d5ca03f 100644
--- a/drivers/spi/mpc52xx_psc_spi.c
+++ b/drivers/spi/mpc52xx_psc_spi.c
@@ -13,6 +13,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
 #include <linux/of_platform.h>
@@ -30,8 +31,7 @@
 
 struct mpc52xx_psc_spi {
 	/* fsl_spi_platform data */
-	void (*activate_cs)(u8, u8);
-	void (*deactivate_cs)(u8, u8);
+	void (*cs_control)(struct spi_device *spi, bool on);
 	u32 sysclk;
 
 	/* driver internal data */
@@ -111,18 +111,16 @@ static void mpc52xx_psc_spi_activate_cs(struct spi_device *spi)
 	out_be16((u16 __iomem *)&psc->ccr, ccr);
 	mps->bits_per_word = cs->bits_per_word;
 
-	if (mps->activate_cs)
-		mps->activate_cs(spi->chip_select,
-				(spi->mode & SPI_CS_HIGH) ? 1 : 0);
+	if (mps->cs_control)
+		mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 1 : 0);
 }
 
 static void mpc52xx_psc_spi_deactivate_cs(struct spi_device *spi)
 {
 	struct mpc52xx_psc_spi *mps = spi_master_get_devdata(spi->master);
 
-	if (mps->deactivate_cs)
-		mps->deactivate_cs(spi->chip_select,
-				(spi->mode & SPI_CS_HIGH) ? 1 : 0);
+	if (mps->cs_control)
+		mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 0 : 1);
 }
 
 #define MPC52xx_PSC_BUFSIZE (MPC52xx_PSC_RFNUM_MASK + 1)
@@ -261,9 +259,6 @@ static void mpc52xx_psc_spi_work(struct work_struct *work)
 	spin_unlock_irq(&mps->lock);
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST)
-
 static int mpc52xx_psc_spi_setup(struct spi_device *spi)
 {
 	struct mpc52xx_psc_spi *mps = spi_master_get_devdata(spi->master);
@@ -273,12 +268,6 @@ static int mpc52xx_psc_spi_setup(struct spi_device *spi)
 	if (spi->bits_per_word%8)
 		return -EINVAL;
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	if (!cs) {
 		cs = kzalloc(sizeof *cs, GFP_KERNEL);
 		if (!cs)
@@ -385,18 +374,19 @@ static int __init mpc52xx_psc_spi_do_probe(struct device *dev, u32 regaddr,
 	dev_set_drvdata(dev, master);
 	mps = spi_master_get_devdata(master);
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST;
+
 	mps->irq = irq;
 	if (pdata == NULL) {
 		dev_warn(dev, "probe called without platform data, no "
-				"(de)activate_cs function will be called\n");
-		mps->activate_cs = NULL;
-		mps->deactivate_cs = NULL;
+				"cs_control function will be called\n");
+		mps->cs_control = NULL;
 		mps->sysclk = 0;
 		master->bus_num = bus_num;
 		master->num_chipselect = 255;
 	} else {
-		mps->activate_cs = pdata->activate_cs;
-		mps->deactivate_cs = pdata->deactivate_cs;
+		mps->cs_control = pdata->cs_control;
 		mps->sysclk = pdata->sysclk;
 		master->bus_num = pdata->bus_num;
 		master->num_chipselect = pdata->max_chipselect;
diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c
index d6d0c5d241c..eee4b6e0af2 100644
--- a/drivers/spi/omap2_mcspi.c
+++ b/drivers/spi/omap2_mcspi.c
@@ -603,9 +603,6 @@ static int omap2_mcspi_request_dma(struct spi_device *spi)
 	return 0;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
-
 static int omap2_mcspi_setup(struct spi_device *spi)
 {
 	int			ret;
@@ -613,15 +610,7 @@ static int omap2_mcspi_setup(struct spi_device *spi)
 	struct omap2_mcspi_dma	*mcspi_dma;
 	struct omap2_mcspi_cs	*cs = spi->controller_state;
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
-	if (spi->bits_per_word == 0)
-		spi->bits_per_word = 8;
-	else if (spi->bits_per_word < 4 || spi->bits_per_word > 32) {
+	if (spi->bits_per_word < 4 || spi->bits_per_word > 32) {
 		dev_dbg(&spi->dev, "setup: unsupported %d bit words\n",
 			spi->bits_per_word);
 		return -EINVAL;
@@ -984,6 +973,9 @@ static int __init omap2_mcspi_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
+
 	if (pdev->id != -1)
 		master->bus_num = pdev->id;
 
diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c
index fe8b9ac0cce..aa90ddb3706 100644
--- a/drivers/spi/omap_uwire.c
+++ b/drivers/spi/omap_uwire.c
@@ -339,8 +339,6 @@ static int uwire_setup_transfer(struct spi_device *spi, struct spi_transfer *t)
 	bits = spi->bits_per_word;
 	if (t != NULL && t->bits_per_word)
 		bits = t->bits_per_word;
-	if (!bits)
-		bits = 8;
 
 	if (bits > 16) {
 		pr_debug("%s: wordsize %d?\n", dev_name(&spi->dev), bits);
@@ -449,19 +447,10 @@ done:
 	return status;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
-
 static int uwire_setup(struct spi_device *spi)
 {
 	struct uwire_state *ust = spi->controller_state;
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	if (ust == NULL) {
 		ust = kzalloc(sizeof(*ust), GFP_KERNEL);
 		if (ust == NULL)
@@ -522,6 +511,9 @@ static int __init uwire_probe(struct platform_device *pdev)
 
 	uwire_write_reg(UWIRE_SR3, 1);
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
+
 	master->bus_num = 2;	/* "official" */
 	master->num_chipselect = 4;
 	master->setup = uwire_setup;
diff --git a/drivers/spi/orion_spi.c b/drivers/spi/orion_spi.c
index c8b0babdc2a..3aea50da7b2 100644
--- a/drivers/spi/orion_spi.c
+++ b/drivers/spi/orion_spi.c
@@ -358,20 +358,11 @@ static int orion_spi_setup(struct spi_device *spi)
 
 	orion_spi = spi_master_get_devdata(spi->master);
 
-	if (spi->mode) {
-		dev_err(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode);
-		return -EINVAL;
-	}
-
 	/* Fix ac timing if required.   */
 	if (orion_spi->spi_info->enable_clock_fix)
 		orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG,
 				  (1 << 14));
 
-	if (spi->bits_per_word == 0)
-		spi->bits_per_word = 8;
-
 	if ((spi->max_speed_hz == 0)
 			|| (spi->max_speed_hz > orion_spi->max_speed))
 		spi->max_speed_hz = orion_spi->max_speed;
@@ -476,6 +467,9 @@ static int __init orion_spi_probe(struct platform_device *pdev)
 	if (pdev->id != -1)
 		master->bus_num = pdev->id;
 
+	/* we support only mode 0, and no options */
+	master->mode_bits = 0;
+
 	master->setup = orion_spi_setup;
 	master->transfer = orion_spi_transfer;
 	master->num_chipselect = ORION_NUM_CHIPSELECTS;
diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c
index 3f3c08c6ba4..d949dbf1141 100644
--- a/drivers/spi/pxa2xx_spi.c
+++ b/drivers/spi/pxa2xx_spi.c
@@ -1185,9 +1185,6 @@ static int transfer(struct spi_device *spi, struct spi_message *msg)
 	return 0;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA)
-
 static int setup_cs(struct spi_device *spi, struct chip_data *chip,
 		    struct pxa2xx_spi_chip *chip_info)
 {
@@ -1236,9 +1233,6 @@ static int setup(struct spi_device *spi)
 	uint tx_thres = TX_THRESH_DFLT;
 	uint rx_thres = RX_THRESH_DFLT;
 
-	if (!spi->bits_per_word)
-		spi->bits_per_word = 8;
-
 	if (drv_data->ssp_type != PXA25x_SSP
 		&& (spi->bits_per_word < 4 || spi->bits_per_word > 32)) {
 		dev_err(&spi->dev, "failed setup: ssp_type=%d, bits/wrd=%d "
@@ -1255,12 +1249,6 @@ static int setup(struct spi_device *spi)
 		return -EINVAL;
 	}
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	/* Only alloc on first setup */
 	chip = spi_get_ctldata(spi);
 	if (!chip) {
@@ -1328,18 +1316,14 @@ static int setup(struct spi_device *spi)
 
 	/* NOTE:  PXA25x_SSP _could_ use external clocking ... */
 	if (drv_data->ssp_type != PXA25x_SSP)
-		dev_dbg(&spi->dev, "%d bits/word, %ld Hz, mode %d, %s\n",
-				spi->bits_per_word,
+		dev_dbg(&spi->dev, "%ld Hz actual, %s\n",
 				clk_get_rate(ssp->clk)
 					/ (1 + ((chip->cr0 & SSCR0_SCR) >> 8)),
-				spi->mode & 0x3,
 				chip->enable_dma ? "DMA" : "PIO");
 	else
-		dev_dbg(&spi->dev, "%d bits/word, %ld Hz, mode %d, %s\n",
-				spi->bits_per_word,
+		dev_dbg(&spi->dev, "%ld Hz actual, %s\n",
 				clk_get_rate(ssp->clk) / 2
 					/ (1 + ((chip->cr0 & SSCR0_SCR) >> 8)),
-				spi->mode & 0x3,
 				chip->enable_dma ? "DMA" : "PIO");
 
 	if (spi->bits_per_word <= 8) {
@@ -1500,6 +1484,9 @@ static int __init pxa2xx_spi_probe(struct platform_device *pdev)
 	drv_data->pdev = pdev;
 	drv_data->ssp = ssp;
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
+
 	master->bus_num = pdev->id;
 	master->num_chipselect = platform_info->num_chipselect;
 	master->dma_alignment = DMA_ALIGNMENT;
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8eba98c8ed1..70845ccd85c 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -265,7 +265,7 @@ int spi_add_device(struct spi_device *spi)
 	 * normally rely on the device being setup.  Devices
 	 * using SPI_CS_HIGH can't coexist well otherwise...
 	 */
-	status = spi->master->setup(spi);
+	status = spi_setup(spi);
 	if (status < 0) {
 		dev_err(dev, "can't %s %s, status %d\n",
 				"setup", dev_name(&spi->dev), status);
@@ -583,6 +583,70 @@ EXPORT_SYMBOL_GPL(spi_busnum_to_master);
 
 /*-------------------------------------------------------------------------*/
 
+/* Core methods for SPI master protocol drivers.  Some of the
+ * other core methods are currently defined as inline functions.
+ */
+
+/**
+ * spi_setup - setup SPI mode and clock rate
+ * @spi: the device whose settings are being modified
+ * Context: can sleep, and no requests are queued to the device
+ *
+ * SPI protocol drivers may need to update the transfer mode if the
+ * device doesn't work with its default.  They may likewise need
+ * to update clock rates or word sizes from initial values.  This function
+ * changes those settings, and must be called from a context that can sleep.
+ * Except for SPI_CS_HIGH, which takes effect immediately, the changes take
+ * effect the next time the device is selected and data is transferred to
+ * or from it.  When this function returns, the spi device is deselected.
+ *
+ * Note that this call will fail if the protocol driver specifies an option
+ * that the underlying controller or its driver does not support.  For
+ * example, not all hardware supports wire transfers using nine bit words,
+ * LSB-first wire encoding, or active-high chipselects.
+ */
+int spi_setup(struct spi_device *spi)
+{
+	unsigned	bad_bits;
+	int		status;
+
+	/* help drivers fail *cleanly* when they need options
+	 * that aren't supported with their current master
+	 */
+	bad_bits = spi->mode & ~spi->master->mode_bits;
+	if (bad_bits) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			bad_bits);
+		return -EINVAL;
+	}
+
+	if (!spi->bits_per_word)
+		spi->bits_per_word = 8;
+
+	status = spi->master->setup(spi);
+
+	dev_dbg(&spi->dev, "setup mode %d, %s%s%s%s"
+				"%u bits/w, %u Hz max --> %d\n",
+			(int) (spi->mode & (SPI_CPOL | SPI_CPHA)),
+			(spi->mode & SPI_CS_HIGH) ? "cs_high, " : "",
+			(spi->mode & SPI_LSB_FIRST) ? "lsb, " : "",
+			(spi->mode & SPI_3WIRE) ? "3wire, " : "",
+			(spi->mode & SPI_LOOP) ? "loopback, " : "",
+			spi->bits_per_word, spi->max_speed_hz,
+			status);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(spi_setup);
+
+
+/*-------------------------------------------------------------------------*/
+
+/* Utility methods for SPI master protocol drivers, layered on
+ * top of the core.  Some other utility methods are defined as
+ * inline functions.
+ */
+
 static void spi_complete(void *arg)
 {
 	complete(arg);
@@ -636,8 +700,8 @@ static u8	*buf;
  * @spi: device with which data will be exchanged
  * @txbuf: data to be written (need not be dma-safe)
  * @n_tx: size of txbuf, in bytes
- * @rxbuf: buffer into which data will be read
- * @n_rx: size of rxbuf, in bytes (need not be dma-safe)
+ * @rxbuf: buffer into which data will be read (need not be dma-safe)
+ * @n_rx: size of rxbuf, in bytes
  * Context: can sleep
  *
  * This performs a half duplex MicroWire style transaction with the
diff --git a/drivers/spi/spi_bfin5xx.c b/drivers/spi/spi_bfin5xx.c
index 011c5bddba6..73e24ef5a2f 100644
--- a/drivers/spi/spi_bfin5xx.c
+++ b/drivers/spi/spi_bfin5xx.c
@@ -169,7 +169,7 @@ static int bfin_spi_flush(struct driver_data *drv_data)
 	unsigned long limit = loops_per_jiffy << 1;
 
 	/* wait for stop and clear stat */
-	while (!(read_STAT(drv_data) & BIT_STAT_SPIF) && limit--)
+	while (!(read_STAT(drv_data) & BIT_STAT_SPIF) && --limit)
 		cpu_relax();
 
 	write_STAT(drv_data, BIT_STAT_CLR);
@@ -1010,16 +1010,6 @@ static int bfin_spi_setup(struct spi_device *spi)
 	struct driver_data *drv_data = spi_master_get_devdata(spi->master);
 	int ret;
 
-	/* Abort device setup if requested features are not supported */
-	if (spi->mode & ~(SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST)) {
-		dev_err(&spi->dev, "requested mode not fully supported\n");
-		return -EINVAL;
-	}
-
-	/* Zero (the default) here means 8 bits */
-	if (!spi->bits_per_word)
-		spi->bits_per_word = 8;
-
 	if (spi->bits_per_word != 8 && spi->bits_per_word != 16)
 		return -EINVAL;
 
@@ -1287,6 +1277,9 @@ static int __init bfin_spi_probe(struct platform_device *pdev)
 	drv_data->pdev = pdev;
 	drv_data->pin_req = platform_info->pin_req;
 
+	/* the spi->mode bits supported by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST;
+
 	master->bus_num = pdev->id;
 	master->num_chipselect = platform_info->num_chipselect;
 	master->cleanup = bfin_spi_cleanup;
diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c
index 85e61f45121..2a5abc08e85 100644
--- a/drivers/spi/spi_bitbang.c
+++ b/drivers/spi/spi_bitbang.c
@@ -188,12 +188,6 @@ int spi_bitbang_setup(struct spi_device *spi)
 
 	bitbang = spi_master_get_devdata(spi->master);
 
-	/* Bitbangers can support SPI_CS_HIGH, SPI_3WIRE, and so on;
-	 * add those to master->flags, and provide the other support.
-	 */
-	if ((spi->mode & ~(SPI_CPOL|SPI_CPHA|bitbang->flags)) != 0)
-		return -EINVAL;
-
 	if (!cs) {
 		cs = kzalloc(sizeof *cs, GFP_KERNEL);
 		if (!cs)
@@ -201,9 +195,6 @@ int spi_bitbang_setup(struct spi_device *spi)
 		spi->controller_state = cs;
 	}
 
-	if (!spi->bits_per_word)
-		spi->bits_per_word = 8;
-
 	/* per-word shift register access, in hardware or bitbanging */
 	cs->txrx_word = bitbang->txrx_word[spi->mode & (SPI_CPOL|SPI_CPHA)];
 	if (!cs->txrx_word)
@@ -213,9 +204,7 @@ int spi_bitbang_setup(struct spi_device *spi)
 	if (retval < 0)
 		return retval;
 
-	dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u nsec/bit\n",
-			__func__, spi->mode & (SPI_CPOL | SPI_CPHA),
-			spi->bits_per_word, 2 * cs->nsecs);
+	dev_dbg(&spi->dev, "%s, %u nsec/bit\n", __func__, 2 * cs->nsecs);
 
 	/* NOTE we _need_ to call chipselect() early, ideally with adapter
 	 * setup, unless the hardware defaults cooperate to avoid confusion
@@ -457,6 +446,9 @@ int spi_bitbang_start(struct spi_bitbang *bitbang)
 	spin_lock_init(&bitbang->lock);
 	INIT_LIST_HEAD(&bitbang->queue);
 
+	if (!bitbang->master->mode_bits)
+		bitbang->master->mode_bits = SPI_CPOL | SPI_CPHA | bitbang->flags;
+
 	if (!bitbang->master->transfer)
 		bitbang->master->transfer = spi_bitbang_transfer;
 	if (!bitbang->txrx_bufs) {
diff --git a/drivers/spi/spi_imx.c b/drivers/spi/spi_imx.c
index 0671aeef579..c195e45f7f3 100644
--- a/drivers/spi/spi_imx.c
+++ b/drivers/spi/spi_imx.c
@@ -1171,9 +1171,6 @@ msg_rejected:
 	return -EINVAL;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
-
 /* On first setup bad values must free chip_data memory since will cause
    spi_new_device to fail. Bad value setup from protocol driver are simply not
    applied and notified to the calling driver. */
@@ -1186,12 +1183,6 @@ static int setup(struct spi_device *spi)
 	u32 tmp;
 	int status = 0;
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	/* Get controller data */
 	chip_info = spi->controller_data;
 
@@ -1286,10 +1277,7 @@ static int setup(struct spi_device *spi)
 
 	/* SPI word width */
 	tmp = spi->bits_per_word;
-	if (tmp == 0) {
-		tmp = 8;
-		spi->bits_per_word = 8;
-	} else if (tmp > 16) {
+	if (tmp > 16) {
 		status = -EINVAL;
 		dev_err(&spi->dev,
 			"setup - "
@@ -1481,6 +1469,9 @@ static int __init spi_imx_probe(struct platform_device *pdev)
 	drv_data->master_info = platform_info;
 	drv_data->pdev = pdev;
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
+
 	master->bus_num = pdev->id;
 	master->num_chipselect = platform_info->num_chipselect;
 	master->dma_alignment = DMA_ALIGNMENT;
diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c
index a32ccb44065..ce61be98e06 100644
--- a/drivers/spi/spi_mpc83xx.c
+++ b/drivers/spi/spi_mpc83xx.c
@@ -419,10 +419,6 @@ static void mpc83xx_spi_work(struct work_struct *work)
 	spin_unlock_irq(&mpc83xx_spi->lock);
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS	(SPI_CPOL | SPI_CPHA | SPI_CS_HIGH \
-			| SPI_LSB_FIRST | SPI_LOOP)
-
 static int mpc83xx_spi_setup(struct spi_device *spi)
 {
 	struct mpc83xx_spi *mpc83xx_spi;
@@ -430,12 +426,6 @@ static int mpc83xx_spi_setup(struct spi_device *spi)
 	u32 hw_mode;
 	struct spi_mpc83xx_cs	*cs = spi->controller_state;
 
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	if (!spi->max_speed_hz)
 		return -EINVAL;
 
@@ -447,9 +437,6 @@ static int mpc83xx_spi_setup(struct spi_device *spi)
 	}
 	mpc83xx_spi = spi_master_get_devdata(spi->master);
 
-	if (!spi->bits_per_word)
-		spi->bits_per_word = 8;
-
 	hw_mode = cs->hw_mode; /* Save orginal settings */
 	cs->hw_mode = mpc83xx_spi_read_reg(&mpc83xx_spi->base->mode);
 	/* mask out bits we are going to set */
@@ -471,9 +458,6 @@ static int mpc83xx_spi_setup(struct spi_device *spi)
 		return retval;
 	}
 
-	dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u Hz\n",
-		__func__, spi->mode & (SPI_CPOL | SPI_CPHA),
-		spi->bits_per_word, spi->max_speed_hz);
 #if 0 /* Don't think this is needed */
 	/* NOTE we _need_ to call chipselect() early, ideally with adapter
 	 * setup, unless the hardware defaults cooperate to avoid confusion
@@ -568,6 +552,10 @@ mpc83xx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq)
 
 	dev_set_drvdata(dev, master);
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH
+			| SPI_LSB_FIRST | SPI_LOOP;
+
 	master->setup = mpc83xx_spi_setup;
 	master->transfer = mpc83xx_spi_transfer;
 	master->cleanup = mpc83xx_spi_cleanup;
diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c
index b3ebc1d0f85..e0d44af4745 100644
--- a/drivers/spi/spi_s3c24xx.c
+++ b/drivers/spi/spi_s3c24xx.c
@@ -146,32 +146,16 @@ static int s3c24xx_spi_setupxfer(struct spi_device *spi,
 	return 0;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
-
 static int s3c24xx_spi_setup(struct spi_device *spi)
 {
 	int ret;
 
-	if (!spi->bits_per_word)
-		spi->bits_per_word = 8;
-
-	if (spi->mode & ~MODEBITS) {
-		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
-			spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	ret = s3c24xx_spi_setupxfer(spi, NULL);
 	if (ret < 0) {
 		dev_err(&spi->dev, "setupxfer returned %d\n", ret);
 		return ret;
 	}
 
-	dev_dbg(&spi->dev, "%s: mode %d, %u bpw, %d hz\n",
-		__func__, spi->mode, spi->bits_per_word,
-		spi->max_speed_hz);
-
 	return 0;
 }
 
@@ -290,6 +274,9 @@ static int __init s3c24xx_spi_probe(struct platform_device *pdev)
 
 	/* setup the master state. */
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
+
 	master->num_chipselect = hw->pdata->num_cs;
 	master->bus_num = pdata->bus_num;
 
diff --git a/drivers/spi/spi_txx9.c b/drivers/spi/spi_txx9.c
index 29cbb065618..96057de133a 100644
--- a/drivers/spi/spi_txx9.c
+++ b/drivers/spi/spi_txx9.c
@@ -110,23 +110,17 @@ static void txx9spi_cs_func(struct spi_device *spi, struct txx9spi *c,
 	ndelay(cs_delay);	/* CS Setup Time / CS Recovery Time */
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS	(SPI_CS_HIGH|SPI_CPOL|SPI_CPHA)
-
 static int txx9spi_setup(struct spi_device *spi)
 {
 	struct txx9spi *c = spi_master_get_devdata(spi->master);
 	u8 bits_per_word;
 
-	if (spi->mode & ~MODEBITS)
-		return -EINVAL;
-
 	if (!spi->max_speed_hz
 			|| spi->max_speed_hz > c->max_speed_hz
 			|| spi->max_speed_hz < c->min_speed_hz)
 		return -EINVAL;
 
-	bits_per_word = spi->bits_per_word ? : 8;
+	bits_per_word = spi->bits_per_word;
 	if (bits_per_word != 8 && bits_per_word != 16)
 		return -EINVAL;
 
@@ -414,6 +408,9 @@ static int __init txx9spi_probe(struct platform_device *dev)
 		 (unsigned long long)res->start, irq,
 		 (c->baseclk + 500000) / 1000000);
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CS_HIGH | SPI_CPOL | SPI_CPHA;
+
 	master->bus_num = dev->id;
 	master->setup = txx9spi_setup;
 	master->transfer = txx9spi_transfer;
diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 494d3f756e2..46b8c5c2f45 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -158,9 +158,6 @@ static int xilinx_spi_setup_transfer(struct spi_device *spi,
 	return 0;
 }
 
-/* the spi->mode bits understood by this driver: */
-#define MODEBITS (SPI_CPOL | SPI_CPHA)
-
 static int xilinx_spi_setup(struct spi_device *spi)
 {
 	struct spi_bitbang *bitbang;
@@ -170,22 +167,10 @@ static int xilinx_spi_setup(struct spi_device *spi)
 	xspi = spi_master_get_devdata(spi->master);
 	bitbang = &xspi->bitbang;
 
-	if (!spi->bits_per_word)
-		spi->bits_per_word = 8;
-
-	if (spi->mode & ~MODEBITS) {
-		dev_err(&spi->dev, "%s, unsupported mode bits %x\n",
-			__func__, spi->mode & ~MODEBITS);
-		return -EINVAL;
-	}
-
 	retval = xilinx_spi_setup_transfer(spi, NULL);
 	if (retval < 0)
 		return retval;
 
-	dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u nsec/bit\n",
-		__func__, spi->mode & MODEBITS, spi->bits_per_word, 0);
-
 	return 0;
 }
 
@@ -333,6 +318,9 @@ static int __init xilinx_spi_of_probe(struct of_device *ofdev,
 		goto put_master;
 	}
 
+	/* the spi->mode bits understood by this driver: */
+	master->mode_bits = SPI_CPOL | SPI_CPHA;
+
 	xspi = spi_master_get_devdata(master);
 	xspi->bitbang.master = spi_master_get(master);
 	xspi->bitbang.chipselect = xilinx_spi_chipselect;
diff --git a/drivers/w1/masters/w1-gpio.c b/drivers/w1/masters/w1-gpio.c
index a411702413d..6f8866d6a90 100644
--- a/drivers/w1/masters/w1-gpio.c
+++ b/drivers/w1/masters/w1-gpio.c
@@ -74,6 +74,9 @@ static int __init w1_gpio_probe(struct platform_device *pdev)
 	if (err)
 		goto free_gpio;
 
+	if (pdata->enable_external_pullup)
+		pdata->enable_external_pullup(1);
+
 	platform_set_drvdata(pdev, master);
 
 	return 0;
@@ -91,6 +94,9 @@ static int __exit w1_gpio_remove(struct platform_device *pdev)
 	struct w1_bus_master *master = platform_get_drvdata(pdev);
 	struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
 
+	if (pdata->enable_external_pullup)
+		pdata->enable_external_pullup(0);
+
 	w1_remove_master_device(master);
 	gpio_free(pdata->pin);
 	kfree(master);
@@ -98,12 +104,41 @@ static int __exit w1_gpio_remove(struct platform_device *pdev)
 	return 0;
 }
 
+#ifdef CONFIG_PM
+
+static int w1_gpio_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
+
+	if (pdata->enable_external_pullup)
+		pdata->enable_external_pullup(0);
+
+	return 0;
+}
+
+static int w1_gpio_resume(struct platform_device *pdev)
+{
+	struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
+
+	if (pdata->enable_external_pullup)
+		pdata->enable_external_pullup(1);
+
+	return 0;
+}
+
+#else
+#define w1_gpio_suspend	NULL
+#define w1_gpio_resume	NULL
+#endif
+
 static struct platform_driver w1_gpio_driver = {
 	.driver = {
 		.name	= "w1-gpio",
 		.owner	= THIS_MODULE,
 	},
 	.remove	= __exit_p(w1_gpio_remove),
+	.suspend = w1_gpio_suspend,
+	.resume = w1_gpio_resume,
 };
 
 static int __init w1_gpio_init(void)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 40381df3486..9fa212b014a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1340,8 +1340,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
+	rcu_read_lock();
+	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1382,8 +1384,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 			psinfo->pr_psargs[i] = ' ';
 	psinfo->pr_psargs[len] = 0;
 
+	rcu_read_lock();
+	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fdb66faa24f..20fbeced472 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1387,8 +1387,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
+	rcu_read_lock();
+	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1432,8 +1434,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 			psinfo->pr_psargs[i] = ' ';
 	psinfo->pr_psargs[len] = 0;
 
+	rcu_read_lock();
+	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5458e80fc55..085c5c06342 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -98,7 +98,7 @@ struct epoll_filefd {
 struct nested_call_node {
 	struct list_head llink;
 	void *cookie;
-	int cpu;
+	void *ctx;
 };
 
 /*
@@ -317,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
  * @nproc: Nested call core function pointer.
  * @priv: Opaque data to be passed to the @nproc callback.
  * @cookie: Cookie to be used to identify this nested call.
+ * @ctx: This instance context.
  *
  * Returns: Returns the code returned by the @nproc callback, or -1 if
  *          the maximum recursion limit has been exceeded.
  */
 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 			  int (*nproc)(void *, void *, int), void *priv,
-			  void *cookie)
+			  void *cookie, void *ctx)
 {
 	int error, call_nests = 0;
 	unsigned long flags;
-	int this_cpu = get_cpu();
 	struct list_head *lsthead = &ncalls->tasks_call_list;
 	struct nested_call_node *tncur;
 	struct nested_call_node tnode;
@@ -340,7 +340,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	 * very much limited.
 	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-		if (tncur->cpu == this_cpu &&
+		if (tncur->ctx == ctx &&
 		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
@@ -352,7 +352,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	}
 
 	/* Add the current task and cookie to the list */
-	tnode.cpu = this_cpu;
+	tnode.ctx = ctx;
 	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
@@ -364,10 +364,9 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	/* Remove the current task from the list */
 	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
- out_unlock:
+out_unlock:
 	spin_unlock_irqrestore(&ncalls->lock, flags);
 
-	put_cpu();
 	return error;
 }
 
@@ -408,8 +407,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
  */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
+	int this_cpu = get_cpu();
+
 	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
-		       ep_poll_wakeup_proc, NULL, wq);
+		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+
+	put_cpu();
 }
 
 /*
@@ -663,7 +666,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep);
+				   ep_poll_readyevents_proc, ep, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 003500498c2..6cde970b0a1 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -450,7 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
-			struct page *page, struct inode *inode)
+		   struct page *page, struct inode *inode, int update_times)
 {
 	loff_t pos = page_offset(page) +
 			(char *) de - (char *) page_address(page);
@@ -465,7 +465,8 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	ext2_set_de_type(de, inode);
 	err = ext2_commit_chunk(page, pos, len);
 	ext2_put_page(page);
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (update_times)
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
 	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(dir);
 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index f2e5811936d..d988a718aed 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -111,7 +111,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
-extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
+extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
 
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 90ea17998a7..6524ecaebb7 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -320,7 +320,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ext2_set_link(new_dir, new_de, new_page, old_inode);
+		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			drop_nlink(new_inode);
@@ -352,7 +352,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ext2_set_link(old_inode, dir_de, dir_page, new_dir);
+		if (old_dir != new_dir)
+			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b0248c6d5d4..05dea8132fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -820,7 +820,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		while (count < maxblocks && count <= blocks_to_boundary) {
 			ext3_fsblk_t blk;
 
-			if (!verify_chain(chain, partial)) {
+			if (!verify_chain(chain, chain + depth - 1)) {
 				/*
 				 * Indirect block might be removed by
 				 * truncate while we were reading it.
@@ -2374,7 +2374,7 @@ void ext3_truncate(struct inode *inode)
 	struct page *page;
 
 	if (!ext3_can_truncate(inode))
-		return;
+		goto out_notrans;
 
 	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
 		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
@@ -2390,7 +2390,7 @@ void ext3_truncate(struct inode *inode)
 		page = grab_cache_page(mapping,
 				inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
-			return;
+			goto out_notrans;
 	}
 
 	handle = start_transaction(inode);
@@ -2401,7 +2401,7 @@ void ext3_truncate(struct inode *inode)
 			unlock_page(page);
 			page_cache_release(page);
 		}
-		return;		/* AKPM: return what? */
+		goto out_notrans;
 	}
 
 	last_block = (inode->i_size + blocksize-1)
@@ -2525,6 +2525,14 @@ out_stop:
 		ext3_orphan_del(handle, inode);
 
 	ext3_journal_stop(handle);
+	return;
+out_notrans:
+	/*
+	 * Delete the inode from orphan list so that it doesn't stay there
+	 * forever and trigger assertion on umount.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(NULL, inode);
 }
 
 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -3122,12 +3130,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 
 	rc = inode_setattr(inode, attr);
 
-	/* If inode_setattr's call to ext3_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
-	if (inode->i_nlink)
-		ext3_orphan_del(NULL, inode);
-
 	if (!rc && (ia_valid & ATTR_MODE))
 		rc = ext3_acl_chmod(inode);
 
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 2f0dc5a1463..8ba5441063b 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -195,9 +195,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 		 * Do not report hidden files if so instructed, or associated
 		 * files unless instructed to do so
 		 */
-		if ((sbi->s_hide == 'y' &&
-				(de->flags[-sbi->s_high_sierra] & 1)) ||
-				(sbi->s_showassoc =='n' &&
+		if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
+		    (!sbi->s_showassoc &&
 				(de->flags[-sbi->s_high_sierra] & 4))) {
 			filp->f_pos += de_len;
 			continue;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 068b34b5a10..58a7963e168 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -141,13 +141,17 @@ static const struct dentry_operations isofs_dentry_ops[] = {
 };
 
 struct iso9660_options{
-	char map;
-	char rock;
+	unsigned int rock:1;
+	unsigned int cruft:1;
+	unsigned int hide:1;
+	unsigned int showassoc:1;
+	unsigned int nocompress:1;
+	unsigned int overriderockperm:1;
+	unsigned int uid_set:1;
+	unsigned int gid_set:1;
+	unsigned int utf8:1;
+	unsigned char map;
 	char joliet;
-	char cruft;
-	char hide;
-	char showassoc;
-	char nocompress;
 	unsigned char check;
 	unsigned int blocksize;
 	mode_t fmode;
@@ -155,7 +159,6 @@ struct iso9660_options{
 	gid_t gid;
 	uid_t uid;
 	char *iocharset;
-	unsigned char utf8;
 	/* LVE */
 	s32 session;
 	s32 sbsector;
@@ -312,7 +315,7 @@ enum {
 	Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
 	Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
 	Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
-	Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
+	Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
 };
 
 static const match_table_t tokens = {
@@ -340,6 +343,7 @@ static const match_table_t tokens = {
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%u"},
 	{Opt_dmode, "dmode=%u"},
+	{Opt_overriderockperm, "overriderockperm"},
 	{Opt_block, "block=%u"},
 	{Opt_ignore, "conv=binary"},
 	{Opt_ignore, "conv=b"},
@@ -359,24 +363,22 @@ static int parse_options(char *options, struct iso9660_options *popt)
 	int option;
 
 	popt->map = 'n';
-	popt->rock = 'y';
-	popt->joliet = 'y';
-	popt->cruft = 'n';
-	popt->hide = 'n';
-	popt->showassoc = 'n';
+	popt->rock = 1;
+	popt->joliet = 1;
+	popt->cruft = 0;
+	popt->hide = 0;
+	popt->showassoc = 0;
 	popt->check = 'u';		/* unset */
 	popt->nocompress = 0;
 	popt->blocksize = 1024;
-	popt->fmode = popt->dmode = S_IRUGO | S_IXUGO; /*
-					 * r-x for all.  The disc could
-					 * be shared with DOS machines so
-					 * virtually anything could be
-					 * a valid executable.
-					 */
+	popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
+	popt->uid_set = 0;
+	popt->gid_set = 0;
 	popt->gid = 0;
 	popt->uid = 0;
 	popt->iocharset = NULL;
 	popt->utf8 = 0;
+	popt->overriderockperm = 0;
 	popt->session=-1;
 	popt->sbsector=-1;
 	if (!options)
@@ -393,20 +395,20 @@ static int parse_options(char *options, struct iso9660_options *popt)
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_norock:
-			popt->rock = 'n';
+			popt->rock = 0;
 			break;
 		case Opt_nojoliet:
-			popt->joliet = 'n';
+			popt->joliet = 0;
 			break;
 		case Opt_hide:
-			popt->hide = 'y';
+			popt->hide = 1;
 			break;
 		case Opt_unhide:
 		case Opt_showassoc:
-			popt->showassoc = 'y';
+			popt->showassoc = 1;
 			break;
 		case Opt_cruft:
-			popt->cruft = 'y';
+			popt->cruft = 1;
 			break;
 		case Opt_utf8:
 			popt->utf8 = 1;
@@ -450,11 +452,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
 			if (match_int(&args[0], &option))
 				return 0;
 			popt->uid = option;
+			popt->uid_set = 1;
 			break;
 		case Opt_gid:
 			if (match_int(&args[0], &option))
 				return 0;
 			popt->gid = option;
+			popt->gid_set = 1;
 			break;
 		case Opt_mode:
 			if (match_int(&args[0], &option))
@@ -466,6 +470,9 @@ static int parse_options(char *options, struct iso9660_options *popt)
 				return 0;
 			popt->dmode = option;
 			break;
+		case Opt_overriderockperm:
+			popt->overriderockperm = 1;
+			break;
 		case Opt_block:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -650,7 +657,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 					goto out_freebh;
 
 				sbi->s_high_sierra = 1;
-				opt.rock = 'n';
+				opt.rock = 0;
 				h_pri = (struct hs_primary_descriptor *)vdp;
 				goto root_found;
 			}
@@ -673,7 +680,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 
 root_found:
 
-	if (joliet_level && (pri == NULL || opt.rock == 'n')) {
+	if (joliet_level && (pri == NULL || !opt.rock)) {
 		/* This is the case of Joliet with the norock mount flag.
 		 * A disc with both Joliet and Rock Ridge is handled later
 		 */
@@ -802,22 +809,31 @@ root_found:
 	s->s_op = &isofs_sops;
 	s->s_export_op = &isofs_export_ops;
 	sbi->s_mapping = opt.map;
-	sbi->s_rock = (opt.rock == 'y' ? 2 : 0);
+	sbi->s_rock = (opt.rock ? 2 : 0);
 	sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
 	sbi->s_cruft = opt.cruft;
 	sbi->s_hide = opt.hide;
 	sbi->s_showassoc = opt.showassoc;
 	sbi->s_uid = opt.uid;
 	sbi->s_gid = opt.gid;
+	sbi->s_uid_set = opt.uid_set;
+	sbi->s_gid_set = opt.gid_set;
 	sbi->s_utf8 = opt.utf8;
 	sbi->s_nocompress = opt.nocompress;
+	sbi->s_overriderockperm = opt.overriderockperm;
 	/*
 	 * It would be incredibly stupid to allow people to mark every file
 	 * on the disk as suid, so we merely allow them to set the default
 	 * permissions.
 	 */
-	sbi->s_fmode = opt.fmode & 0777;
-	sbi->s_dmode = opt.dmode & 0777;
+	if (opt.fmode != ISOFS_INVALID_MODE)
+		sbi->s_fmode = opt.fmode & 0777;
+	else
+		sbi->s_fmode = ISOFS_INVALID_MODE;
+	if (opt.dmode != ISOFS_INVALID_MODE)
+		sbi->s_dmode = opt.dmode & 0777;
+	else
+		sbi->s_dmode = ISOFS_INVALID_MODE;
 
 	/*
 	 * Read the root inode, which _may_ result in changing
@@ -1095,18 +1111,6 @@ static const struct address_space_operations isofs_aops = {
 	.bmap = _isofs_bmap
 };
 
-static inline void test_and_set_uid(uid_t *p, uid_t value)
-{
-	if (value)
-		*p = value;
-}
-
-static inline void test_and_set_gid(gid_t *p, gid_t value)
-{
-        if (value)
-                *p = value;
-}
-
 static int isofs_read_level3_size(struct inode *inode)
 {
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -1261,7 +1265,10 @@ static int isofs_read_inode(struct inode *inode)
 	ei->i_file_format = isofs_file_normal;
 
 	if (de->flags[-high_sierra] & 2) {
-		inode->i_mode = sbi->s_dmode | S_IFDIR;
+		if (sbi->s_dmode != ISOFS_INVALID_MODE)
+			inode->i_mode = S_IFDIR | sbi->s_dmode;
+		else
+			inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 		inode->i_nlink = 1;	/*
 					 * Set to 1.  We know there are 2, but
 					 * the find utility tries to optimize
@@ -1270,8 +1277,16 @@ static int isofs_read_inode(struct inode *inode)
 					 * do it the hard way.
 					 */
 	} else {
-		/* Everybody gets to read the file. */
-		inode->i_mode = sbi->s_fmode | S_IFREG;
+		if (sbi->s_fmode != ISOFS_INVALID_MODE) {
+			inode->i_mode = S_IFREG | sbi->s_fmode;
+		} else {
+			/*
+			 * Set default permissions: r-x for all.  The disc
+			 * could be shared with DOS machines so virtually
+			 * anything could be a valid executable.
+			 */
+			inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
+		}
 		inode->i_nlink = 1;
 	}
 	inode->i_uid = sbi->s_uid;
@@ -1300,7 +1315,7 @@ static int isofs_read_inode(struct inode *inode)
 	 * this CDROM was mounted with the cruft option.
 	 */
 
-	if (sbi->s_cruft == 'y')
+	if (sbi->s_cruft)
 		inode->i_size &= 0x00ffffff;
 
 	if (de->interleave[0]) {
@@ -1346,9 +1361,18 @@ static int isofs_read_inode(struct inode *inode)
 	if (!high_sierra) {
 		parse_rock_ridge_inode(de, inode);
 		/* if we want uid/gid set, override the rock ridge setting */
-		test_and_set_uid(&inode->i_uid, sbi->s_uid);
-		test_and_set_gid(&inode->i_gid, sbi->s_gid);
+		if (sbi->s_uid_set)
+			inode->i_uid = sbi->s_uid;
+		if (sbi->s_gid_set)
+			inode->i_gid = sbi->s_gid;
 	}
+	/* Now set final access rights if overriding rock ridge setting */
+	if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
+	    sbi->s_dmode != ISOFS_INVALID_MODE)
+		inode->i_mode = S_IFDIR | sbi->s_dmode;
+	if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
+	    sbi->s_fmode != ISOFS_INVALID_MODE)
+		inode->i_mode = S_IFREG | sbi->s_fmode;
 
 	/* Install the inode operations vector */
 	if (S_ISREG(inode->i_mode)) {
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index ccbf72faf27..7d33de84f52 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -35,21 +35,20 @@ struct isofs_sb_info {
 	unsigned long s_log_zone_size;
 	unsigned long s_max_size;
 	
-	unsigned char s_high_sierra; /* A simple flag */
-	unsigned char s_mapping;
 	int           s_rock_offset; /* offset of SUSP fields within SU area */
-	unsigned char s_rock;
 	unsigned char s_joliet_level;
-	unsigned char s_utf8;
-	unsigned char s_cruft; /* Broken disks with high
-				  byte of length containing
-				  junk */
-	unsigned char s_unhide;
-	unsigned char s_nosuid;
-	unsigned char s_nodev;
-	unsigned char s_nocompress;
-	unsigned char s_hide;
-	unsigned char s_showassoc;
+	unsigned char s_mapping;
+	unsigned int  s_high_sierra:1;
+	unsigned int  s_rock:2;
+	unsigned int  s_utf8:1;
+	unsigned int  s_cruft:1; /* Broken disks with high byte of length
+				  * containing junk */
+	unsigned int  s_nocompress:1;
+	unsigned int  s_hide:1;
+	unsigned int  s_showassoc:1;
+	unsigned int  s_overriderockperm:1;
+	unsigned int  s_uid_set:1;
+	unsigned int  s_gid_set:1;
 
 	mode_t s_fmode;
 	mode_t s_dmode;
@@ -58,6 +57,8 @@ struct isofs_sb_info {
 	struct nls_table *s_nls_iocharset; /* Native language support table */
 };
 
+#define ISOFS_INVALID_MODE ((mode_t) -1)
+
 static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 8299889a835..eaa831311c9 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -142,9 +142,9 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
 		 */
 		match = 0;
 		if (dlen > 0 &&
-			(sbi->s_hide =='n' ||
+			(!sbi->s_hide ||
 				(!(de->flags[-sbi->s_high_sierra] & 1))) &&
-			(sbi->s_showassoc =='y' ||
+			(sbi->s_showassoc ||
 				(!(de->flags[-sbi->s_high_sierra] & 4)))) {
 			match = (isofs_cmp(dentry, dpnt, dlen) == 0);
 		}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ed886e6db39..73242ba7c7b 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1686,35 +1686,6 @@ out:
 	return;
 }
 
-/*
- * journal_try_to_free_buffers() could race with journal_commit_transaction()
- * The latter might still hold the a count on buffers when inspecting
- * them on t_syncdata_list or t_locked_list.
- *
- * journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * tryinf to free that buffer.
- *
- * Called with journal->j_state_lock held.
- */
-static void journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-	transaction_t *transaction = NULL;
-	tid_t tid;
-
-	spin_lock(&journal->j_state_lock);
-	transaction = journal->j_committing_transaction;
-
-	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
-		return;
-	}
-
-	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
-	log_wait_commit(journal, tid);
-}
-
 /**
  * int journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
@@ -1786,25 +1757,6 @@ int journal_try_to_free_buffers(journal_t *journal,
 
 	ret = try_to_free_buffers(page);
 
-	/*
-	 * There are a number of places where journal_try_to_free_buffers()
-	 * could race with journal_commit_transaction(), the later still
-	 * holds the reference to the buffers to free while processing them.
-	 * try_to_free_buffers() failed to free those buffers. Some of the
-	 * caller of releasepage() request page buffers to be dropped, otherwise
-	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
-	 *
-	 * So, if the caller of try_to_release_page() wants the synchronous
-	 * behaviour(i.e make sure buffers are dropped upon return),
-	 * let's wait for the current transaction to finish flush of
-	 * dirty data buffers, then try to free those buffers again,
-	 * with the journal locked.
-	 */
-	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-		journal_wait_for_transaction_sync_data(journal);
-		ret = try_to_free_buffers(page);
-	}
-
 busy:
 	return ret;
 }
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 63d965193b2..11a7b5c6815 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -18,6 +18,7 @@ proc-y	+= meminfo.o
 proc-y	+= stat.o
 proc-y	+= uptime.o
 proc-y	+= version.o
+proc-y	+= softirqs.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index fc6c3025bef..7ba79a54948 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,20 +195,20 @@ void proc_device_tree_add_node(struct device_node *np,
 			p = fixup_name(np, de, p);
 
 		ent = proc_mkdir(p, de);
-		if (ent == 0)
+		if (ent == NULL)
 			break;
 		proc_device_tree_add_node(child, ent);
 	}
 	of_node_put(child);
 
-	for (pp = np->properties; pp != 0; pp = pp->next) {
+	for (pp = np->properties; pp != NULL; pp = pp->next) {
 		p = pp->name;
 
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
 
 		ent = __proc_device_tree_add_prop(de, pp, p);
-		if (ent == 0)
+		if (ent == NULL)
 			break;
 	}
 }
@@ -221,10 +221,10 @@ void __init proc_device_tree_init(void)
 	struct device_node *root;
 
 	proc_device_tree = proc_mkdir("device-tree", NULL);
-	if (proc_device_tree == 0)
+	if (proc_device_tree == NULL)
 		return;
 	root = of_find_node_by_path("/");
-	if (root == 0) {
+	if (root == NULL) {
 		printk(KERN_ERR "/proc/device-tree: can't find root\n");
 		return;
 	}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
new file mode 100644
index 00000000000..1807c2419f1
--- /dev/null
+++ b/fs/proc/softirqs.c
@@ -0,0 +1,44 @@
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/softirqs  ... display the number of softirqs
+ */
+static int show_softirqs(struct seq_file *p, void *v)
+{
+	int i, j;
+
+	seq_printf(p, "                ");
+	for_each_possible_cpu(i)
+		seq_printf(p, "CPU%-8d", i);
+	seq_printf(p, "\n");
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		seq_printf(p, "%8s:", softirq_to_name[i]);
+		for_each_possible_cpu(j)
+			seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+		seq_printf(p, "\n");
+	}
+	return 0;
+}
+
+static int softirqs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_softirqs, NULL);
+}
+
+static const struct file_operations proc_softirqs_operations = {
+	.open		= softirqs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_softirqs_init(void)
+{
+	proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+	return 0;
+}
+module_init(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81e4eb60972..7cc726c6d70 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -29,6 +29,8 @@ static int show_stat(struct seq_file *p, void *v)
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 	cputime64_t guest;
 	u64 sum = 0;
+	u64 sum_softirq = 0;
+	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
 	unsigned int per_irq_sum;
 
@@ -53,6 +55,13 @@ static int show_stat(struct seq_file *p, void *v)
 			sum += kstat_irqs_cpu(j, i);
 		}
 		sum += arch_irq_stat_cpu(i);
+
+		for (j = 0; j < NR_SOFTIRQS; j++) {
+			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
+
+			per_softirq_sums[j] += softirq_stat;
+			sum_softirq += softirq_stat;
+		}
 	}
 	sum += arch_irq_stat();
 
@@ -115,6 +124,12 @@ static int show_stat(struct seq_file *p, void *v)
 		nr_running(),
 		nr_iowait());
 
+	seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+
+	for (i = 0; i < NR_SOFTIRQS; i++)
+		seq_printf(p, " %u", per_softirq_sums[i]);
+	seq_printf(p, "\n");
+
 	return 0;
 }
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 5edcc3f92ba..0872afa58d3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -166,12 +166,7 @@ static const struct file_operations proc_vmcore_operations = {
 
 static struct vmcore* __init get_new_element(void)
 {
-	struct vmcore *p;
-
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
-	if (p)
-		memset(p, 0, sizeof(*p));
-	return p;
+	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
 }
 
 static u64 __init get_vmcore_size_elf64(char *elfptr)
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4beb964a2a3..128d3f7c8aa 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 
 					RFALSE(ih, "PAP-12210: ih must be 0");
 
-					if (is_direntry_le_ih
-					    (aux_ih =
-					     B_N_PITEM_HEAD(tbS0, item_pos))) {
+					aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
+					if (is_direntry_le_ih(aux_ih)) {
 						/* we append to directory item */
 
 						int entry_count;
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 381750a155f..03d85cbf90b 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 
 	if (last_first == FIRST_TO_LAST) {
 		/* if ( if item in position item_num in buffer SOURCE is directory item ) */
-		if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+		ih = B_N_PITEM_HEAD(src, item_num);
+		if (is_direntry_le_ih(ih))
 			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
 					      item_num, 0, cpy_bytes);
 		else {
@@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 		}
 	} else {
 		/*  if ( if item in position item_num in buffer SOURCE is directory item ) */
-		if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+		ih = B_N_PITEM_HEAD(src, item_num);
+		if (is_direntry_le_ih(ih))
 			leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
 					      item_num,
 					      I_ENTRY_COUNT(ih) - cpy_bytes,
@@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
 			leaf_delete_items_entirely(cur_bi, first + 1,
 						   del_num - 1);
 
-			if (is_direntry_le_ih
-			    (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1)))
+			ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
+			if (is_direntry_le_ih(ih))
 				/* the last item is directory  */
 				/* len = numbers of directory entries in this item */
 				len = ih_entry_count(ih);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7f40f30c55c..6c959275f2d 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s)
 }
 EXPORT_SYMBOL(seq_puts);
 
+/**
+ * seq_write - write arbitrary data to buffer
+ * @seq: seq_file identifying the buffer to which data should be written
+ * @data: data address
+ * @len: number of bytes
+ *
+ * Return 0 on success, non-zero otherwise.
+ */
+int seq_write(struct seq_file *seq, const void *data, size_t len)
+{
+	if (seq->count + len < seq->size) {
+		memcpy(seq->buf + seq->count, data, len);
+		seq->count += len;
+		return 0;
+	}
+	seq->count = seq->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_write);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
 	struct list_head *lh;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3d2512c21f0..7cf33379fd4 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -56,9 +56,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
 
 
 	UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
-	if (i_block < 0) {
-		ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
-	} else if (i_block < direct_blocks) {
+	if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 	} else if ((i_block -= direct_blocks) < indirect_blocks) {
 		offsets[n++] = UFS_IND_BLOCK;
@@ -440,8 +438,6 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
 	lock_kernel();
 
 	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
-	if (fragment < 0)
-		goto abort_negative;
 	if (fragment >
 	    ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
 	     << uspi->s_fpbshift))
@@ -504,10 +500,6 @@ abort:
 	unlock_kernel();
 	return err;
 
-abort_negative:
-	ufs_warning(sb, "ufs_get_block", "block < 0");
-	goto abort;
-
 abort_too_big:
 	ufs_warning(sb, "ufs_get_block", "block > big");
 	goto abort;
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
new file mode 100644
index 00000000000..5406a601185
--- /dev/null
+++ b/include/asm-generic/dma-mapping-common.h
@@ -0,0 +1,190 @@
+#ifndef _ASM_GENERIC_DMA_MAPPING_H
+#define _ASM_GENERIC_DMA_MAPPING_H
+
+#include <linux/kmemcheck.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-debug.h>
+#include <linux/dma-attrs.h>
+
+static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
+					      size_t size,
+					      enum dma_data_direction dir,
+					      struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	dma_addr_t addr;
+
+	kmemcheck_mark_initialized(ptr, size);
+	BUG_ON(!valid_dma_direction(dir));
+	addr = ops->map_page(dev, virt_to_page(ptr),
+			     (unsigned long)ptr & ~PAGE_MASK, size,
+			     dir, attrs);
+	debug_dma_map_page(dev, virt_to_page(ptr),
+			   (unsigned long)ptr & ~PAGE_MASK, size,
+			   dir, addr, true);
+	return addr;
+}
+
+static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
+					  size_t size,
+					  enum dma_data_direction dir,
+					  struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, attrs);
+	debug_dma_unmap_page(dev, addr, size, dir, true);
+}
+
+static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+				   int nents, enum dma_data_direction dir,
+				   struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	int i, ents;
+	struct scatterlist *s;
+
+	for_each_sg(sg, s, nents, i)
+		kmemcheck_mark_initialized(sg_virt(s), s->length);
+	BUG_ON(!valid_dma_direction(dir));
+	ents = ops->map_sg(dev, sg, nents, dir, attrs);
+	debug_dma_map_sg(dev, sg, nents, ents, dir);
+
+	return ents;
+}
+
+static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+				      int nents, enum dma_data_direction dir,
+				      struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	debug_dma_unmap_sg(dev, sg, nents, dir);
+	if (ops->unmap_sg)
+		ops->unmap_sg(dev, sg, nents, dir, attrs);
+}
+
+static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+				      size_t offset, size_t size,
+				      enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	dma_addr_t addr;
+
+	kmemcheck_mark_initialized(page_address(page) + offset, size);
+	BUG_ON(!valid_dma_direction(dir));
+	addr = ops->map_page(dev, page, offset, size, dir, NULL);
+	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
+
+	return addr;
+}
+
+static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
+				  size_t size, enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, NULL);
+	debug_dma_unmap_page(dev, addr, size, dir, false);
+}
+
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+					   size_t size,
+					   enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->sync_single_for_cpu)
+		ops->sync_single_for_cpu(dev, addr, size, dir);
+	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+	flush_write_buffers();
+}
+
+static inline void dma_sync_single_for_device(struct device *dev,
+					      dma_addr_t addr, size_t size,
+					      enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->sync_single_for_device)
+		ops->sync_single_for_device(dev, addr, size, dir);
+	debug_dma_sync_single_for_device(dev, addr, size, dir);
+	flush_write_buffers();
+}
+
+static inline void dma_sync_single_range_for_cpu(struct device *dev,
+						 dma_addr_t addr,
+						 unsigned long offset,
+						 size_t size,
+						 enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->sync_single_range_for_cpu) {
+		ops->sync_single_range_for_cpu(dev, addr, offset, size, dir);
+		debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
+
+		flush_write_buffers();
+	} else
+		dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void dma_sync_single_range_for_device(struct device *dev,
+						    dma_addr_t addr,
+						    unsigned long offset,
+						    size_t size,
+						    enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->sync_single_range_for_device) {
+		ops->sync_single_range_for_device(dev, addr, offset, size, dir);
+		debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
+
+		flush_write_buffers();
+	} else
+		dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+		    int nelems, enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->sync_sg_for_cpu)
+		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+	flush_write_buffers();
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+		       int nelems, enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->sync_sg_for_device)
+		ops->sync_sg_for_device(dev, sg, nelems, dir);
+	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+
+	flush_write_buffers();
+}
+
+#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
+#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL)
+#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
+#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL)
+
+#endif
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 4ce48e87853..d083561337f 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -14,6 +14,9 @@ extern char __kprobes_text_start[], __kprobes_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
+/* Start and end of .ctors section - used for constructor calls. */
+extern char __ctors_start[], __ctors_end[];
+
 /* function descriptor handling (if any).  Override
  * in asm/sections.h */
 #ifndef dereference_function_descriptor
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 6bdba10fef4..55413e568f0 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -440,12 +440,21 @@
 		INIT_TASK						\
 	}
 
+#ifdef CONFIG_CONSTRUCTORS
+#define KERNEL_CTORS()	VMLINUX_SYMBOL(__ctors_start) = .; \
+			*(.ctors)			   \
+			VMLINUX_SYMBOL(__ctors_end) = .;
+#else
+#define KERNEL_CTORS()
+#endif
+
 /* init and exit section handling */
 #define INIT_DATA							\
 	*(.init.data)							\
 	DEV_DISCARD(init.data)						\
 	CPU_DISCARD(init.data)						\
 	MEM_DISCARD(init.data)						\
+	KERNEL_CTORS()							\
 	*(.init.rodata)							\
 	DEV_DISCARD(init.rodata)					\
 	CPU_DISCARD(init.rodata)					\
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index a2df7030d49..03f22076381 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -308,6 +308,7 @@ unifdef-y += pmu.h
 unifdef-y += poll.h
 unifdef-y += ppp_defs.h
 unifdef-y += ppp-comp.h
+unifdef-y += pps.h
 unifdef-y += ptrace.h
 unifdef-y += quota.h
 unifdef-y += random.h
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index 8005effc04f..b721129e046 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -16,6 +16,12 @@
 #define __must_check		__attribute__((warn_unused_result))
 #endif
 
+#ifdef CONFIG_GCOV_KERNEL
+# if __GNUC_MINOR__ < 4
+#   error "GCOV profiling support for gcc versions below 3.4 not included"
+# endif /* __GNUC_MINOR__ */
+#endif /* CONFIG_GCOV_KERNEL */
+
 /*
  * A trick to suppress uninitialized variable warning without generating any
  * code
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 244677cc082..43fc95d822d 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -79,10 +79,6 @@ struct fsl_spi_platform_data {
 	u16	max_chipselect;
 	void	(*cs_control)(struct spi_device *spi, bool on);
 	u32	sysclk;
-
-	/* Legacy hooks, used by mpc52xx_psc_spi driver. */
-	void	(*activate_cs)(u8 cs, u8 polarity);
-	void	(*deactivate_cs)(u8 cs, u8 polarity);
 };
 
 struct mpc8xx_pcmcia_ops {
diff --git a/include/linux/gcd.h b/include/linux/gcd.h
new file mode 100644
index 00000000000..69f5e8a01ba
--- /dev/null
+++ b/include/linux/gcd.h
@@ -0,0 +1,8 @@
+#ifndef _GCD_H
+#define _GCD_H
+
+#include <linux/compiler.h>
+
+unsigned long gcd(unsigned long a, unsigned long b) __attribute_const__;
+
+#endif /* _GCD_H */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index cfdb35d71bc..7c777a0da17 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -99,7 +99,7 @@ struct vm_area_struct;
 			__GFP_NORETRY|__GFP_NOMEMALLOC)
 
 /* Control slab gfp mask during early boot */
-#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
+#define GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
 
 /* Control allocation constraints */
 #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
@@ -348,4 +348,11 @@ static inline void oom_killer_enable(void)
 	oom_killer_disabled = false;
 }
 
+extern gfp_t gfp_allowed_mask;
+
+static inline void set_gfp_allowed_mask(gfp_t mask)
+{
+	gfp_allowed_mask = mask;
+}
+
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/i2c/pca953x.h b/include/linux/i2c/pca953x.h
index 3c7361217df..81736d6a8db 100644
--- a/include/linux/i2c/pca953x.h
+++ b/include/linux/i2c/pca953x.h
@@ -15,4 +15,5 @@ struct pca953x_platform_data {
 	int		(*teardown)(struct i2c_client *client,
 				unsigned gpio, unsigned ngpio,
 				void *context);
+	char		**names;
 };
diff --git a/include/linux/init.h b/include/linux/init.h
index 8c2c9989626..13b633ed695 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -134,6 +134,9 @@ typedef void (*exitcall_t)(void);
 extern initcall_t __con_initcall_start[], __con_initcall_end[];
 extern initcall_t __security_initcall_start[], __security_initcall_end[];
 
+/* Used for contructor calls. */
+typedef void (*ctor_fn_t)(void);
+
 /* Defined in init/main.c */
 extern int do_one_initcall(initcall_t fn);
 extern char __initdata boot_command_line[];
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 3bf40e246a8..e408722a84c 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -94,13 +94,8 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
 #endif
 
 #if defined(CONFIG_IPC_NS)
-extern void free_ipc_ns(struct ipc_namespace *ns);
 extern struct ipc_namespace *copy_ipcs(unsigned long flags,
 				       struct ipc_namespace *ns);
-extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
-		      void (*free)(struct ipc_namespace *,
-				   struct kern_ipc_perm *));
-
 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 {
 	if (ns)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a77c6007dc9..348fa8874b5 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -5,6 +5,7 @@
 #include <linux/threads.h>
 #include <linux/percpu.h>
 #include <linux/cpumask.h>
+#include <linux/interrupt.h>
 #include <asm/irq.h>
 #include <asm/cputime.h>
 
@@ -31,6 +32,7 @@ struct kernel_stat {
 #ifndef CONFIG_GENERIC_HARDIRQS
        unsigned int irqs[NR_IRQS];
 #endif
+	unsigned int softirqs[NR_SOFTIRQS];
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -67,6 +69,16 @@ extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
 
 #endif
 
+static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
+{
+	kstat_this_cpu.softirqs[irq]++;
+}
+
+static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
+{
+       return kstat_cpu(cpu).softirqs[irq];
+}
+
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 45add35dda1..e46a0734ab6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -117,7 +117,7 @@ static inline bool mem_cgroup_disabled(void)
 }
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
-
+void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -271,6 +271,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
+static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
+							int val)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index 505f20dcc1c..098bdb7bfac 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -363,6 +363,12 @@ struct module
 	local_t ref;
 #endif
 #endif
+
+#ifdef CONFIG_CONSTRUCTORS
+	/* Constructor functions. */
+	ctor_fn_t *ctors;
+	unsigned int num_ctors;
+#endif
 };
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/include/linux/pps.h b/include/linux/pps.h
new file mode 100644
index 00000000000..cfe5c7214ec
--- /dev/null
+++ b/include/linux/pps.h
@@ -0,0 +1,122 @@
+/*
+ * PPS API header
+ *
+ * Copyright (C) 2005-2009   Rodolfo Giometti <giometti@linux.it>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#ifndef _PPS_H_
+#define _PPS_H_
+
+#define PPS_VERSION		"5.3.6"
+#define PPS_MAX_SOURCES		16		/* should be enough... */
+
+/* Implementation note: the logical states ``assert'' and ``clear''
+ * are implemented in terms of the chip register, i.e. ``assert''
+ * means the bit is set.  */
+
+/*
+ * 3.2 New data structures
+ */
+
+#define PPS_API_VERS_1		1
+#define PPS_API_VERS		PPS_API_VERS_1	/* we use API version 1 */
+#define PPS_MAX_NAME_LEN	32
+
+/* 32-bit vs. 64-bit compatibility.
+ *
+ * 0n i386, the alignment of a uint64_t is only 4 bytes, while on most other
+ * architectures it's 8 bytes. On i386, there will be no padding between the
+ * two consecutive 'struct pps_ktime' members of struct pps_kinfo and struct
+ * pps_kparams. But on most platforms there will be padding to ensure correct
+ * alignment.
+ *
+ * The simple fix is probably to add an explicit padding.
+ *					 		[David Woodhouse]
+ */
+struct pps_ktime {
+	__s64 sec;
+	__s32 nsec;
+	__u32 flags;
+};
+#define PPS_TIME_INVALID	(1<<0)	/* used to specify timeout==NULL */
+
+struct pps_kinfo {
+	__u32 assert_sequence;		/* seq. num. of assert event */
+	__u32 clear_sequence; 		/* seq. num. of clear event */
+	struct pps_ktime assert_tu;	/* time of assert event */
+	struct pps_ktime clear_tu;	/* time of clear event */
+	int current_mode;		/* current mode bits */
+};
+
+struct pps_kparams {
+	int api_version;		/* API version # */
+	int mode;			/* mode bits */
+	struct pps_ktime assert_off_tu;	/* offset compensation for assert */
+	struct pps_ktime clear_off_tu;	/* offset compensation for clear */
+};
+
+/*
+ * 3.3 Mode bit definitions
+ */
+
+/* Device/implementation parameters */
+#define PPS_CAPTUREASSERT	0x01	/* capture assert events */
+#define PPS_CAPTURECLEAR	0x02	/* capture clear events */
+#define PPS_CAPTUREBOTH		0x03	/* capture assert and clear events */
+
+#define PPS_OFFSETASSERT	0x10	/* apply compensation for assert ev. */
+#define PPS_OFFSETCLEAR		0x20	/* apply compensation for clear ev. */
+
+#define PPS_CANWAIT		0x100	/* can we wait for an event? */
+#define PPS_CANPOLL		0x200	/* bit reserved for future use */
+
+/* Kernel actions */
+#define PPS_ECHOASSERT		0x40	/* feed back assert event to output */
+#define PPS_ECHOCLEAR		0x80	/* feed back clear event to output */
+
+/* Timestamp formats */
+#define PPS_TSFMT_TSPEC		0x1000	/* select timespec format */
+#define PPS_TSFMT_NTPFP		0x2000	/* select NTP format */
+
+/*
+ * 3.4.4 New functions: disciplining the kernel timebase
+ */
+
+/* Kernel consumers */
+#define PPS_KC_HARDPPS		0	/* hardpps() (or equivalent) */
+#define PPS_KC_HARDPPS_PLL	1	/* hardpps() constrained to
+					   use a phase-locked loop */
+#define PPS_KC_HARDPPS_FLL	2	/* hardpps() constrained to
+					   use a frequency-locked loop */
+/*
+ * Here begins the implementation-specific part!
+ */
+
+struct pps_fdata {
+	struct pps_kinfo info;
+	struct pps_ktime timeout;
+};
+
+#include <linux/ioctl.h>
+
+#define PPS_GETPARAMS		_IOR('p', 0xa1, struct pps_kparams *)
+#define PPS_SETPARAMS		_IOW('p', 0xa2, struct pps_kparams *)
+#define PPS_GETCAP		_IOR('p', 0xa3, int *)
+#define PPS_FETCH		_IOWR('p', 0xa4, struct pps_fdata *)
+
+#endif /* _PPS_H_ */
diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
new file mode 100644
index 00000000000..e0a193f830e
--- /dev/null
+++ b/include/linux/pps_kernel.h
@@ -0,0 +1,89 @@
+/*
+ * PPS API kernel header
+ *
+ * Copyright (C) 2009   Rodolfo Giometti <giometti@linux.it>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/pps.h>
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/time.h>
+
+/*
+ * Global defines
+ */
+
+/* The specific PPS source info */
+struct pps_source_info {
+	char name[PPS_MAX_NAME_LEN];		/* simbolic name */
+	char path[PPS_MAX_NAME_LEN];		/* path of connected device */
+	int mode;				/* PPS's allowed mode */
+
+	void (*echo)(int source, int event, void *data); /* PPS echo function */
+
+	struct module *owner;
+	struct device *dev;
+};
+
+/* The main struct */
+struct pps_device {
+	struct pps_source_info info;		/* PSS source info */
+
+	struct pps_kparams params;		/* PPS's current params */
+
+	__u32 assert_sequence;			/* PPS' assert event seq # */
+	__u32 clear_sequence;			/* PPS' clear event seq # */
+	struct pps_ktime assert_tu;
+	struct pps_ktime clear_tu;
+	int current_mode;			/* PPS mode at event time */
+
+	int go;					/* PPS event is arrived? */
+	wait_queue_head_t queue;		/* PPS event queue */
+
+	unsigned int id;			/* PPS source unique ID */
+	struct cdev cdev;
+	struct device *dev;
+	int devno;
+	struct fasync_struct *async_queue;	/* fasync method */
+	spinlock_t lock;
+
+	atomic_t usage;				/* usage count */
+};
+
+/*
+ * Global variables
+ */
+
+extern spinlock_t pps_idr_lock;
+extern struct idr pps_idr;
+extern struct timespec pps_irq_ts[];
+
+extern struct device_attribute pps_attrs[];
+
+/*
+ * Exported functions
+ */
+
+struct pps_device *pps_get_source(int source);
+extern void pps_put_source(struct pps_device *pps);
+extern int pps_register_source(struct pps_source_info *info,
+				int default_params);
+extern void pps_unregister_source(int source);
+extern int pps_register_cdev(struct pps_device *pps);
+extern void pps_unregister_cdev(struct pps_device *pps);
+extern void pps_event(int source, struct pps_ktime *ts, int event, void *data);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 59e133d39d5..7456d7d87a1 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -81,7 +81,6 @@
 
 
 extern long arch_ptrace(struct task_struct *child, long request, long addr, long data);
-extern struct task_struct *ptrace_get_task_struct(pid_t pid);
 extern int ptrace_traceme(void);
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 6ba830fa853..ffa2efbbe38 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -232,7 +232,7 @@ struct mdp_superblock_1 {
 	__le64	reshape_position;	/* next address in array-space for reshape */
 	__le32	delta_disks;	/* change in number of raid_disks		*/
 	__le32	new_layout;	/* new layout					*/
-	__le32	new_chunk;	/* new chunk size (bytes)			*/
+	__le32	new_chunk;	/* new chunk size (512byte sectors)		*/
 	__u8	pad1[128-124];	/* set to 0 when written */
 
 	/* constant this-device information - 64 bytes */
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 2245c78d587..dd31e7bae35 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -660,23 +660,54 @@ static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
 		   cpu_to_le32(type2uniqueness(type)))
 	    : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type));
 }
+
 static inline void set_le_ih_k_type(struct item_head *ih, int type)
 {
 	set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
 }
 
-#define is_direntry_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRENTRY)
-#define is_direct_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRECT)
-#define is_indirect_le_key(version,key) (le_key_k_type (version, key) == TYPE_INDIRECT)
-#define is_statdata_le_key(version,key) (le_key_k_type (version, key) == TYPE_STAT_DATA)
+static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_DIRENTRY;
+}
+
+static inline int is_direct_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_DIRECT;
+}
+
+static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_INDIRECT;
+}
+
+static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
+{
+	return le_key_k_type(version, key) == TYPE_STAT_DATA;
+}
 
 //
 // item header has version.
 //
-#define is_direntry_le_ih(ih) is_direntry_le_key (ih_version (ih), &((ih)->ih_key))
-#define is_direct_le_ih(ih) is_direct_le_key (ih_version (ih), &((ih)->ih_key))
-#define is_indirect_le_ih(ih) is_indirect_le_key (ih_version(ih), &((ih)->ih_key))
-#define is_statdata_le_ih(ih) is_statdata_le_key (ih_version (ih), &((ih)->ih_key))
+static inline int is_direntry_le_ih(struct item_head *ih)
+{
+	return is_direntry_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_direct_le_ih(struct item_head *ih)
+{
+	return is_direct_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_indirect_le_ih(struct item_head *ih)
+{
+	return is_indirect_le_key(ih_version(ih), &ih->ih_key);
+}
+
+static inline int is_statdata_le_ih(struct item_head *ih)
+{
+	return is_statdata_le_key(ih_version(ih), &ih->ih_key);
+}
 
 //
 // key is pointer to cpu key, result is cpu
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 4c5bcf6ca7e..511f42fc681 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -49,6 +49,8 @@ struct res_counter {
 	struct res_counter *parent;
 };
 
+#define RESOURCE_MAX (unsigned long long)LLONG_MAX
+
 /**
  * Helpers to interact with userspace
  * res_counter_read_u64() - returns the value of the specified member.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 02042e7f219..4d075426988 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -92,7 +92,6 @@ struct sched_param {
 
 #include <asm/processor.h>
 
-struct mem_cgroup;
 struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
@@ -1879,9 +1878,6 @@ extern struct pid_namespace init_pid_ns;
 /*
  * find a task by one of its numerical ids
  *
- * find_task_by_pid_type_ns():
- *      it is the most generic call - it finds a task by all id,
- *      type and namespace specified
  * find_task_by_pid_ns():
  *      finds a task by its pid in the specified namespace
  * find_task_by_vpid():
@@ -1890,9 +1886,6 @@ extern struct pid_namespace init_pid_ns;
  * see also find_vpid() etc in include/linux/pid.h
  */
 
-extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
-		struct pid_namespace *ns);
-
 extern struct task_struct *find_task_by_vpid(pid_t nr);
 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 		struct pid_namespace *ns);
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 004f3b3342c..0c6a86b7959 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -43,6 +43,7 @@ int seq_release(struct inode *, struct file *);
 int seq_escape(struct seq_file *, const char *, const char *);
 int seq_putc(struct seq_file *m, char c);
 int seq_puts(struct seq_file *m, const char *s);
+int seq_write(struct seq_file *seq, const void *data, size_t len);
 
 int seq_printf(struct seq_file *, const char *, ...)
 	__attribute__ ((format (printf,2,3)));
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index a0faa18f7b1..9c4cd27f468 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -245,6 +245,9 @@ struct spi_master {
 	 */
 	u16			dma_alignment;
 
+	/* spi_device.mode flags understood by this controller driver */
+	u16			mode_bits;
+
 	/* Setup mode and clock, etc (spi driver may call many times).
 	 *
 	 * IMPORTANT:  this may be called when transfers to another
@@ -523,30 +526,7 @@ static inline void spi_message_free(struct spi_message *m)
 	kfree(m);
 }
 
-/**
- * spi_setup - setup SPI mode and clock rate
- * @spi: the device whose settings are being modified
- * Context: can sleep, and no requests are queued to the device
- *
- * SPI protocol drivers may need to update the transfer mode if the
- * device doesn't work with its default.  They may likewise need
- * to update clock rates or word sizes from initial values.  This function
- * changes those settings, and must be called from a context that can sleep.
- * Except for SPI_CS_HIGH, which takes effect immediately, the changes take
- * effect the next time the device is selected and data is transferred to
- * or from it.  When this function returns, the spi device is deselected.
- *
- * Note that this call will fail if the protocol driver specifies an option
- * that the underlying controller or its driver does not support.  For
- * example, not all hardware supports wire transfers using nine bit words,
- * LSB-first wire encoding, or active-high chipselects.
- */
-static inline int
-spi_setup(struct spi_device *spi)
-{
-	return spi->master->setup(spi);
-}
-
+extern int spi_setup(struct spi_device *spi);
 
 /**
  * spi_async - asynchronous SPI transfer
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0cedf31af0b..c88b36665f7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -319,10 +319,11 @@ static inline void disable_swap_token(void)
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent);
+extern void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
 #else
 static inline void
-mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 {
 }
 #endif
@@ -423,12 +424,6 @@ static inline swp_entry_t get_swap_page(void)
 #define has_swap_token(x) 0
 #define disable_swap_token() do { } while(0)
 
-static inline int mem_cgroup_cache_charge_swapin(struct page *page,
-			struct mm_struct *mm, gfp_t mask, bool locked)
-{
-	return 0;
-}
-
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index eb96603d92d..17ba82efa48 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -143,7 +143,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
  *
  * Return %LSM_UNSAFE_* bits applied to an exec because of tracing.
  *
- * Called with task_lock() held on @task.
+ * @task->cred_guard_mutex is held by the caller through the do_execve().
  */
 static inline int tracehook_unsafe_exec(struct task_struct *task)
 {
diff --git a/include/linux/w1-gpio.h b/include/linux/w1-gpio.h
index 9797fec7748..3adeff82212 100644
--- a/include/linux/w1-gpio.h
+++ b/include/linux/w1-gpio.h
@@ -18,6 +18,7 @@
 struct w1_gpio_platform_data {
 	unsigned int pin;
 	unsigned int is_open_drain:1;
+	void (*enable_external_pullup)(int enable);
 };
 
 #endif /* _LINUX_W1_GPIO_H */
diff --git a/init/Kconfig b/init/Kconfig
index c4b3c6d51a7..1ce05a4cb5f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -16,6 +16,11 @@ config DEFCONFIG_LIST
 	default "$ARCH_DEFCONFIG"
 	default "arch/$ARCH/defconfig"
 
+config CONSTRUCTORS
+	bool
+	depends on !UML
+	default y
+
 menu "General setup"
 
 config EXPERIMENTAL
diff --git a/init/main.c b/init/main.c
index 0e7aedeaa05..09131ec090c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -642,6 +642,10 @@ asmlinkage void __init start_kernel(void)
 				 "enabled early\n");
 	early_boot_irqs_on();
 	local_irq_enable();
+
+	/* Interrupts are enabled now so all GFP allocations are safe. */
+	set_gfp_allowed_mask(__GFP_BITS_MASK);
+
 	kmem_cache_init_late();
 
 	/*
@@ -720,6 +724,17 @@ asmlinkage void __init start_kernel(void)
 	rest_init();
 }
 
+/* Call all constructor functions linked into the kernel. */
+static void __init do_ctors(void)
+{
+#ifdef CONFIG_CONSTRUCTORS
+	ctor_fn_t *call = (ctor_fn_t *) __ctors_start;
+
+	for (; call < (ctor_fn_t *) __ctors_end; call++)
+		(*call)();
+#endif
+}
+
 int initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
 
@@ -800,6 +815,7 @@ static void __init do_basic_setup(void)
 	usermodehelper_init();
 	driver_init();
 	init_irq_proc();
+	do_ctors();
 	do_initcalls();
 }
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 4a5e752a927..a1094ff0bef 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -14,7 +14,7 @@
 
 #include "util.h"
 
-static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
+static struct ipc_namespace *create_ipc_ns(void)
 {
 	struct ipc_namespace *ns;
 	int err;
@@ -48,18 +48,9 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
 
 struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns)
 {
-	struct ipc_namespace *new_ns;
-
-	BUG_ON(!ns);
-	get_ipc_ns(ns);
-
 	if (!(flags & CLONE_NEWIPC))
-		return ns;
-
-	new_ns = clone_ipc_ns(ns);
-
-	put_ipc_ns(ns);
-	return new_ns;
+		return get_ipc_ns(ns);
+	return create_ipc_ns();
 }
 
 /*
@@ -92,6 +83,30 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
 	up_write(&ids->rw_mutex);
 }
 
+static void free_ipc_ns(struct ipc_namespace *ns)
+{
+	/*
+	 * Unregistering the hotplug notifier at the beginning guarantees
+	 * that the ipc namespace won't be freed while we are inside the
+	 * callback routine. Since the blocking_notifier_chain_XXX routines
+	 * hold a rw lock on the notifier list, unregister_ipcns_notifier()
+	 * won't take the rw lock before blocking_notifier_call_chain() has
+	 * released the rd lock.
+	 */
+	unregister_ipcns_notifier(ns);
+	sem_exit_ns(ns);
+	msg_exit_ns(ns);
+	shm_exit_ns(ns);
+	kfree(ns);
+	atomic_dec(&nr_ipc_ns);
+
+	/*
+	 * Do the ipcns removal notification after decrementing nr_ipc_ns in
+	 * order to have a correct value when recomputing msgmni.
+	 */
+	ipcns_notify(IPCNS_REMOVED);
+}
+
 /*
  * put_ipc_ns - drop a reference to an ipc namespace.
  * @ns: the namespace to put
@@ -117,27 +132,3 @@ void put_ipc_ns(struct ipc_namespace *ns)
 		free_ipc_ns(ns);
 	}
 }
-
-void free_ipc_ns(struct ipc_namespace *ns)
-{
-	/*
-	 * Unregistering the hotplug notifier at the beginning guarantees
-	 * that the ipc namespace won't be freed while we are inside the
-	 * callback routine. Since the blocking_notifier_chain_XXX routines
-	 * hold a rw lock on the notifier list, unregister_ipcns_notifier()
-	 * won't take the rw lock before blocking_notifier_call_chain() has
-	 * released the rd lock.
-	 */
-	unregister_ipcns_notifier(ns);
-	sem_exit_ns(ns);
-	msg_exit_ns(ns);
-	shm_exit_ns(ns);
-	kfree(ns);
-	atomic_dec(&nr_ipc_ns);
-
-	/*
-	 * Do the ipcns removal notification after decrementing nr_ipc_ns in
-	 * order to have a correct value when recomputing msgmni.
-	 */
-	ipcns_notify(IPCNS_REMOVED);
-}
diff --git a/ipc/util.h b/ipc/util.h
index 1187332a89d..ab3ebf2621b 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -128,7 +128,7 @@ void ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
 struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
 				      struct ipc64_perm *perm, int extra_perm);
 
-#if defined(__ia64__) || defined(__x86_64__) || defined(__hppa__) || defined(__XTENSA__)
+#ifndef __ARCH_WANT_IPC_PARSE_VERSION
   /* On IA-64, we always use the "64-bit version" of the IPC structures.  */ 
 # define ipc_parse_version(cmd)	IPC_64
 #else
@@ -171,5 +171,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
 struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
 int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
 			struct ipc_ops *ops, struct ipc_params *params);
-
+void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
+		void (*free)(struct ipc_namespace *, struct kern_ipc_perm *));
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 9df4501cb92..0a32cb21ec9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fb789f6df9..3737a682cdf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -843,6 +843,11 @@ static int parse_cgroupfs_options(char *data,
 				     struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data ?: "all";
+	unsigned long mask = (unsigned long)-1;
+
+#ifdef CONFIG_CPUSETS
+	mask = ~(1UL << cpuset_subsys_id);
+#endif
 
 	opts->subsys_bits = 0;
 	opts->flags = 0;
@@ -887,6 +892,15 @@ static int parse_cgroupfs_options(char *data,
 		}
 	}
 
+	/*
+	 * Option noprefix was introduced just for backward compatibility
+	 * with the old cpuset, so we allow noprefix only if mounting just
+	 * the cpuset subsystem.
+	 */
+	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
+	    (opts->subsys_bits & mask))
+		return -EINVAL;
+
 	/* We can't have an empty hierarchy */
 	if (!opts->subsys_bits)
 		return -EINVAL;
diff --git a/kernel/exit.c b/kernel/exit.c
index b6c90b5ef50..13ae64001fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -375,9 +375,8 @@ static void set_special_pids(struct pid *pid)
 }
 
 /*
- * Let kernel threads use this to say that they
- * allow a certain signal (since daemonize() will
- * have disabled all of them by default).
+ * Let kernel threads use this to say that they allow a certain signal.
+ * Must not be used if kthread was cloned with CLONE_SIGHAND.
  */
 int allow_signal(int sig)
 {
@@ -385,14 +384,14 @@ int allow_signal(int sig)
 		return -EINVAL;
 
 	spin_lock_irq(&current->sighand->siglock);
+	/* This is only needed for daemonize()'ed kthreads */
 	sigdelset(&current->blocked, sig);
-	if (!current->mm) {
-		/* Kernel threads handle their own signals.
-		   Let the signal code know it'll be handled, so
-		   that they don't get converted to SIGKILL or
-		   just silently dropped */
-		current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
-	}
+	/*
+	 * Kernel threads handle their own signals. Let the signal code
+	 * know it'll be handled, so that they don't get converted to
+	 * SIGKILL or just silently dropped.
+	 */
+	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 	return 0;
@@ -591,7 +590,7 @@ retry:
 	/*
 	 * Search in the siblings
 	 */
-	list_for_each_entry(c, &p->parent->children, sibling) {
+	list_for_each_entry(c, &p->real_parent->children, sibling) {
 		if (c->mm == mm)
 			goto assign_new_owner;
 	}
@@ -758,7 +757,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
 	p->exit_signal = SIGCHLD;
 
 	/* If it has exited notify the new parent about this child's death. */
-	if (!p->ptrace &&
+	if (!task_ptrace(p) &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		do_notify_parent(p, p->exit_signal);
 		if (task_detached(p)) {
@@ -783,7 +782,7 @@ static void forget_original_parent(struct task_struct *father)
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
 		p->real_parent = reaper;
 		if (p->parent == father) {
-			BUG_ON(p->ptrace);
+			BUG_ON(task_ptrace(p));
 			p->parent = p->real_parent;
 		}
 		reparent_thread(father, p, &dead_children);
@@ -1081,6 +1080,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
 	return 0;
 }
 
+struct wait_opts {
+	enum pid_type		wo_type;
+	int			wo_flags;
+	struct pid		*wo_pid;
+
+	struct siginfo __user	*wo_info;
+	int __user		*wo_stat;
+	struct rusage __user	*wo_rusage;
+
+	int			notask_error;
+};
+
 static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 {
 	struct pid *pid = NULL;
@@ -1091,13 +1102,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 	return pid;
 }
 
-static int eligible_child(enum pid_type type, struct pid *pid, int options,
-			  struct task_struct *p)
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 {
 	int err;
 
-	if (type < PIDTYPE_MAX) {
-		if (task_pid_type(p, type) != pid)
+	if (wo->wo_type < PIDTYPE_MAX) {
+		if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
 			return 0;
 	}
 
@@ -1106,8 +1116,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 	 * set; otherwise, wait for non-clone children *only*.  (Note:
 	 * A "clone" child here is one that reports to its parent
 	 * using a signal other than SIGCHLD.) */
-	if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
-	    && !(options & __WALL))
+	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+	    && !(wo->wo_flags & __WALL))
 		return 0;
 
 	err = security_task_wait(p);
@@ -1117,14 +1127,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 	return 1;
 }
 
-static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
-			       int why, int status,
-			       struct siginfo __user *infop,
-			       struct rusage __user *rusagep)
+static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
+				pid_t pid, uid_t uid, int why, int status)
 {
-	int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+	struct siginfo __user *infop;
+	int retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 
 	put_task_struct(p);
+	infop = wo->wo_info;
 	if (!retval)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval)
@@ -1148,19 +1159,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct task_struct *p, int options,
-			    struct siginfo __user *infop,
-			    int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 {
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 	uid_t uid = __task_cred(p)->uid;
+	struct siginfo __user *infop;
 
-	if (!likely(options & WEXITED))
+	if (!likely(wo->wo_flags & WEXITED))
 		return 0;
 
-	if (unlikely(options & WNOWAIT)) {
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
 		int exit_code = p->exit_code;
 		int why, status;
 
@@ -1173,8 +1183,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
 			status = exit_code & 0x7f;
 		}
-		return wait_noreap_copyout(p, pid, uid, why,
-					   status, infop, ru);
+		return wait_noreap_copyout(wo, p, pid, uid, why, status);
 	}
 
 	/*
@@ -1192,7 +1201,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (likely(!traced)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
-		struct task_cputime cputime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1205,26 +1213,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * p->signal fields, because they are only touched by
 		 * __exit_signal, which runs with tasklist_lock
 		 * write-locked anyway, and so is excluded here.  We do
-		 * need to protect the access to p->parent->signal fields,
+		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
-		 *
-		 * We use thread_group_cputime() to get times for the thread
-		 * group, which consolidates times for all threads in the
-		 * group including the group leader.
 		 */
-		thread_group_cputime(p, &cputime);
-		spin_lock_irq(&p->parent->sighand->siglock);
-		psig = p->parent->signal;
+		spin_lock_irq(&p->real_parent->sighand->siglock);
+		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(cputime.utime,
-				    sig->cutime));
+			cputime_add(p->utime,
+			cputime_add(sig->utime,
+				    sig->cutime)));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(cputime.stime,
-				    sig->cstime));
+			cputime_add(p->stime,
+			cputime_add(sig->stime,
+				    sig->cstime)));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
@@ -1246,7 +1251,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 			sig->oublock + sig->coublock;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
-		spin_unlock_irq(&p->parent->sighand->siglock);
+		spin_unlock_irq(&p->real_parent->sighand->siglock);
 	}
 
 	/*
@@ -1255,11 +1260,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
-	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-	if (!retval && stat_addr)
-		retval = put_user(status, stat_addr);
+	if (!retval && wo->wo_stat)
+		retval = put_user(status, wo->wo_stat);
+
+	infop = wo->wo_info;
 	if (!retval && infop)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval && infop)
@@ -1327,15 +1335,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_stopped(int ptrace, struct task_struct *p,
-			     int options, struct siginfo __user *infop,
-			     int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_stopped(struct wait_opts *wo,
+				int ptrace, struct task_struct *p)
 {
+	struct siginfo __user *infop;
 	int retval, exit_code, *p_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
-	if (!(options & WUNTRACED))
+	/*
+	 * Traditionally we see ptrace'd stopped tasks regardless of options.
+	 */
+	if (!ptrace && !(wo->wo_flags & WUNTRACED))
 		return 0;
 
 	exit_code = 0;
@@ -1349,7 +1360,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	if (!exit_code)
 		goto unlock_sig;
 
-	if (!unlikely(options & WNOWAIT))
+	if (!unlikely(wo->wo_flags & WNOWAIT))
 		*p_code = 0;
 
 	/* don't need the RCU readlock here as we're holding a spinlock */
@@ -1371,14 +1382,15 @@ unlock_sig:
 	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
-	if (unlikely(options & WNOWAIT))
-		return wait_noreap_copyout(p, pid, uid,
-					   why, exit_code,
-					   infop, ru);
+	if (unlikely(wo->wo_flags & WNOWAIT))
+		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
+
+	retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+	if (!retval && wo->wo_stat)
+		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
 
-	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
-	if (!retval && stat_addr)
-		retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+	infop = wo->wo_info;
 	if (!retval && infop)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval && infop)
@@ -1405,15 +1417,13 @@ unlock_sig:
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_continued(struct task_struct *p, int options,
-			       struct siginfo __user *infop,
-			       int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 {
 	int retval;
 	pid_t pid;
 	uid_t uid;
 
-	if (!unlikely(options & WCONTINUED))
+	if (!unlikely(wo->wo_flags & WCONTINUED))
 		return 0;
 
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1425,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int options,
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
-	if (!unlikely(options & WNOWAIT))
+	if (!unlikely(wo->wo_flags & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	uid = __task_cred(p)->uid;
 	spin_unlock_irq(&p->sighand->siglock);
@@ -1434,17 +1444,17 @@ static int wait_task_continued(struct task_struct *p, int options,
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
-	if (!infop) {
-		retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	if (!wo->wo_info) {
+		retval = wo->wo_rusage
+			? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 		put_task_struct(p);
-		if (!retval && stat_addr)
-			retval = put_user(0xffff, stat_addr);
+		if (!retval && wo->wo_stat)
+			retval = put_user(0xffff, wo->wo_stat);
 		if (!retval)
 			retval = pid;
 	} else {
-		retval = wait_noreap_copyout(p, pid, uid,
-					     CLD_CONTINUED, SIGCONT,
-					     infop, ru);
+		retval = wait_noreap_copyout(wo, p, pid, uid,
+					     CLD_CONTINUED, SIGCONT);
 		BUG_ON(retval == 0);
 	}
 
@@ -1454,19 +1464,16 @@ static int wait_task_continued(struct task_struct *p, int options,
 /*
  * Consider @p for a wait by @parent.
  *
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child,
+ * then ->notask_error is 0 if @p is an eligible child,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int wait_consider_task(struct task_struct *parent, int ptrace,
-			      struct task_struct *p, int *notask_error,
-			      enum pid_type type, struct pid *pid, int options,
-			      struct siginfo __user *infop,
-			      int __user *stat_addr, struct rusage __user *ru)
+static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
+				int ptrace, struct task_struct *p)
 {
-	int ret = eligible_child(type, pid, options, p);
+	int ret = eligible_child(wo, p);
 	if (!ret)
 		return ret;
 
@@ -1478,17 +1485,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		 * to look for security policy problems, rather
 		 * than for mysterious wait bugs.
 		 */
-		if (*notask_error)
-			*notask_error = ret;
+		if (wo->notask_error)
+			wo->notask_error = ret;
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(p->ptrace)) {
+	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
 		 */
-		*notask_error = 0;
+		wo->notask_error = 0;
 		return 0;
 	}
 
@@ -1499,34 +1506,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 	 * We don't reap group leaders with subthreads.
 	 */
 	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
-		return wait_task_zombie(p, options, infop, stat_addr, ru);
+		return wait_task_zombie(wo, p);
 
 	/*
 	 * It's stopped or running now, so it might
 	 * later continue, exit, or stop again.
 	 */
-	*notask_error = 0;
+	wo->notask_error = 0;
 
 	if (task_stopped_code(p, ptrace))
-		return wait_task_stopped(ptrace, p, options,
-					 infop, stat_addr, ru);
+		return wait_task_stopped(wo, ptrace, p);
 
-	return wait_task_continued(p, options, infop, stat_addr, ru);
+	return wait_task_continued(wo, p);
 }
 
 /*
  * Do the work of do_wait() for one thread in the group, @tsk.
  *
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children,
+ * ->notask_error is 0 if there were any eligible children,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int do_wait_thread(struct task_struct *tsk, int *notask_error,
-			  enum pid_type type, struct pid *pid, int options,
-			  struct siginfo __user *infop, int __user *stat_addr,
-			  struct rusage __user *ru)
+static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
 	struct task_struct *p;
 
@@ -1535,9 +1538,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 		 * Do not consider detached threads.
 		 */
 		if (!task_detached(p)) {
-			int ret = wait_consider_task(tsk, 0, p, notask_error,
-						     type, pid, options,
-						     infop, stat_addr, ru);
+			int ret = wait_consider_task(wo, tsk, 0, p);
 			if (ret)
 				return ret;
 		}
@@ -1546,22 +1547,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 	return 0;
 }
 
-static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
-			  enum pid_type type, struct pid *pid, int options,
-			  struct siginfo __user *infop, int __user *stat_addr,
-			  struct rusage __user *ru)
+static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 {
 	struct task_struct *p;
 
-	/*
-	 * Traditionally we see ptrace'd stopped tasks regardless of options.
-	 */
-	options |= WUNTRACED;
-
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(tsk, 1, p, notask_error,
-					     type, pid, options,
-					     infop, stat_addr, ru);
+		int ret = wait_consider_task(wo, tsk, 1, p);
 		if (ret)
 			return ret;
 	}
@@ -1569,65 +1560,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
 	return 0;
 }
 
-static long do_wait(enum pid_type type, struct pid *pid, int options,
-		    struct siginfo __user *infop, int __user *stat_addr,
-		    struct rusage __user *ru)
+static long do_wait(struct wait_opts *wo)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
 	int retval;
 
-	trace_sched_process_wait(pid);
+	trace_sched_process_wait(wo->wo_pid);
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
 	/*
 	 * If there is nothing that can match our critiera just get out.
-	 * We will clear @retval to zero if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
+	 * We will clear ->notask_error to zero if we see any child that
+	 * might later match our criteria, even if we are not able to reap
+	 * it yet.
 	 */
-	retval = -ECHILD;
-	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
-		goto end;
+	wo->notask_error = -ECHILD;
+	if ((wo->wo_type < PIDTYPE_MAX) &&
+	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+		goto notask;
 
-	current->state = TASK_INTERRUPTIBLE;
+	set_current_state(TASK_INTERRUPTIBLE);
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		int tsk_result = do_wait_thread(tsk, &retval,
-						type, pid, options,
-						infop, stat_addr, ru);
-		if (!tsk_result)
-			tsk_result = ptrace_do_wait(tsk, &retval,
-						    type, pid, options,
-						    infop, stat_addr, ru);
-		if (tsk_result) {
-			/*
-			 * tasklist_lock is unlocked and we have a final result.
-			 */
-			retval = tsk_result;
+		retval = do_wait_thread(wo, tsk);
+		if (retval)
+			goto end;
+
+		retval = ptrace_do_wait(wo, tsk);
+		if (retval)
 			goto end;
-		}
 
-		if (options & __WNOTHREAD)
+		if (wo->wo_flags & __WNOTHREAD)
 			break;
-		tsk = next_thread(tsk);
-		BUG_ON(tsk->signal != current->signal);
-	} while (tsk != current);
+	} while_each_thread(current, tsk);
 	read_unlock(&tasklist_lock);
 
-	if (!retval && !(options & WNOHANG)) {
+notask:
+	retval = wo->notask_error;
+	if (!retval && !(wo->wo_flags & WNOHANG)) {
 		retval = -ERESTARTSYS;
 		if (!signal_pending(current)) {
 			schedule();
 			goto repeat;
 		}
 	}
-
 end:
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
-	if (infop) {
+	if (wo->wo_info) {
+		struct siginfo __user *infop = wo->wo_info;
+
 		if (retval > 0)
 			retval = 0;
 		else {
@@ -1656,6 +1641,7 @@ end:
 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 		infop, int, options, struct rusage __user *, ru)
 {
+	struct wait_opts wo;
 	struct pid *pid = NULL;
 	enum pid_type type;
 	long ret;
@@ -1685,7 +1671,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 
 	if (type < PIDTYPE_MAX)
 		pid = find_get_pid(upid);
-	ret = do_wait(type, pid, options, infop, NULL, ru);
+
+	wo.wo_type	= type;
+	wo.wo_pid	= pid;
+	wo.wo_flags	= options;
+	wo.wo_info	= infop;
+	wo.wo_stat	= NULL;
+	wo.wo_rusage	= ru;
+	ret = do_wait(&wo);
 	put_pid(pid);
 
 	/* avoid REGPARM breakage on x86: */
@@ -1696,6 +1689,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		int, options, struct rusage __user *, ru)
 {
+	struct wait_opts wo;
 	struct pid *pid = NULL;
 	enum pid_type type;
 	long ret;
@@ -1717,7 +1711,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		pid = find_get_pid(upid);
 	}
 
-	ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
+	wo.wo_type	= type;
+	wo.wo_pid	= pid;
+	wo.wo_flags	= options | WEXITED;
+	wo.wo_info	= NULL;
+	wo.wo_stat	= stat_addr;
+	wo.wo_rusage	= ru;
+	ret = do_wait(&wo);
 	put_pid(pid);
 
 	/* avoid REGPARM breakage on x86: */
diff --git a/kernel/fork.c b/kernel/fork.c
index be022c200da..467746b3f0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1029,7 +1029,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 
-	clear_tsk_thread_flag(p, TIF_SIGPENDING);
 	init_sigpending(&p->pending);
 
 	p->utime = cputime_zero;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 00000000000..22e9dcfaa3d
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
+menu "GCOV-based kernel profiling"
+
+config GCOV_KERNEL
+	bool "Enable gcov-based kernel profiling"
+	depends on DEBUG_FS && CONSTRUCTORS
+	default n
+	---help---
+	This option enables gcov-based code profiling (e.g. for code coverage
+	measurements).
+
+	If unsure, say N.
+
+	Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
+	for the entire kernel. To enable profiling for specific files or
+	directories, add a line similar to the following to the respective
+	Makefile:
+
+	For a single file (e.g. main.o):
+	        GCOV_PROFILE_main.o := y
+
+	For all files in one directory:
+	        GCOV_PROFILE := y
+
+	To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
+	is specified, use:
+
+	        GCOV_PROFILE_main.o := n
+	and:
+	        GCOV_PROFILE := n
+
+	Note that the debugfs filesystem has to be mounted to access
+	profiling data.
+
+config GCOV_PROFILE_ALL
+	bool "Profile entire Kernel"
+	depends on GCOV_KERNEL
+	depends on S390 || X86
+	default n
+	---help---
+	This options activates profiling for the entire kernel.
+
+	If unsure, say N.
+
+	Note that a kernel compiled with profiling flags will be significantly
+	larger and run slower. Also be sure to exclude files from profiling
+	which are not linked to the kernel image to prevent linker errors.
+
+endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 00000000000..3f761001d51
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 00000000000..9b22d03cc58
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
+/*
+ *  This code maintains a list of active profiling data structures.
+ *
+ *    Copyright IBM Corp. 2009
+ *    Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ *    Uses gcc-internal data definitions.
+ *    Based on the gcov-kernel patch by:
+ *		 Hubertus Franke <frankeh@us.ibm.com>
+ *		 Nigel Hinds <nhinds@us.ibm.com>
+ *		 Rajan Ravindran <rajancr@us.ibm.com>
+ *		 Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *		 Paul Larson
+ */
+
+#define pr_fmt(fmt)	"gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+static struct gcov_info *gcov_info_head;
+static int gcov_events_enabled;
+static DEFINE_MUTEX(gcov_lock);
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+	static unsigned int gcov_version;
+
+	mutex_lock(&gcov_lock);
+	if (gcov_version == 0) {
+		gcov_version = info->version;
+		/*
+		 * Printing gcc's version magic may prove useful for debugging
+		 * incompatibility reports.
+		 */
+		pr_info("version magic: 0x%x\n", gcov_version);
+	}
+	/*
+	 * Add new profiling data structure to list and inform event
+	 * listener.
+	 */
+	info->next = gcov_info_head;
+	gcov_info_head = info;
+	if (gcov_events_enabled)
+		gcov_event(GCOV_ADD, info);
+	mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+/**
+ * gcov_enable_events - enable event reporting through gcov_event()
+ *
+ * Turn on reporting of profiling data load/unload-events through the
+ * gcov_event() callback. Also replay all previous events once. This function
+ * is needed because some events are potentially generated too early for the
+ * callback implementation to handle them initially.
+ */
+void gcov_enable_events(void)
+{
+	struct gcov_info *info;
+
+	mutex_lock(&gcov_lock);
+	gcov_events_enabled = 1;
+	/* Perform event callback for previously registered entries. */
+	for (info = gcov_info_head; info; info = info->next)
+		gcov_event(GCOV_ADD, info);
+	mutex_unlock(&gcov_lock);
+}
+
+#ifdef CONFIG_MODULES
+static inline int within(void *addr, void *start, unsigned long size)
+{
+	return ((addr >= start) && (addr < start + size));
+}
+
+/* Update list and generate events when modules are unloaded. */
+static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
+				void *data)
+{
+	struct module *mod = data;
+	struct gcov_info *info;
+	struct gcov_info *prev;
+
+	if (event != MODULE_STATE_GOING)
+		return NOTIFY_OK;
+	mutex_lock(&gcov_lock);
+	prev = NULL;
+	/* Remove entries located in module from linked list. */
+	for (info = gcov_info_head; info; info = info->next) {
+		if (within(info, mod->module_core, mod->core_size)) {
+			if (prev)
+				prev->next = info->next;
+			else
+				gcov_info_head = info->next;
+			if (gcov_events_enabled)
+				gcov_event(GCOV_REMOVE, info);
+		} else
+			prev = info;
+	}
+	mutex_unlock(&gcov_lock);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block gcov_nb = {
+	.notifier_call	= gcov_module_notifier,
+};
+
+static int __init gcov_init(void)
+{
+	return register_module_notifier(&gcov_nb);
+}
+device_initcall(gcov_init);
+#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 00000000000..ef3c3f88a7a
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
+/*
+ *  This code exports profiling data as debugfs files to userspace.
+ *
+ *    Copyright IBM Corp. 2009
+ *    Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ *    Uses gcc-internal data definitions.
+ *    Based on the gcov-kernel patch by:
+ *		 Hubertus Franke <frankeh@us.ibm.com>
+ *		 Nigel Hinds <nhinds@us.ibm.com>
+ *		 Rajan Ravindran <rajancr@us.ibm.com>
+ *		 Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *		 Paul Larson
+ *		 Yi CDL Yang
+ */
+
+#define pr_fmt(fmt)	"gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include "gcov.h"
+
+/**
+ * struct gcov_node - represents a debugfs entry
+ * @list: list head for child node list
+ * @children: child nodes
+ * @all: list head for list of all nodes
+ * @parent: parent node
+ * @info: associated profiling data structure if not a directory
+ * @ghost: when an object file containing profiling data is unloaded we keep a
+ *         copy of the profiling data here to allow collecting coverage data
+ *         for cleanup code. Such a node is called a "ghost".
+ * @dentry: main debugfs entry, either a directory or data file
+ * @links: associated symbolic links
+ * @name: data file basename
+ *
+ * struct gcov_node represents an entity within the gcov/ subdirectory
+ * of debugfs. There are directory and data file nodes. The latter represent
+ * the actual synthesized data file plus any associated symbolic links which
+ * are needed by the gcov tool to work correctly.
+ */
+struct gcov_node {
+	struct list_head list;
+	struct list_head children;
+	struct list_head all;
+	struct gcov_node *parent;
+	struct gcov_info *info;
+	struct gcov_info *ghost;
+	struct dentry *dentry;
+	struct dentry **links;
+	char name[0];
+};
+
+static const char objtree[] = OBJTREE;
+static const char srctree[] = SRCTREE;
+static struct gcov_node root_node;
+static struct dentry *reset_dentry;
+static LIST_HEAD(all_head);
+static DEFINE_MUTEX(node_lock);
+
+/* If non-zero, keep copies of profiling data for unloaded modules. */
+static int gcov_persist = 1;
+
+static int __init gcov_persist_setup(char *str)
+{
+	unsigned long val;
+
+	if (strict_strtoul(str, 0, &val)) {
+		pr_warning("invalid gcov_persist parameter '%s'\n", str);
+		return 0;
+	}
+	gcov_persist = val;
+	pr_info("setting gcov_persist to %d\n", gcov_persist);
+
+	return 1;
+}
+__setup("gcov_persist=", gcov_persist_setup);
+
+/*
+ * seq_file.start() implementation for gcov data files. Note that the
+ * gcov_iterator interface is designed to be more restrictive than seq_file
+ * (no start from arbitrary position, etc.), to simplify the iterator
+ * implementation.
+ */
+static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	loff_t i;
+
+	gcov_iter_start(seq->private);
+	for (i = 0; i < *pos; i++) {
+		if (gcov_iter_next(seq->private))
+			return NULL;
+	}
+	return seq->private;
+}
+
+/* seq_file.next() implementation for gcov data files. */
+static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
+{
+	struct gcov_iterator *iter = data;
+
+	if (gcov_iter_next(iter))
+		return NULL;
+	(*pos)++;
+
+	return iter;
+}
+
+/* seq_file.show() implementation for gcov data files. */
+static int gcov_seq_show(struct seq_file *seq, void *data)
+{
+	struct gcov_iterator *iter = data;
+
+	if (gcov_iter_write(iter, seq))
+		return -EINVAL;
+	return 0;
+}
+
+static void gcov_seq_stop(struct seq_file *seq, void *data)
+{
+	/* Unused. */
+}
+
+static const struct seq_operations gcov_seq_ops = {
+	.start	= gcov_seq_start,
+	.next	= gcov_seq_next,
+	.show	= gcov_seq_show,
+	.stop	= gcov_seq_stop,
+};
+
+/*
+ * Return the profiling data set for a given node. This can either be the
+ * original profiling data structure or a duplicate (also called "ghost")
+ * in case the associated object file has been unloaded.
+ */
+static struct gcov_info *get_node_info(struct gcov_node *node)
+{
+	if (node->info)
+		return node->info;
+
+	return node->ghost;
+}
+
+/*
+ * open() implementation for gcov data files. Create a copy of the profiling
+ * data set and initialize the iterator and seq_file interface.
+ */
+static int gcov_seq_open(struct inode *inode, struct file *file)
+{
+	struct gcov_node *node = inode->i_private;
+	struct gcov_iterator *iter;
+	struct seq_file *seq;
+	struct gcov_info *info;
+	int rc = -ENOMEM;
+
+	mutex_lock(&node_lock);
+	/*
+	 * Read from a profiling data copy to minimize reference tracking
+	 * complexity and concurrent access.
+	 */
+	info = gcov_info_dup(get_node_info(node));
+	if (!info)
+		goto out_unlock;
+	iter = gcov_iter_new(info);
+	if (!iter)
+		goto err_free_info;
+	rc = seq_open(file, &gcov_seq_ops);
+	if (rc)
+		goto err_free_iter_info;
+	seq = file->private_data;
+	seq->private = iter;
+out_unlock:
+	mutex_unlock(&node_lock);
+	return rc;
+
+err_free_iter_info:
+	gcov_iter_free(iter);
+err_free_info:
+	gcov_info_free(info);
+	goto out_unlock;
+}
+
+/*
+ * release() implementation for gcov data files. Release resources allocated
+ * by open().
+ */
+static int gcov_seq_release(struct inode *inode, struct file *file)
+{
+	struct gcov_iterator *iter;
+	struct gcov_info *info;
+	struct seq_file *seq;
+
+	seq = file->private_data;
+	iter = seq->private;
+	info = gcov_iter_get_info(iter);
+	gcov_iter_free(iter);
+	gcov_info_free(info);
+	seq_release(inode, file);
+
+	return 0;
+}
+
+/*
+ * Find a node by the associated data file name. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *get_node_by_name(const char *name)
+{
+	struct gcov_node *node;
+	struct gcov_info *info;
+
+	list_for_each_entry(node, &all_head, all) {
+		info = get_node_info(node);
+		if (info && (strcmp(info->filename, name) == 0))
+			return node;
+	}
+
+	return NULL;
+}
+
+static void remove_node(struct gcov_node *node);
+
+/*
+ * write() implementation for gcov data files. Reset profiling data for the
+ * associated file. If the object file has been unloaded (i.e. this is
+ * a "ghost" node), remove the debug fs node as well.
+ */
+static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
+			      size_t len, loff_t *pos)
+{
+	struct seq_file *seq;
+	struct gcov_info *info;
+	struct gcov_node *node;
+
+	seq = file->private_data;
+	info = gcov_iter_get_info(seq->private);
+	mutex_lock(&node_lock);
+	node = get_node_by_name(info->filename);
+	if (node) {
+		/* Reset counts or remove node for unloaded modules. */
+		if (node->ghost)
+			remove_node(node);
+		else
+			gcov_info_reset(node->info);
+	}
+	/* Reset counts for open file. */
+	gcov_info_reset(info);
+	mutex_unlock(&node_lock);
+
+	return len;
+}
+
+/*
+ * Given a string <path> representing a file path of format:
+ *   path/to/file.gcda
+ * construct and return a new string:
+ *   <dir/>path/to/file.<ext>
+ */
+static char *link_target(const char *dir, const char *path, const char *ext)
+{
+	char *target;
+	char *old_ext;
+	char *copy;
+
+	copy = kstrdup(path, GFP_KERNEL);
+	if (!copy)
+		return NULL;
+	old_ext = strrchr(copy, '.');
+	if (old_ext)
+		*old_ext = '\0';
+	if (dir)
+		target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
+	else
+		target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
+	kfree(copy);
+
+	return target;
+}
+
+/*
+ * Construct a string representing the symbolic link target for the given
+ * gcov data file name and link type. Depending on the link type and the
+ * location of the data file, the link target can either point to a
+ * subdirectory of srctree, objtree or in an external location.
+ */
+static char *get_link_target(const char *filename, const struct gcov_link *ext)
+{
+	const char *rel;
+	char *result;
+
+	if (strncmp(filename, objtree, strlen(objtree)) == 0) {
+		rel = filename + strlen(objtree) + 1;
+		if (ext->dir == SRC_TREE)
+			result = link_target(srctree, rel, ext->ext);
+		else
+			result = link_target(objtree, rel, ext->ext);
+	} else {
+		/* External compilation. */
+		result = link_target(NULL, filename, ext->ext);
+	}
+
+	return result;
+}
+
+#define SKEW_PREFIX	".tmp_"
+
+/*
+ * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
+ * for filename skewing caused by the mod-versioning mechanism.
+ */
+static const char *deskew(const char *basename)
+{
+	if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
+		return basename + sizeof(SKEW_PREFIX) - 1;
+	return basename;
+}
+
+/*
+ * Create links to additional files (usually .c and .gcno files) which the
+ * gcov tool expects to find in the same directory as the gcov data file.
+ */
+static void add_links(struct gcov_node *node, struct dentry *parent)
+{
+	char *basename;
+	char *target;
+	int num;
+	int i;
+
+	for (num = 0; gcov_link[num].ext; num++)
+		/* Nothing. */;
+	node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
+	if (!node->links)
+		return;
+	for (i = 0; i < num; i++) {
+		target = get_link_target(get_node_info(node)->filename,
+					 &gcov_link[i]);
+		if (!target)
+			goto out_err;
+		basename = strrchr(target, '/');
+		if (!basename)
+			goto out_err;
+		basename++;
+		node->links[i] = debugfs_create_symlink(deskew(basename),
+							parent,	target);
+		if (!node->links[i])
+			goto out_err;
+		kfree(target);
+	}
+
+	return;
+out_err:
+	kfree(target);
+	while (i-- > 0)
+		debugfs_remove(node->links[i]);
+	kfree(node->links);
+	node->links = NULL;
+}
+
+static const struct file_operations gcov_data_fops = {
+	.open		= gcov_seq_open,
+	.release	= gcov_seq_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= gcov_seq_write,
+};
+
+/* Basic initialization of a new node. */
+static void init_node(struct gcov_node *node, struct gcov_info *info,
+		      const char *name, struct gcov_node *parent)
+{
+	INIT_LIST_HEAD(&node->list);
+	INIT_LIST_HEAD(&node->children);
+	INIT_LIST_HEAD(&node->all);
+	node->info = info;
+	node->parent = parent;
+	if (name)
+		strcpy(node->name, name);
+}
+
+/*
+ * Create a new node and associated debugfs entry. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *new_node(struct gcov_node *parent,
+				  struct gcov_info *info, const char *name)
+{
+	struct gcov_node *node;
+
+	node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
+	if (!node) {
+		pr_warning("out of memory\n");
+		return NULL;
+	}
+	init_node(node, info, name, parent);
+	/* Differentiate between gcov data file nodes and directory nodes. */
+	if (info) {
+		node->dentry = debugfs_create_file(deskew(node->name), 0600,
+					parent->dentry, node, &gcov_data_fops);
+	} else
+		node->dentry = debugfs_create_dir(node->name, parent->dentry);
+	if (!node->dentry) {
+		pr_warning("could not create file\n");
+		kfree(node);
+		return NULL;
+	}
+	if (info)
+		add_links(node, parent->dentry);
+	list_add(&node->list, &parent->children);
+	list_add(&node->all, &all_head);
+
+	return node;
+}
+
+/* Remove symbolic links associated with node. */
+static void remove_links(struct gcov_node *node)
+{
+	int i;
+
+	if (!node->links)
+		return;
+	for (i = 0; gcov_link[i].ext; i++)
+		debugfs_remove(node->links[i]);
+	kfree(node->links);
+	node->links = NULL;
+}
+
+/*
+ * Remove node from all lists and debugfs and release associated resources.
+ * Needs to be called with node_lock held.
+ */
+static void release_node(struct gcov_node *node)
+{
+	list_del(&node->list);
+	list_del(&node->all);
+	debugfs_remove(node->dentry);
+	remove_links(node);
+	if (node->ghost)
+		gcov_info_free(node->ghost);
+	kfree(node);
+}
+
+/* Release node and empty parents. Needs to be called with node_lock held. */
+static void remove_node(struct gcov_node *node)
+{
+	struct gcov_node *parent;
+
+	while ((node != &root_node) && list_empty(&node->children)) {
+		parent = node->parent;
+		release_node(node);
+		node = parent;
+	}
+}
+
+/*
+ * Find child node with given basename. Needs to be called with node_lock
+ * held.
+ */
+static struct gcov_node *get_child_by_name(struct gcov_node *parent,
+					   const char *name)
+{
+	struct gcov_node *node;
+
+	list_for_each_entry(node, &parent->children, list) {
+		if (strcmp(node->name, name) == 0)
+			return node;
+	}
+
+	return NULL;
+}
+
+/*
+ * write() implementation for reset file. Reset all profiling data to zero
+ * and remove ghost nodes.
+ */
+static ssize_t reset_write(struct file *file, const char __user *addr,
+			   size_t len, loff_t *pos)
+{
+	struct gcov_node *node;
+
+	mutex_lock(&node_lock);
+restart:
+	list_for_each_entry(node, &all_head, all) {
+		if (node->info)
+			gcov_info_reset(node->info);
+		else if (list_empty(&node->children)) {
+			remove_node(node);
+			/* Several nodes may have gone - restart loop. */
+			goto restart;
+		}
+	}
+	mutex_unlock(&node_lock);
+
+	return len;
+}
+
+/* read() implementation for reset file. Unused. */
+static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
+			  loff_t *pos)
+{
+	/* Allow read operation so that a recursive copy won't fail. */
+	return 0;
+}
+
+static const struct file_operations gcov_reset_fops = {
+	.write	= reset_write,
+	.read	= reset_read,
+};
+
+/*
+ * Create a node for a given profiling data set and add it to all lists and
+ * debugfs. Needs to be called with node_lock held.
+ */
+static void add_node(struct gcov_info *info)
+{
+	char *filename;
+	char *curr;
+	char *next;
+	struct gcov_node *parent;
+	struct gcov_node *node;
+
+	filename = kstrdup(info->filename, GFP_KERNEL);
+	if (!filename)
+		return;
+	parent = &root_node;
+	/* Create directory nodes along the path. */
+	for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
+		if (curr == next)
+			continue;
+		*next = 0;
+		if (strcmp(curr, ".") == 0)
+			continue;
+		if (strcmp(curr, "..") == 0) {
+			if (!parent->parent)
+				goto err_remove;
+			parent = parent->parent;
+			continue;
+		}
+		node = get_child_by_name(parent, curr);
+		if (!node) {
+			node = new_node(parent, NULL, curr);
+			if (!node)
+				goto err_remove;
+		}
+		parent = node;
+	}
+	/* Create file node. */
+	node = new_node(parent, info, curr);
+	if (!node)
+		goto err_remove;
+out:
+	kfree(filename);
+	return;
+
+err_remove:
+	remove_node(parent);
+	goto out;
+}
+
+/*
+ * The profiling data set associated with this node is being unloaded. Store a
+ * copy of the profiling data and turn this node into a "ghost".
+ */
+static int ghost_node(struct gcov_node *node)
+{
+	node->ghost = gcov_info_dup(node->info);
+	if (!node->ghost) {
+		pr_warning("could not save data for '%s' (out of memory)\n",
+			   node->info->filename);
+		return -ENOMEM;
+	}
+	node->info = NULL;
+
+	return 0;
+}
+
+/*
+ * Profiling data for this node has been loaded again. Add profiling data
+ * from previous instantiation and turn this node into a regular node.
+ */
+static void revive_node(struct gcov_node *node, struct gcov_info *info)
+{
+	if (gcov_info_is_compatible(node->ghost, info))
+		gcov_info_add(info, node->ghost);
+	else {
+		pr_warning("discarding saved data for '%s' (version changed)\n",
+			   info->filename);
+	}
+	gcov_info_free(node->ghost);
+	node->ghost = NULL;
+	node->info = info;
+}
+
+/*
+ * Callback to create/remove profiling files when code compiled with
+ * -fprofile-arcs is loaded/unloaded.
+ */
+void gcov_event(enum gcov_action action, struct gcov_info *info)
+{
+	struct gcov_node *node;
+
+	mutex_lock(&node_lock);
+	node = get_node_by_name(info->filename);
+	switch (action) {
+	case GCOV_ADD:
+		/* Add new node or revive ghost. */
+		if (!node) {
+			add_node(info);
+			break;
+		}
+		if (gcov_persist)
+			revive_node(node, info);
+		else {
+			pr_warning("could not add '%s' (already exists)\n",
+				   info->filename);
+		}
+		break;
+	case GCOV_REMOVE:
+		/* Remove node or turn into ghost. */
+		if (!node) {
+			pr_warning("could not remove '%s' (not found)\n",
+				   info->filename);
+			break;
+		}
+		if (gcov_persist) {
+			if (!ghost_node(node))
+				break;
+		}
+		remove_node(node);
+		break;
+	}
+	mutex_unlock(&node_lock);
+}
+
+/* Create debugfs entries. */
+static __init int gcov_fs_init(void)
+{
+	int rc = -EIO;
+
+	init_node(&root_node, NULL, NULL, NULL);
+	/*
+	 * /sys/kernel/debug/gcov will be parent for the reset control file
+	 * and all profiling files.
+	 */
+	root_node.dentry = debugfs_create_dir("gcov", NULL);
+	if (!root_node.dentry)
+		goto err_remove;
+	/*
+	 * Create reset file which resets all profiling counts when written
+	 * to.
+	 */
+	reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
+					   NULL, &gcov_reset_fops);
+	if (!reset_dentry)
+		goto err_remove;
+	/* Replay previous events to get our fs hierarchy up-to-date. */
+	gcov_enable_events();
+	return 0;
+
+err_remove:
+	pr_err("init failed\n");
+	if (root_node.dentry)
+		debugfs_remove(root_node.dentry);
+
+	return rc;
+}
+device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 00000000000..ae5bb426003
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
+/*
+ *  This code provides functions to handle gcc's profiling data format
+ *  introduced with gcc 3.4. Future versions of gcc may change the gcov
+ *  format (as happened before), so all format-specific information needs
+ *  to be kept modular and easily exchangeable.
+ *
+ *  This file is based on gcc-internal definitions. Functions and data
+ *  structures are defined to be compatible with gcc counterparts.
+ *  For a better understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ *    Copyright IBM Corp. 2009
+ *    Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ *    Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */
+	{ 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Based on gcc magic. Doesn't change
+ * at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+	return (1 << type) & info->ctr_mask;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+	unsigned int i;
+	unsigned int result = 0;
+
+	for (i = 0; i < GCOV_COUNTERS; i++) {
+		if (counter_active(info, i))
+			result++;
+	}
+	return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+	unsigned int active = num_counter_active(info);
+	unsigned int i;
+
+	for (i = 0; i < active; i++) {
+		memset(info->counts[i].values, 0,
+		       info->counts[i].num * sizeof(gcov_type));
+	}
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+	return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
+{
+	unsigned int i;
+	unsigned int j;
+
+	for (i = 0; i < num_counter_active(dest); i++) {
+		for (j = 0; j < dest->counts[i].num; j++) {
+			dest->counts[i].values[j] +=
+				source->counts[i].values[j];
+		}
+	}
+}
+
+/* Get size of function info entry. Based on gcc magic. */
+static size_t get_fn_size(struct gcov_info *info)
+{
+	size_t size;
+
+	size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
+	       sizeof(unsigned int);
+	if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
+		size = ALIGN(size, __alignof__(struct gcov_fn_info));
+	return size;
+}
+
+/* Get address of function info entry. Based on gcc magic. */
+static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
+{
+	return (struct gcov_fn_info *)
+		((char *) info->functions + fn * get_fn_size(info));
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+	struct gcov_info *dup;
+	unsigned int i;
+	unsigned int active;
+
+	/* Duplicate gcov_info. */
+	active = num_counter_active(info);
+	dup = kzalloc(sizeof(struct gcov_info) +
+		      sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+	if (!dup)
+		return NULL;
+	dup->version		= info->version;
+	dup->stamp		= info->stamp;
+	dup->n_functions	= info->n_functions;
+	dup->ctr_mask		= info->ctr_mask;
+	/* Duplicate filename. */
+	dup->filename		= kstrdup(info->filename, GFP_KERNEL);
+	if (!dup->filename)
+		goto err_free;
+	/* Duplicate table of functions. */
+	dup->functions = kmemdup(info->functions, info->n_functions *
+				 get_fn_size(info), GFP_KERNEL);
+	if (!dup->functions)
+		goto err_free;
+	/* Duplicate counter arrays. */
+	for (i = 0; i < active ; i++) {
+		struct gcov_ctr_info *ctr = &info->counts[i];
+		size_t size = ctr->num * sizeof(gcov_type);
+
+		dup->counts[i].num = ctr->num;
+		dup->counts[i].merge = ctr->merge;
+		dup->counts[i].values = vmalloc(size);
+		if (!dup->counts[i].values)
+			goto err_free;
+		memcpy(dup->counts[i].values, ctr->values, size);
+	}
+	return dup;
+
+err_free:
+	gcov_info_free(dup);
+	return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+	unsigned int active = num_counter_active(info);
+	unsigned int i;
+
+	for (i = 0; i < active ; i++)
+		vfree(info->counts[i].values);
+	kfree(info->functions);
+	kfree(info->filename);
+	kfree(info);
+}
+
+/**
+ * struct type_info - iterator helper array
+ * @ctr_type: counter type
+ * @offset: index of the first value of the current function for this type
+ *
+ * This array is needed to convert the in-memory data format into the in-file
+ * data format:
+ *
+ * In-memory:
+ *   for each counter type
+ *     for each function
+ *       values
+ *
+ * In-file:
+ *   for each function
+ *     for each counter type
+ *       values
+ *
+ * See gcc source gcc/gcov-io.h for more information on data organization.
+ */
+struct type_info {
+	int ctr_type;
+	unsigned int offset;
+};
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @record: record type
+ * @function: function number
+ * @type: counter type
+ * @count: index into values array
+ * @num_types: number of counter types
+ * @type_info: helper array to get values-array offset for current function
+ */
+struct gcov_iterator {
+	struct gcov_info *info;
+
+	int record;
+	unsigned int function;
+	unsigned int type;
+	unsigned int count;
+
+	int num_types;
+	struct type_info type_info[0];
+};
+
+static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
+{
+	return get_fn_info(iter->info, iter->function);
+}
+
+static struct type_info *get_type(struct gcov_iterator *iter)
+{
+	return &iter->type_info[iter->type];
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+	struct gcov_iterator *iter;
+
+	iter = kzalloc(sizeof(struct gcov_iterator) +
+		       num_counter_active(info) * sizeof(struct type_info),
+		       GFP_KERNEL);
+	if (iter)
+		iter->info = info;
+
+	return iter;
+}
+
+/**
+ * gcov_iter_free - release memory for iterator
+ * @iter: file iterator to free
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+	kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+	return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+	int i;
+
+	iter->record = 0;
+	iter->function = 0;
+	iter->type = 0;
+	iter->count = 0;
+	iter->num_types = 0;
+	for (i = 0; i < GCOV_COUNTERS; i++) {
+		if (counter_active(iter->info, i)) {
+			iter->type_info[iter->num_types].ctr_type = i;
+			iter->type_info[iter->num_types++].offset = 0;
+		}
+	}
+}
+
+/* Mapping of logical record number to actual file content. */
+#define RECORD_FILE_MAGIC	0
+#define RECORD_GCOV_VERSION	1
+#define RECORD_TIME_STAMP	2
+#define RECORD_FUNCTION_TAG	3
+#define RECORD_FUNCTON_TAG_LEN	4
+#define RECORD_FUNCTION_IDENT	5
+#define RECORD_FUNCTION_CHECK	6
+#define RECORD_COUNT_TAG	7
+#define RECORD_COUNT_LEN	8
+#define RECORD_COUNT		9
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+	switch (iter->record) {
+	case RECORD_FILE_MAGIC:
+	case RECORD_GCOV_VERSION:
+	case RECORD_FUNCTION_TAG:
+	case RECORD_FUNCTON_TAG_LEN:
+	case RECORD_FUNCTION_IDENT:
+	case RECORD_COUNT_TAG:
+		/* Advance to next record */
+		iter->record++;
+		break;
+	case RECORD_COUNT:
+		/* Advance to next count */
+		iter->count++;
+		/* fall through */
+	case RECORD_COUNT_LEN:
+		if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
+			iter->record = 9;
+			break;
+		}
+		/* Advance to next counter type */
+		get_type(iter)->offset += iter->count;
+		iter->count = 0;
+		iter->type++;
+		/* fall through */
+	case RECORD_FUNCTION_CHECK:
+		if (iter->type < iter->num_types) {
+			iter->record = 7;
+			break;
+		}
+		/* Advance to next function */
+		iter->type = 0;
+		iter->function++;
+		/* fall through */
+	case RECORD_TIME_STAMP:
+		if (iter->function < iter->info->n_functions)
+			iter->record = 3;
+		else
+			iter->record = -1;
+		break;
+	}
+	/* Check for EOF. */
+	if (iter->record == -1)
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/**
+ * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file.
+ */
+static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
+{
+	return seq_write(seq, &v, sizeof(v));
+}
+
+/**
+ * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first.
+ */
+static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
+{
+	u32 data[2];
+
+	data[0] = (v & 0xffffffffUL);
+	data[1] = (v >> 32);
+	return seq_write(seq, data, sizeof(data));
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+	int rc = -EINVAL;
+
+	switch (iter->record) {
+	case RECORD_FILE_MAGIC:
+		rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
+		break;
+	case RECORD_GCOV_VERSION:
+		rc = seq_write_gcov_u32(seq, iter->info->version);
+		break;
+	case RECORD_TIME_STAMP:
+		rc = seq_write_gcov_u32(seq, iter->info->stamp);
+		break;
+	case RECORD_FUNCTION_TAG:
+		rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
+		break;
+	case RECORD_FUNCTON_TAG_LEN:
+		rc = seq_write_gcov_u32(seq, 2);
+		break;
+	case RECORD_FUNCTION_IDENT:
+		rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
+		break;
+	case RECORD_FUNCTION_CHECK:
+		rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
+		break;
+	case RECORD_COUNT_TAG:
+		rc = seq_write_gcov_u32(seq,
+			GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
+		break;
+	case RECORD_COUNT_LEN:
+		rc = seq_write_gcov_u32(seq,
+				get_func(iter)->n_ctrs[iter->type] * 2);
+		break;
+	case RECORD_COUNT:
+		rc = seq_write_gcov_u64(seq,
+			iter->info->counts[iter->type].
+				values[iter->count + get_type(iter)->offset]);
+		break;
+	}
+	return rc;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 00000000000..060073ebf7a
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
+/*
+ *  Profiling infrastructure declarations.
+ *
+ *  This file is based on gcc-internal definitions. Data structures are
+ *  defined to be compatible with gcc counterparts. For a better
+ *  understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ *    Copyright IBM Corp. 2009
+ *    Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ *    Uses gcc-internal data definitions.
+ */
+
+#ifndef GCOV_H
+#define GCOV_H GCOV_H
+
+#include <linux/types.h>
+
+/*
+ * Profiling data types used for gcc 3.4 and above - these are defined by
+ * gcc and need to be kept as close to the original definition as possible to
+ * remain compatible.
+ */
+#define GCOV_COUNTERS		5
+#define GCOV_DATA_MAGIC		((unsigned int) 0x67636461)
+#define GCOV_TAG_FUNCTION	((unsigned int) 0x01000000)
+#define GCOV_TAG_COUNTER_BASE	((unsigned int) 0x01a10000)
+#define GCOV_TAG_FOR_COUNTER(count)					\
+	(GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
+
+#if BITS_PER_LONG >= 64
+typedef long gcov_type;
+#else
+typedef long long gcov_type;
+#endif
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+	unsigned int ident;
+	unsigned int checksum;
+	unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+	unsigned int	num;
+	gcov_type	*values;
+	void		(*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+	unsigned int			version;
+	struct gcov_info		*next;
+	unsigned int			stamp;
+	const char			*filename;
+	unsigned int			n_functions;
+	const struct gcov_fn_info	*functions;
+	unsigned int			ctr_mask;
+	struct gcov_ctr_info		counts[0];
+};
+
+/* Base interface. */
+enum gcov_action {
+	GCOV_ADD,
+	GCOV_REMOVE,
+};
+
+void gcov_event(enum gcov_action action, struct gcov_info *info);
+void gcov_enable_events(void);
+
+/* Iterator control. */
+struct seq_file;
+struct gcov_iterator;
+
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
+void gcov_iter_free(struct gcov_iterator *iter);
+void gcov_iter_start(struct gcov_iterator *iter);
+int gcov_iter_next(struct gcov_iterator *iter);
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+
+/* gcov_info control. */
+void gcov_info_reset(struct gcov_info *info);
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
+struct gcov_info *gcov_info_dup(struct gcov_info *info);
+void gcov_info_free(struct gcov_info *info);
+
+struct gcov_link {
+	enum {
+		OBJ_TREE,
+		SRC_TREE,
+	} dir;
+	const char *ext;
+};
+extern const struct gcov_link gcov_link[];
+
+#endif /* GCOV_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7fa44133352..9b1a7de2697 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,7 +27,6 @@ struct kthread_create_info
 	/* Information passed to kthread() from kthreadd. */
 	int (*threadfn)(void *data);
 	void *data;
-	struct completion started;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -36,17 +35,13 @@ struct kthread_create_info
 	struct list_head list;
 };
 
-struct kthread_stop_info
-{
-	struct task_struct *k;
-	int err;
-	struct completion done;
+struct kthread {
+	int should_stop;
+	struct completion exited;
 };
 
-/* Thread stopping is done by setthing this var: lock serializes
- * multiple kthread_stop calls. */
-static DEFINE_MUTEX(kthread_stop_lock);
-static struct kthread_stop_info kthread_stop_info;
+#define to_kthread(tsk)	\
+	container_of((tsk)->vfork_done, struct kthread, exited)
 
 /**
  * kthread_should_stop - should this kthread return now?
@@ -57,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
  */
 int kthread_should_stop(void)
 {
-	return (kthread_stop_info.k == current);
+	return to_kthread(current)->should_stop;
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
 static int kthread(void *_create)
 {
+	/* Copy data: it's on kthread's stack */
 	struct kthread_create_info *create = _create;
-	int (*threadfn)(void *data);
-	void *data;
-	int ret = -EINTR;
+	int (*threadfn)(void *data) = create->threadfn;
+	void *data = create->data;
+	struct kthread self;
+	int ret;
 
-	/* Copy data: it's on kthread's stack */
-	threadfn = create->threadfn;
-	data = create->data;
+	self.should_stop = 0;
+	init_completion(&self.exited);
+	current->vfork_done = &self.exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
-	complete(&create->started);
+	complete(&create->done);
 	schedule();
 
-	if (!kthread_should_stop())
+	ret = -EINTR;
+	if (!self.should_stop)
 		ret = threadfn(data);
 
-	/* It might have exited on its own, w/o kthread_stop.  Check. */
-	if (kthread_should_stop()) {
-		kthread_stop_info.err = ret;
-		complete(&kthread_stop_info.done);
-	}
-	return 0;
+	/* we can't just return, we must preserve "self" on stack */
+	do_exit(ret);
 }
 
 static void create_kthread(struct kthread_create_info *create)
@@ -95,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
 
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
-	if (pid < 0)
+	if (pid < 0) {
 		create->result = ERR_PTR(pid);
-	else
-		wait_for_completion(&create->started);
-	complete(&create->done);
+		complete(&create->done);
+	}
 }
 
 /**
@@ -130,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 
 	create.threadfn = threadfn;
 	create.data = data;
-	init_completion(&create.started);
 	init_completion(&create.done);
 
 	spin_lock(&kthread_create_lock);
@@ -198,30 +190,22 @@ EXPORT_SYMBOL(kthread_bind);
  */
 int kthread_stop(struct task_struct *k)
 {
+	struct kthread *kthread;
 	int ret;
 
-	mutex_lock(&kthread_stop_lock);
-
-	/* It could exit after stop_info.k set, but before wake_up_process. */
-	get_task_struct(k);
-
 	trace_sched_kthread_stop(k);
+	get_task_struct(k);
 
-	/* Must init completion *before* thread sees kthread_stop_info.k */
-	init_completion(&kthread_stop_info.done);
-	smp_wmb();
+	kthread = to_kthread(k);
+	barrier(); /* it might have exited */
+	if (k->vfork_done != NULL) {
+		kthread->should_stop = 1;
+		wake_up_process(k);
+		wait_for_completion(&kthread->exited);
+	}
+	ret = k->exit_code;
 
-	/* Now set kthread_should_stop() to true, and wake it up. */
-	kthread_stop_info.k = k;
-	wake_up_process(k);
 	put_task_struct(k);
-
-	/* Once it dies, reset stop ptr, gather result and we're done. */
-	wait_for_completion(&kthread_stop_info.done);
-	kthread_stop_info.k = NULL;
-	ret = kthread_stop_info.err;
-	mutex_unlock(&kthread_stop_lock);
-
 	trace_sched_kthread_stop_ret(ret);
 
 	return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 215aaab09e9..38928fcaff2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2216,6 +2216,10 @@ static noinline struct module *load_module(void __user *umod,
 	mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
 					    "__kcrctab_unused_gpl");
 #endif
+#ifdef CONFIG_CONSTRUCTORS
+	mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
+				  sizeof(*mod->ctors), &mod->num_ctors);
+#endif
 
 #ifdef CONFIG_MARKERS
 	mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2389,6 +2393,17 @@ static noinline struct module *load_module(void __user *umod,
 	goto free_hdr;
 }
 
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+	unsigned long i;
+
+	for (i = 0; i < mod->num_ctors; i++)
+		mod->ctors[i]();
+#endif
+}
+
 /* This is where the real work happens */
 SYSCALL_DEFINE3(init_module, void __user *, umod,
 		unsigned long, len, const char __user *, uargs)
@@ -2417,6 +2432,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	blocking_notifier_call_chain(&module_notify_list,
 			MODULE_STATE_COMING, mod);
 
+	do_mod_ctors(mod);
 	/* Start the module */
 	if (mod->init != NULL)
 		ret = do_one_initcall(mod->init);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0..09b4ff9711b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
 
 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
 
-/*
- * creates a copy of "orig" with refcount 1.
- */
-static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
+static inline struct nsproxy *create_nsproxy(void)
 {
-	struct nsproxy *ns;
+	struct nsproxy *nsproxy;
 
-	ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
-	if (ns) {
-		memcpy(ns, orig, sizeof(struct nsproxy));
-		atomic_set(&ns->count, 1);
-	}
-	return ns;
+	nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
+	if (nsproxy)
+		atomic_set(&nsproxy->count, 1);
+	return nsproxy;
 }
 
 /*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	struct nsproxy *new_nsp;
 	int err;
 
-	new_nsp = clone_nsproxy(tsk->nsproxy);
+	new_nsp = create_nsproxy();
 	if (!new_nsp)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd28..31310b5d3f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
 /*
  * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
-struct task_struct *find_task_by_pid_type_ns(int type, int nr,
-		struct pid_namespace *ns)
+struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	return pid_task(find_pid_ns(nr, ns), type);
+	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
-EXPORT_SYMBOL(find_task_by_pid_type_ns);
-
 struct task_struct *find_task_by_vpid(pid_t vnr)
 {
-	return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
-			current->nsproxy->pid_ns);
-}
-EXPORT_SYMBOL(find_task_by_vpid);
-
-struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
-{
-	return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
+	return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
 }
-EXPORT_SYMBOL(find_task_by_pid_ns);
 
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858..821722ae58a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
 	return NULL;
 }
 
-static struct pid_namespace *create_pid_namespace(unsigned int level)
+static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
 {
 	struct pid_namespace *ns;
+	unsigned int level = parent_pid_ns->level + 1;
 	int i;
 
 	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
 
 	kref_init(&ns->kref);
 	ns->level = level;
+	ns->parent = get_pid_ns(parent_pid_ns);
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 
 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 {
-	struct pid_namespace *new_ns;
-
-	BUG_ON(!old_ns);
-	new_ns = get_pid_ns(old_ns);
 	if (!(flags & CLONE_NEWPID))
-		goto out;
-
-	new_ns = ERR_PTR(-EINVAL);
+		return get_pid_ns(old_ns);
 	if (flags & CLONE_THREAD)
-		goto out_put;
-
-	new_ns = create_pid_namespace(old_ns->level + 1);
-	if (!IS_ERR(new_ns))
-		new_ns->parent = get_pid_ns(old_ns);
-
-out_put:
-	put_pid_ns(old_ns);
-out:
-	return new_ns;
+		return ERR_PTR(-EINVAL);
+	return create_pid_namespace(old_ns);
 }
 
 void free_pid_ns(struct kref *kref)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f6d8b8cb5e3..61c78b2c07b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -167,67 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 int ptrace_attach(struct task_struct *task)
 {
 	int retval;
-	unsigned long flags;
 
 	audit_ptrace(task);
 
 	retval = -EPERM;
+	if (unlikely(task->flags & PF_KTHREAD))
+		goto out;
 	if (same_thread_group(task, current))
 		goto out;
 
-	/* Protect the target's credential calculations against our
+	/*
+	 * Protect exec's credential calculations against our interference;
 	 * interference; SUID, SGID and LSM creds get determined differently
 	 * under ptrace.
 	 */
 	retval = mutex_lock_interruptible(&task->cred_guard_mutex);
-	if (retval  < 0)
+	if (retval < 0)
 		goto out;
 
-	retval = -EPERM;
-repeat:
-	/*
-	 * Nasty, nasty.
-	 *
-	 * We want to hold both the task-lock and the
-	 * tasklist_lock for writing at the same time.
-	 * But that's against the rules (tasklist_lock
-	 * is taken for reading by interrupts on other
-	 * cpu's that may have task_lock).
-	 */
 	task_lock(task);
-	if (!write_trylock_irqsave(&tasklist_lock, flags)) {
-		task_unlock(task);
-		do {
-			cpu_relax();
-		} while (!write_can_lock(&tasklist_lock));
-		goto repeat;
-	}
-
-	if (!task->mm)
-		goto bad;
-	/* the same process cannot be attached many times */
-	if (task->ptrace & PT_PTRACED)
-		goto bad;
 	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+	task_unlock(task);
 	if (retval)
-		goto bad;
+		goto unlock_creds;
 
-	/* Go */
-	task->ptrace |= PT_PTRACED;
+	write_lock_irq(&tasklist_lock);
+	retval = -EPERM;
+	if (unlikely(task->exit_state))
+		goto unlock_tasklist;
+	if (task->ptrace)
+		goto unlock_tasklist;
+
+	task->ptrace = PT_PTRACED;
 	if (capable(CAP_SYS_PTRACE))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
-
 	send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
-bad:
-	write_unlock_irqrestore(&tasklist_lock, flags);
-	task_unlock(task);
+
+	retval = 0;
+unlock_tasklist:
+	write_unlock_irq(&tasklist_lock);
+unlock_creds:
 	mutex_unlock(&task->cred_guard_mutex);
 out:
 	return retval;
 }
 
+/**
+ * ptrace_traceme  --  helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+int ptrace_traceme(void)
+{
+	int ret = -EPERM;
+
+	write_lock_irq(&tasklist_lock);
+	/* Are we already being traced? */
+	if (!current->ptrace) {
+		ret = security_ptrace_traceme(current->parent);
+		/*
+		 * Check PF_EXITING to ensure ->real_parent has not passed
+		 * exit_ptrace(). Otherwise we don't report the error but
+		 * pretend ->real_parent untraces us right after return.
+		 */
+		if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+			current->ptrace = PT_PTRACED;
+			__ptrace_link(current, current->real_parent);
+		}
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	return ret;
+}
+
 /*
  * Called with irqs disabled, returns true if childs should reap themselves.
  */
@@ -409,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
 
 static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
 {
+	unsigned long flags;
 	int error = -ESRCH;
 
-	read_lock(&tasklist_lock);
-	if (likely(child->sighand != NULL)) {
+	if (lock_task_sighand(child, &flags)) {
 		error = -EINVAL;
-		spin_lock_irq(&child->sighand->siglock);
 		if (likely(child->last_siginfo != NULL)) {
 			*info = *child->last_siginfo;
 			error = 0;
 		}
-		spin_unlock_irq(&child->sighand->siglock);
+		unlock_task_sighand(child, &flags);
 	}
-	read_unlock(&tasklist_lock);
 	return error;
 }
 
 static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
 {
+	unsigned long flags;
 	int error = -ESRCH;
 
-	read_lock(&tasklist_lock);
-	if (likely(child->sighand != NULL)) {
+	if (lock_task_sighand(child, &flags)) {
 		error = -EINVAL;
-		spin_lock_irq(&child->sighand->siglock);
 		if (likely(child->last_siginfo != NULL)) {
 			*child->last_siginfo = *info;
 			error = 0;
 		}
-		spin_unlock_irq(&child->sighand->siglock);
+		unlock_task_sighand(child, &flags);
 	}
-	read_unlock(&tasklist_lock);
 	return error;
 }
 
@@ -566,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
 	return ret;
 }
 
-/**
- * ptrace_traceme  --  helper for PTRACE_TRACEME
- *
- * Performs checks and sets PT_PTRACED.
- * Should be used by all ptrace implementations for PTRACE_TRACEME.
- */
-int ptrace_traceme(void)
-{
-	int ret = -EPERM;
-
-	/*
-	 * Are we already being traced?
-	 */
-repeat:
-	task_lock(current);
-	if (!(current->ptrace & PT_PTRACED)) {
-		/*
-		 * See ptrace_attach() comments about the locking here.
-		 */
-		unsigned long flags;
-		if (!write_trylock_irqsave(&tasklist_lock, flags)) {
-			task_unlock(current);
-			do {
-				cpu_relax();
-			} while (!write_can_lock(&tasklist_lock));
-			goto repeat;
-		}
-
-		ret = security_ptrace_traceme(current->parent);
-
-		/*
-		 * Check PF_EXITING to ensure ->real_parent has not passed
-		 * exit_ptrace(). Otherwise we don't report the error but
-		 * pretend ->real_parent untraces us right after return.
-		 */
-		if (!ret && !(current->real_parent->flags & PF_EXITING)) {
-			current->ptrace |= PT_PTRACED;
-			__ptrace_link(current, current->real_parent);
-		}
-
-		write_unlock_irqrestore(&tasklist_lock, flags);
-	}
-	task_unlock(current);
-	return ret;
-}
-
-/**
- * ptrace_get_task_struct  --  grab a task struct reference for ptrace
- * @pid:       process id to grab a task_struct reference of
- *
- * This function is a helper for ptrace implementations.  It checks
- * permissions and then grabs a task struct for use of the actual
- * ptrace implementation.
- *
- * Returns the task_struct for @pid or an ERR_PTR() on failure.
- */
-struct task_struct *ptrace_get_task_struct(pid_t pid)
+static struct task_struct *ptrace_get_task_struct(pid_t pid)
 {
 	struct task_struct *child;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	child = find_task_by_vpid(pid);
 	if (child)
 		get_task_struct(child);
+	rcu_read_unlock();
 
-	read_unlock(&tasklist_lock);
 	if (!child)
 		return ERR_PTR(-ESRCH);
 	return child;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c80..e1338f07431 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
 void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
-	counter->limit = (unsigned long long)LLONG_MAX;
+	counter->limit = RESOURCE_MAX;
 	counter->parent = parent;
 }
 
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
 					unsigned long long *res)
 {
 	char *end;
+
+	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
+	if (*buf == '-') {
+		*res = simple_strtoull(buf + 1, &end, 10);
+		if (*res != 1 || *end != '\0')
+			return -EINVAL;
+		*res = RESOURCE_MAX;
+		return 0;
+	}
+
 	/* FIXME - make memparse() take const char* args */
 	*res = memparse((char *)buf, &end);
 	if (*end != '\0')
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fb88a906aa..247fd0fedd0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7045,7 +7045,7 @@ static int migration_thread(void *data)
 
 		if (cpu_is_offline(cpu)) {
 			spin_unlock_irq(&rq->lock);
-			goto wait_to_die;
+			break;
 		}
 
 		if (rq->active_balance) {
@@ -7071,16 +7071,7 @@ static int migration_thread(void *data)
 		complete(&req->done);
 	}
 	__set_current_state(TASK_RUNNING);
-	return 0;
 
-wait_to_die:
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	__set_current_state(TASK_RUNNING);
 	return 0;
 }
 
@@ -7494,6 +7485,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq = task_rq_lock(p, &flags);
 		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 		task_rq_unlock(rq, &flags);
+		get_task_struct(p);
 		cpu_rq(cpu)->migration_thread = p;
 		break;
 
@@ -7524,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		kthread_bind(cpu_rq(cpu)->migration_thread,
 			     cpumask_any(cpu_online_mask));
 		kthread_stop(cpu_rq(cpu)->migration_thread);
+		put_task_struct(cpu_rq(cpu)->migration_thread);
 		cpu_rq(cpu)->migration_thread = NULL;
 		break;
 
@@ -7533,6 +7526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
+		put_task_struct(rq->migration_thread);
 		rq->migration_thread = NULL;
 		/* Idle task back to normal (off runqueue, low prio) */
 		spin_lock_irq(&rq->lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index d81f4952eeb..ccf1ceedaeb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1410,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
  	/* do_notify_parent_cldstop should have been called instead.  */
  	BUG_ON(task_is_stopped_or_traced(tsk));
 
-	BUG_ON(!tsk->ptrace &&
+	BUG_ON(!task_ptrace(tsk) &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
 	info.si_signo = sig;
@@ -1449,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
-	if (!tsk->ptrace && sig == SIGCHLD &&
+	if (!task_ptrace(tsk) && sig == SIGCHLD &&
 	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
 	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
 		/*
@@ -1486,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	struct task_struct *parent;
 	struct sighand_struct *sighand;
 
-	if (tsk->ptrace & PT_PTRACED)
+	if (task_ptrace(tsk))
 		parent = tsk->parent;
 	else {
 		tsk = tsk->group_leader;
@@ -1499,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	 * see comment in do_notify_parent() abot the following 3 lines
 	 */
 	rcu_read_lock();
-	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
 	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
@@ -1535,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 
 static inline int may_ptrace_stop(void)
 {
-	if (!likely(current->ptrace & PT_PTRACED))
+	if (!likely(task_ptrace(current)))
 		return 0;
 	/*
 	 * Are we in the middle of do_coredump?
@@ -1753,7 +1753,7 @@ static int do_signal_stop(int signr)
 static int ptrace_signal(int signr, siginfo_t *info,
 			 struct pt_regs *regs, void *cookie)
 {
-	if (!(current->ptrace & PT_PTRACED))
+	if (!task_ptrace(current))
 		return signr;
 
 	ptrace_signal_deliver(regs, cookie);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b41fb710e11..3a94905fa5d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -213,6 +213,7 @@ restart:
 	do {
 		if (pending & 1) {
 			int prev_count = preempt_count();
+			kstat_incr_softirqs_this_cpu(h - softirq_vec);
 
 			trace_softirq_entry(h, softirq_vec);
 			h->action(h);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab462b9968d..62e4ff9968b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2283,7 +2283,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 		  void *data)
 {
 #define TMPBUFLEN 21
-	int *i, vleft, first=1, neg, val;
+	int *i, vleft, first = 1, neg;
 	unsigned long lval;
 	size_t left, len;
 	
@@ -2336,8 +2336,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 			len = p-buf;
 			if ((len < left) && *p && !isspace(*p))
 				break;
-			if (neg)
-				val = -val;
 			s += len;
 			left -= len;
 
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af..8a82b4b8ea5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 
+static struct uts_namespace *create_uts_ns(void)
+{
+	struct uts_namespace *uts_ns;
+
+	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+	if (uts_ns)
+		kref_init(&uts_ns->kref);
+	return uts_ns;
+}
+
 /*
  * Clone a new ns copying an original utsname, setting refcount to 1
  * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
 
-	ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+	ns = create_uts_ns();
 	if (!ns)
 		return ERR_PTR(-ENOMEM);
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
 	up_read(&uts_sem);
-	kref_init(&ns->kref);
 	return ns;
 }
 
diff --git a/lib/Makefile b/lib/Makefile
index 8e9bcf9d326..b6d1857bbf0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -21,7 +21,7 @@ lib-y	+= kobject.o kref.o klist.o
 
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
-	 string_helpers.o
+	 string_helpers.o gcd.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/gcd.c b/lib/gcd.c
new file mode 100644
index 00000000000..f879033d982
--- /dev/null
+++ b/lib/gcd.c
@@ -0,0 +1,18 @@
+#include <linux/kernel.h>
+#include <linux/gcd.h>
+#include <linux/module.h>
+
+/* Greatest common divisor */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r;
+
+	if (a < b)
+		swap(a, b);
+	while ((r = a % b) != 0) {
+		a = b;
+		b = r;
+	}
+	return b;
+}
+EXPORT_SYMBOL_GPL(gcd);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 70db6e0a5ee..e2fa20dadf4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,7 +45,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
 static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #else
@@ -62,7 +62,8 @@ enum mem_cgroup_stat_index {
 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
+	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 
@@ -176,6 +177,9 @@ struct mem_cgroup {
 
 	unsigned int	swappiness;
 
+	/* set when res.limit == memsw.limit */
+	bool		memsw_is_minimum;
+
 	/*
 	 * statistics. This must be placed at the end of memcg.
 	 */
@@ -188,6 +192,7 @@ enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
+	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 	NR_CHARGE_TYPE,
 };
 
@@ -644,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 	int lru = LRU_FILE * !!file + !!active;
+	int ret;
 
 	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
@@ -661,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 			continue;
 
 		scan++;
-		if (__isolate_lru_page(page, mode, file) == 0) {
+		ret = __isolate_lru_page(page, mode, file);
+		switch (ret) {
+		case 0:
 			list_move(&page->lru, dst);
+			mem_cgroup_del_lru(page);
 			nr_taken++;
+			break;
+		case -EBUSY:
+			/* we don't affect global LRU but rotate in our LRU */
+			mem_cgroup_rotate_lru_list(page, page_lru(page));
+			break;
+		default:
+			break;
 		}
 	}
 
@@ -845,6 +861,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	int ret, total = 0;
 	int loop = 0;
 
+	/* If memsw_is_minimum==1, swap-out is of-no-use. */
+	if (root_mem->memsw_is_minimum)
+		noswap = true;
+
 	while (loop < 2) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem)
@@ -900,6 +920,44 @@ static void record_last_oom(struct mem_cgroup *mem)
 	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
 }
 
+/*
+ * Currently used to update mapped file statistics, but the routine can be
+ * generalized to update other statistics as well.
+ */
+void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+{
+	struct mem_cgroup *mem;
+	struct mem_cgroup_stat *stat;
+	struct mem_cgroup_stat_cpu *cpustat;
+	int cpu;
+	struct page_cgroup *pc;
+
+	if (!page_is_file_cache(page))
+		return;
+
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc))
+		return;
+
+	lock_page_cgroup(pc);
+	mem = pc->mem_cgroup;
+	if (!mem)
+		goto done;
+
+	if (!PageCgroupUsed(pc))
+		goto done;
+
+	/*
+	 * Preemption is already disabled, we don't need get_cpu()
+	 */
+	cpu = smp_processor_id();
+	stat = &mem->stat;
+	cpustat = &stat->cpustat[cpu];
+
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+done:
+	unlock_page_cgroup(pc);
+}
 
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
@@ -1098,6 +1156,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup_per_zone *from_mz, *to_mz;
 	int nid, zid;
 	int ret = -EBUSY;
+	struct page *page;
+	int cpu;
+	struct mem_cgroup_stat *stat;
+	struct mem_cgroup_stat_cpu *cpustat;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
@@ -1118,6 +1180,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 
 	res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
+
+	page = pc->page;
+	if (page_is_file_cache(page) && page_mapped(page)) {
+		cpu = smp_processor_id();
+		/* Update mapped_file data for mem_cgroup "from" */
+		stat = &from->stat;
+		cpustat = &stat->cpustat[cpu];
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+						-1);
+
+		/* Update mapped_file data for mem_cgroup "to" */
+		stat = &to->stat;
+		cpustat = &stat->cpustat[cpu];
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+						1);
+	}
+
 	if (do_swap_account)
 		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	css_put(&from->css);
@@ -1433,6 +1512,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 
 	switch (ctype) {
 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+	case MEM_CGROUP_CHARGE_TYPE_DROP:
 		if (page_mapped(page))
 			goto unlock_out;
 		break;
@@ -1496,18 +1576,23 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
  * called after __delete_from_swap_cache() and drop "page" account.
  * memcg information is recorded to swap_cgroup of "ent"
  */
-void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 {
 	struct mem_cgroup *memcg;
+	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
+
+	if (!swapout) /* this was a swap cache but the swap is unused ! */
+		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
+
+	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	memcg = __mem_cgroup_uncharge_common(page,
-					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
 	/* record memcg information */
-	if (do_swap_account && memcg) {
+	if (do_swap_account && swapout && memcg) {
 		swap_cgroup_record(ent, css_id(&memcg->css));
 		mem_cgroup_get(memcg);
 	}
-	if (memcg)
+	if (swapout && memcg)
 		css_put(&memcg->css);
 }
 #endif
@@ -1685,6 +1770,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			break;
 		}
 		ret = res_counter_set_limit(&memcg->res, val);
+		if (!ret) {
+			if (memswlimit == val)
+				memcg->memsw_is_minimum = true;
+			else
+				memcg->memsw_is_minimum = false;
+		}
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
@@ -1703,16 +1794,14 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
-int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
-				unsigned long long val)
+static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+					unsigned long long val)
 {
 	int retry_count;
 	u64 memlimit, oldusage, curusage;
 	int children = mem_cgroup_count_children(memcg);
 	int ret = -EBUSY;
 
-	if (!do_swap_account)
-		return -EINVAL;
 	/* see mem_cgroup_resize_res_limit */
  	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
 	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -1734,6 +1823,12 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 			break;
 		}
 		ret = res_counter_set_limit(&memcg->memsw, val);
+		if (!ret) {
+			if (memlimit == val)
+				memcg->memsw_is_minimum = true;
+			else
+				memcg->memsw_is_minimum = false;
+		}
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
@@ -1947,8 +2042,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		val = res_counter_read_u64(&mem->res, name);
 		break;
 	case _MEMSWAP:
-		if (do_swap_account)
-			val = res_counter_read_u64(&mem->memsw, name);
+		val = res_counter_read_u64(&mem->memsw, name);
 		break;
 	default:
 		BUG();
@@ -2046,6 +2140,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
 	MCS_CACHE,
 	MCS_RSS,
+	MCS_MAPPED_FILE,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_INACTIVE_ANON,
@@ -2066,6 +2161,7 @@ struct {
 } memcg_stat_strings[NR_MCS_STAT] = {
 	{"cache", "total_cache"},
 	{"rss", "total_rss"},
+	{"mapped_file", "total_mapped_file"},
 	{"pgpgin", "total_pgpgin"},
 	{"pgpgout", "total_pgpgout"},
 	{"inactive_anon", "total_inactive_anon"},
@@ -2086,6 +2182,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
+	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a5f3c278c57..6f0753fe694 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -73,6 +73,7 @@ unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 unsigned long highest_memmap_pfn __read_mostly;
 int percpu_pagelist_fraction;
+gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 int pageblock_order __read_mostly;
@@ -1863,6 +1864,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 
+	gfp_mask &= gfp_allowed_mask;
+
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 11a8a10a390..f22b4ebbd8d 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -83,12 +83,12 @@ void __init page_cgroup_init_flatmem(void)
 			goto fail;
 	}
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try cgroup_disable=memory option if you"
-	" don't want\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
+	" don't want memory cgroups\n");
 	return;
 fail:
-	printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
-	printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
+	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
+	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
 	panic("Out of memory");
 }
 
@@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 	unsigned long pfn = page_to_pfn(page);
 	struct mem_section *section = __pfn_to_section(pfn);
 
+	if (!section->page_cgroup)
+		return NULL;
 	return section->page_cgroup + pfn;
 }
 
@@ -252,14 +254,14 @@ void __init page_cgroup_init(void)
 		fail = init_section_page_cgroup(pfn);
 	}
 	if (fail) {
-		printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
+		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
 		panic("Out of memory");
 	} else {
 		hotplug_memory_notifier(page_cgroup_callback, 0);
 	}
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
-	" want\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
+	" want memory cgroups\n");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -309,8 +311,6 @@ static int swap_cgroup_prepare(int type)
 	struct swap_cgroup_ctrl *ctrl;
 	unsigned long idx, max;
 
-	if (!do_swap_account)
-		return 0;
 	ctrl = &swap_cgroup_ctrl[type];
 
 	for (idx = 0; idx < ctrl->length; idx++) {
@@ -347,9 +347,6 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 	struct swap_cgroup *sc;
 	unsigned short old;
 
-	if (!do_swap_account)
-		return 0;
-
 	ctrl = &swap_cgroup_ctrl[type];
 
 	mappage = ctrl->map[idx];
@@ -378,9 +375,6 @@ unsigned short lookup_swap_cgroup(swp_entry_t ent)
 	struct swap_cgroup *sc;
 	unsigned short ret;
 
-	if (!do_swap_account)
-		return 0;
-
 	ctrl = &swap_cgroup_ctrl[type];
 	mappage = ctrl->map[idx];
 	sc = page_address(mappage);
diff --git a/mm/rmap.c b/mm/rmap.c
index c9ccc1a72dc..836c6c63e1f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -703,8 +703,10 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-	if (atomic_inc_and_test(&page->_mapcount))
+	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
+		mem_cgroup_update_mapped_file_stat(page, 1);
+	}
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -753,6 +755,7 @@ void page_remove_rmap(struct page *page)
 			mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page,
 			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+		mem_cgroup_update_mapped_file_stat(page, -1);
 		/*
 		 * It would be tidy to reset the PageAnon mapping here,
 		 * but that might overwrite a racing page_add_anon_rmap
diff --git a/mm/slab.c b/mm/slab.c
index d08692303f6..e74a16e4ced 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -305,12 +305,6 @@ struct kmem_list3 {
 };
 
 /*
- * The slab allocator is initialized with interrupts disabled. Therefore, make
- * sure early boot allocations don't accidentally enable interrupts.
- */
-static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
-
-/*
  * Need this for bootstrapping a per node allocator.
  */
 #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
@@ -1559,11 +1553,6 @@ void __init kmem_cache_init_late(void)
 {
 	struct kmem_cache *cachep;
 
-	/*
-	 * Interrupts are enabled now so all GFP allocations are safe.
-	 */
-	slab_gfp_mask = __GFP_BITS_MASK;
-
 	/* 6) resize the head arrays to their final sizes */
 	mutex_lock(&cache_chain_mutex);
 	list_for_each_entry(cachep, &cache_chain, next)
@@ -3307,7 +3296,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	unsigned long save_flags;
 	void *ptr;
 
-	flags &= slab_gfp_mask;
+	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
 
@@ -3392,7 +3381,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	unsigned long save_flags;
 	void *objp;
 
-	flags &= slab_gfp_mask;
+	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
 
diff --git a/mm/slub.c b/mm/slub.c
index 4c6449310a0..ce62b770e2f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -179,12 +179,6 @@ static enum {
 	SYSFS		/* Sysfs up */
 } slab_state = DOWN;
 
-/*
- * The slab allocator is initialized with interrupts disabled. Therefore, make
- * sure early boot allocations don't accidentally enable interrupts.
- */
-static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
-
 /* A list of all slab caches on the system */
 static DECLARE_RWSEM(slub_lock);
 static LIST_HEAD(slab_caches);
@@ -1692,7 +1686,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	unsigned long flags;
 	unsigned int objsize;
 
-	gfpflags &= slab_gfp_mask;
+	gfpflags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfpflags);
 	might_sleep_if(gfpflags & __GFP_WAIT);
@@ -3220,10 +3214,6 @@ void __init kmem_cache_init(void)
 
 void __init kmem_cache_init_late(void)
 {
-	/*
-	 * Interrupts are enabled now so all GFP allocations are safe.
-	 */
-	slab_gfp_mask = __GFP_BITS_MASK;
 }
 
 /*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 28faa01cf57..d1ade1a48ee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -583,8 +583,9 @@ static int swap_entry_free(struct swap_info_struct *p,
 			swap_list.next = p - swap_info;
 		nr_swap_pages++;
 		p->inuse_pages--;
-		mem_cgroup_uncharge_swap(ent);
 	}
+	if (!swap_count(count))
+		mem_cgroup_uncharge_swap(ent);
 	return count;
 }
 
@@ -609,12 +610,19 @@ void swap_free(swp_entry_t entry)
 void swapcache_free(swp_entry_t entry, struct page *page)
 {
 	struct swap_info_struct *p;
+	int ret;
 
-	if (page)
-		mem_cgroup_uncharge_swapcache(page, entry);
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, entry, SWAP_CACHE);
+		ret = swap_entry_free(p, entry, SWAP_CACHE);
+		if (page) {
+			bool swapout;
+			if (ret)
+				swapout = true; /* the end of swap out */
+			else
+				swapout = false; /* no more swap users! */
+			mem_cgroup_uncharge_swapcache(page, entry, swapout);
+		}
 		spin_unlock(&swap_lock);
 	}
 	return;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4139aa52b94..e8fa2d9eb21 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -837,7 +837,6 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 		 */
 		ClearPageLRU(page);
 		ret = 0;
-		mem_cgroup_del_lru(page);
 	}
 
 	return ret;
@@ -885,12 +884,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		switch (__isolate_lru_page(page, mode, file)) {
 		case 0:
 			list_move(&page->lru, dst);
+			mem_cgroup_del_lru(page);
 			nr_taken++;
 			break;
 
 		case -EBUSY:
 			/* else it is being freed elsewhere */
 			list_move(&page->lru, src);
+			mem_cgroup_rotate_lru_list(page, page_lru(page));
 			continue;
 
 		default:
@@ -931,6 +932,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				continue;
 			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
 				list_move(&cursor_page->lru, dst);
+				mem_cgroup_del_lru(page);
 				nr_taken++;
 				scan++;
 			}
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 2b706617c89..7a7778746ea 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -116,6 +116,17 @@ _a_flags       = $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(KBUILD_SUBDIR_ASFLAGS) \
                  $(asflags-y) $(AFLAGS_$(basetarget).o)
 _cpp_flags     = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F))
 
+#
+# Enable gcov profiling flags for a file, directory or for all files depending
+# on variables GCOV_PROFILE_obj.o, GCOV_PROFILE and CONFIG_GCOV_PROFILE_ALL
+# (in this order)
+#
+ifeq ($(CONFIG_GCOV_KERNEL),y)
+_c_flags += $(if $(patsubst n%,, \
+		$(GCOV_PROFILE_$(basetarget).o)$(GCOV_PROFILE)$(CONFIG_GCOV_PROFILE_ALL)), \
+		$(CFLAGS_GCOV))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 5fda7df1972..b8186bac8b7 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -490,7 +490,7 @@ int devcgroup_inode_permission(struct inode *inode, int mask)
 
 	list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) {
 		if (wh->type & DEV_ALL)
-			goto acc_check;
+			goto found;
 		if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode))
 			continue;
 		if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode))
@@ -499,11 +499,12 @@ int devcgroup_inode_permission(struct inode *inode, int mask)
 			continue;
 		if (wh->minor != ~0 && wh->minor != iminor(inode))
 			continue;
-acc_check:
+
 		if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE))
 			continue;
 		if ((mask & MAY_READ) && !(wh->access & ACC_READ))
 			continue;
+found:
 		rcu_read_unlock();
 		return 0;
 	}
@@ -527,7 +528,7 @@ int devcgroup_inode_mknod(int mode, dev_t dev)
 
 	list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) {
 		if (wh->type & DEV_ALL)
-			goto acc_check;
+			goto found;
 		if ((wh->type & DEV_BLOCK) && !S_ISBLK(mode))
 			continue;
 		if ((wh->type & DEV_CHAR) && !S_ISCHR(mode))
@@ -536,9 +537,10 @@ int devcgroup_inode_mknod(int mode, dev_t dev)
 			continue;
 		if (wh->minor != ~0 && wh->minor != MINOR(dev))
 			continue;
-acc_check:
+
 		if (!(wh->access & ACC_MKNOD))
 			continue;
+found:
 		rcu_read_unlock();
 		return 0;
 	}