1 files changed, 0 insertions, 52 deletions
diff --git a/arch/powerpc/platforms/512x/mpc5121_generic.c b/arch/powerpc/platforms/512x/mpc5121_generic.c
deleted file mode 100644
index 926731f1ff0..00000000000
--- a/arch/powerpc/platforms/512x/mpc5121_generic.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2007,2008 Freescale Semiconductor, Inc. All rights reserved.
- *
- * Author: John Rigby, <jrigby@freescale.com>
- *
- * Description:
- * MPC5121 SoC setup
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/of_platform.h>
-
-#include <asm/machdep.h>
-#include <asm/ipic.h>
-#include <asm/prom.h>
-#include <asm/time.h>
-
-#include "mpc512x.h"
-
-/*
- * list of supported boards
- */
-static const char *board[] __initdata = {
-	"prt,prtlvt",
-	NULL
-};
-
-/*
- * Called very early, MMU is off, device-tree isn't unflattened
- */
-static int __init mpc5121_generic_probe(void)
-{
-	return of_flat_dt_match(of_get_flat_dt_root(), board);
-}
-
-define_machine(mpc5121_generic) {
-	.name			= "MPC5121 generic",
-	.probe			= mpc5121_generic_probe,
-	.init			= mpc512x_init,
-	.init_early		= mpc512x_init_diu,
-	.setup_arch		= mpc512x_setup_diu,
-	.init_IRQ		= mpc512x_init_IRQ,
-	.get_irq		= ipic_get_irq,
-	.calibrate_decr		= generic_calibrate_decr,
-	.restart		= mpc512x_restart,
-};
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 7868f4dc1d0..c348eaee7ee 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -1,30 +1,9 @@
 comment "Processor Type"
 
-config CPU_32
-	bool
-	default y
-
 # Select CPU types depending on the architecture selected.  This selects
 # which CPUs we support in the kernel image, and the compiler instruction
 # optimiser behaviour.
 
-# ARM610
-config CPU_ARM610
-	bool "Support ARM610 processor"
-	depends on ARCH_RPC
-	select CPU_32v3
-	select CPU_CACHE_V3
-	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
-	select CPU_COPY_V3 if MMU
-	select CPU_TLB_V3 if MMU
-	help
-	  The ARM610 is the successor to the ARM3 processor
-	  and was produced by VLSI Technology Inc.
-
-	  Say Y if you want support for the ARM610 processor.
-	  Otherwise, say N.
-
 # ARM7TDMI
 config CPU_ARM7TDMI
 	bool "Support ARM7TDMI processor"
@@ -32,6 +11,7 @@ config CPU_ARM7TDMI
 	select CPU_32v4T
 	select CPU_ABRT_LV4T
 	select CPU_CACHE_V4
+	select CPU_PABRT_LEGACY
 	help
 	  A 32-bit RISC microprocessor based on the ARM7 processor core
 	  which has no memory control unit and cache.
@@ -39,35 +19,16 @@ config CPU_ARM7TDMI
 	  Say Y if you want support for the ARM7TDMI processor.
 	  Otherwise, say N.
 
-# ARM710
-config CPU_ARM710
-	bool "Support ARM710 processor" if !ARCH_CLPS7500 && ARCH_RPC
-	default y if ARCH_CLPS7500
-	select CPU_32v3
-	select CPU_CACHE_V3
-	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
-	select CPU_COPY_V3 if MMU
-	select CPU_TLB_V3 if MMU
-	help
-	  A 32-bit RISC microprocessor based on the ARM7 processor core
-	  designed by Advanced RISC Machines Ltd. The ARM710 is the
-	  successor to the ARM610 processor. It was released in
-	  July 1994 by VLSI Technology Inc.
-
-	  Say Y if you want support for the ARM710 processor.
-	  Otherwise, say N.
-
 # ARM720T
 config CPU_ARM720T
-	bool "Support ARM720T processor" if !ARCH_CLPS711X && !ARCH_L7200 && !ARCH_CDB89712 && ARCH_INTEGRATOR
-	default y if ARCH_CLPS711X || ARCH_L7200 || ARCH_CDB89712 || ARCH_H720X
+	bool "Support ARM720T processor" if ARCH_INTEGRATOR
 	select CPU_32v4T
 	select CPU_ABRT_LV4T
 	select CPU_CACHE_V4
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WT if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WT if MMU
 	help
 	  A 32-bit RISC processor with 8kByte Cache, Write Buffer and
@@ -82,8 +43,9 @@ config CPU_ARM740T
 	depends on !MMU
 	select CPU_32v4T
 	select CPU_ABRT_LV4T
-	select CPU_CACHE_V3	# although the core is v4t
+	select CPU_CACHE_V4
 	select CPU_CP15_MPU
+	select CPU_PABRT_LEGACY
 	help
 	  A 32-bit RISC processor with 8KB cache or 4KB variants,
 	  write buffer and MPU(Protection Unit) built around
@@ -99,6 +61,7 @@ config CPU_ARM9TDMI
 	select CPU_32v4T
 	select CPU_ABRT_NOMMU
 	select CPU_CACHE_V4
+	select CPU_PABRT_LEGACY
 	help
 	  A 32-bit RISC microprocessor based on the ARM9 processor core
 	  which has no memory control unit and cache.
@@ -108,22 +71,18 @@ config CPU_ARM9TDMI
 
 # ARM920T
 config CPU_ARM920T
-	bool "Support ARM920T processor"
-	depends on ARCH_EP93XX || ARCH_INTEGRATOR || CPU_S3C2410 || CPU_S3C2440 || CPU_S3C2442 || ARCH_IMX || ARCH_AAEC2000 || ARCH_AT91RM9200
-	default y if CPU_S3C2410 || CPU_S3C2440 || CPU_S3C2442 || ARCH_AT91RM9200
+	bool "Support ARM920T processor" if ARCH_INTEGRATOR
 	select CPU_32v4T
 	select CPU_ABRT_EV4T
 	select CPU_CACHE_V4WT
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	help
 	  The ARM920T is licensed to be produced by numerous vendors,
-	  and is used in the Maverick EP9312 and the Samsung S3C2410.
-
-	  More information on the Maverick EP9312 at
-	  <http://linuxdevices.com/products/PD2382866068.html>.
+	  and is used in the Cirrus EP93xx and the Samsung S3C2410.
 
 	  Say Y if you want support for the ARM920T processor.
 	  Otherwise, say N.
@@ -131,14 +90,13 @@ config CPU_ARM920T
 # ARM922T
 config CPU_ARM922T
 	bool "Support ARM922T processor" if ARCH_INTEGRATOR
-	depends on ARCH_LH7A40X || ARCH_INTEGRATOR || ARCH_KS8695
-	default y if ARCH_LH7A40X || ARCH_KS8695
 	select CPU_32v4T
 	select CPU_ABRT_EV4T
 	select CPU_CACHE_V4WT
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	help
 	  The ARM922T is a version of the ARM920T, but with smaller
@@ -151,14 +109,13 @@ config CPU_ARM922T
 # ARM925T
 config CPU_ARM925T
  	bool "Support ARM925T processor" if ARCH_OMAP1
- 	depends on ARCH_OMAP15XX
- 	default y if ARCH_OMAP15XX
 	select CPU_32v4T
 	select CPU_ABRT_EV4T
 	select CPU_CACHE_V4WT
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
  	help
  	  The ARM925T is a mix between the ARM920T and ARM926T, but with
@@ -170,14 +127,13 @@ config CPU_ARM925T
 
 # ARM926T
 config CPU_ARM926T
-	bool "Support ARM926T processor"
-	depends on ARCH_INTEGRATOR || ARCH_VERSATILE_PB || MACH_VERSATILE_AB || ARCH_OMAP730 || ARCH_OMAP16XX || MACH_REALVIEW_EB || ARCH_PNX4008 || ARCH_NETX || CPU_S3C2412 || ARCH_AT91SAM9260 || ARCH_AT91SAM9261 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_NS9XXX || ARCH_DAVINCI
-	default y if ARCH_VERSATILE_PB || MACH_VERSATILE_AB || ARCH_OMAP730 || ARCH_OMAP16XX || ARCH_PNX4008 || ARCH_NETX || CPU_S3C2412 || ARCH_AT91SAM9260 || ARCH_AT91SAM9261 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_NS9XXX || ARCH_DAVINCI
+	bool "Support ARM926T processor" if ARCH_INTEGRATOR || MACH_REALVIEW_EB
 	select CPU_32v5
 	select CPU_ABRT_EV5TJ
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	help
 	  This is a variant of the ARM920.  It has slightly different
@@ -187,6 +143,24 @@ config CPU_ARM926T
 	  Say Y if you want support for the ARM926T processor.
 	  Otherwise, say N.
 
+# FA526
+config CPU_FA526
+	bool
+	select CPU_32v4
+	select CPU_ABRT_EV4
+	select CPU_CACHE_FA
+	select CPU_CACHE_VIVT
+	select CPU_COPY_FA if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
+	select CPU_TLB_FA if MMU
+	help
+	  The FA526 is a version of the ARMv4 compatible processor with
+	  Branch Target Buffer, Unified TLB and cache line size 16.
+
+	  Say Y if you want support for the FA526 processor.
+	  Otherwise, say N.
+
 # ARM940T
 config CPU_ARM940T
 	bool "Support ARM940T processor" if ARCH_INTEGRATOR
@@ -195,6 +169,7 @@ config CPU_ARM940T
 	select CPU_ABRT_NOMMU
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MPU
+	select CPU_PABRT_LEGACY
 	help
 	  ARM940T is a member of the ARM9TDMI family of general-
 	  purpose microprocessors with MPU and separate 4KB
@@ -212,6 +187,7 @@ config CPU_ARM946E
 	select CPU_ABRT_NOMMU
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MPU
+	select CPU_PABRT_LEGACY
 	help
 	  ARM946E-S is a member of the ARM9E-S family of high-
 	  performance, 32-bit system-on-chip processor solutions.
@@ -222,14 +198,14 @@ config CPU_ARM946E
 
 # ARM1020 - needs validating
 config CPU_ARM1020
-	bool "Support ARM1020T (rev 0) processor"
-	depends on ARCH_INTEGRATOR
+	bool "Support ARM1020T (rev 0) processor" if ARCH_INTEGRATOR
 	select CPU_32v5
 	select CPU_ABRT_EV4T
 	select CPU_CACHE_V4WT
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	help
 	  The ARM1020 is the 32K cached version of the ARM10 processor,
@@ -240,26 +216,26 @@ config CPU_ARM1020
 
 # ARM1020E - needs validating
 config CPU_ARM1020E
-	bool "Support ARM1020E processor"
-	depends on ARCH_INTEGRATOR
+	bool "Support ARM1020E processor" if ARCH_INTEGRATOR
+	depends on n
 	select CPU_32v5
 	select CPU_ABRT_EV4T
 	select CPU_CACHE_V4WT
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
-	depends on n
 
 # ARM1022E
 config CPU_ARM1022
-	bool "Support ARM1022E processor"
-	depends on ARCH_INTEGRATOR
+	bool "Support ARM1022E processor" if ARCH_INTEGRATOR
 	select CPU_32v5
 	select CPU_ABRT_EV4T
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU # can probably do better
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	help
 	  The ARM1022E is an implementation of the ARMv5TE architecture
@@ -271,13 +247,13 @@ config CPU_ARM1022
 
 # ARM1026EJ-S
 config CPU_ARM1026
-	bool "Support ARM1026EJ-S processor"
-	depends on ARCH_INTEGRATOR
+	bool "Support ARM1026EJ-S processor" if ARCH_INTEGRATOR
 	select CPU_32v5
 	select CPU_ABRT_EV5T # But need Jazelle, but EV5TJ ignores bit 10
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU # can probably do better
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	help
 	  The ARM1026EJ-S is an implementation of the ARMv5TEJ architecture
@@ -288,15 +264,15 @@ config CPU_ARM1026
 
 # SA110
 config CPU_SA110
-	bool "Support StrongARM(R) SA-110 processor" if !ARCH_EBSA110 && !FOOTBRIDGE && !ARCH_TBOX && !ARCH_SHARK && !ARCH_NEXUSPCI && ARCH_RPC
-	default y if ARCH_EBSA110 || FOOTBRIDGE || ARCH_TBOX || ARCH_SHARK || ARCH_NEXUSPCI
+	bool
 	select CPU_32v3 if ARCH_RPC
 	select CPU_32v4 if !ARCH_RPC
 	select CPU_ABRT_EV4
 	select CPU_CACHE_V4WB
 	select CPU_CACHE_VIVT
-	select CPU_CP15_MMU
 	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WB if MMU
 	help
 	  The Intel StrongARM(R) SA-110 is a 32-bit microprocessor and
@@ -310,107 +286,177 @@ config CPU_SA110
 # SA1100
 config CPU_SA1100
 	bool
-	depends on ARCH_SA1100
-	default y
 	select CPU_32v4
 	select CPU_ABRT_EV4
 	select CPU_CACHE_V4WB
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WB if MMU
 
 # XScale
 config CPU_XSCALE
 	bool
-	depends on ARCH_IOP32X || ARCH_IOP33X || PXA25x || PXA27x || ARCH_IXP4XX || ARCH_IXP2000
-	default y
 	select CPU_32v5
 	select CPU_ABRT_EV5T
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 
 # XScale Core Version 3
 config CPU_XSC3
 	bool
-	depends on ARCH_IXP23XX || ARCH_IOP13XX || PXA3xx
-	default y
 	select CPU_32v5
 	select CPU_ABRT_EV5T
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
 	select CPU_TLB_V4WBI if MMU
 	select IO_36
 
+# Marvell PJ1 (Mohawk)
+config CPU_MOHAWK
+	bool
+	select CPU_32v5
+	select CPU_ABRT_EV5T
+	select CPU_CACHE_VIVT
+	select CPU_COPY_V4WB if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
+	select CPU_TLB_V4WBI if MMU
+
+# Feroceon
+config CPU_FEROCEON
+	bool
+	select CPU_32v5
+	select CPU_ABRT_EV5T
+	select CPU_CACHE_VIVT
+	select CPU_COPY_FEROCEON if MMU
+	select CPU_CP15_MMU
+	select CPU_PABRT_LEGACY
+	select CPU_TLB_FEROCEON if MMU
+
+config CPU_FEROCEON_OLD_ID
+	bool "Accept early Feroceon cores with an ARM926 ID"
+	depends on CPU_FEROCEON && !CPU_ARM926T
+	default y
+	help
+	  This enables the usage of some old Feroceon cores
+	  for which the CPU ID is equal to the ARM926 ID.
+	  Relevant for Feroceon-1850 and early Feroceon-2850.
+
+# Marvell PJ4
+config CPU_PJ4
+	bool
+	select ARM_THUMBEE
+	select CPU_V7
+
+config CPU_PJ4B
+	bool
+	select CPU_V7
+
 # ARMv6
 config CPU_V6
-	bool "Support ARM V6 processor"
-	depends on ARCH_INTEGRATOR || MACH_REALVIEW_EB || ARCH_OMAP2 || ARCH_MX3
-	default y if ARCH_MX3
+	bool "Support ARM V6 processor" if ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX
 	select CPU_32v6
 	select CPU_ABRT_EV6
 	select CPU_CACHE_V6
 	select CPU_CACHE_VIPT
+	select CPU_COPY_V6 if MMU
 	select CPU_CP15_MMU
 	select CPU_HAS_ASID if MMU
-	select CPU_COPY_V6 if MMU
+	select CPU_PABRT_V6
 	select CPU_TLB_V6 if MMU
 
 # ARMv6k
-config CPU_32v6K
-	bool "Support ARM V6K processor extensions" if !SMP
-	depends on CPU_V6
-	default y if SMP && !ARCH_MX3
-	help
-	  Say Y here if your ARMv6 processor supports the 'K' extension.
-	  This enables the kernel to use some instructions not present
-	  on previous processors, and as such a kernel build with this
-	  enabled will not boot on processors with do not support these
-	  instructions.
+config CPU_V6K
+	bool "Support ARM V6K processor" if ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX
+	select CPU_32v6
+	select CPU_32v6K
+	select CPU_ABRT_EV6
+	select CPU_CACHE_V6
+	select CPU_CACHE_VIPT
+	select CPU_COPY_V6 if MMU
+	select CPU_CP15_MMU
+	select CPU_HAS_ASID if MMU
+	select CPU_PABRT_V6
+	select CPU_TLB_V6 if MMU
 
 # ARMv7
 config CPU_V7
-	bool "Support ARM V7 processor"
-	depends on ARCH_INTEGRATOR
+	bool "Support ARM V7 processor" if ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX
 	select CPU_32v6K
 	select CPU_32v7
 	select CPU_ABRT_EV7
 	select CPU_CACHE_V7
 	select CPU_CACHE_VIPT
-	select CPU_CP15_MMU
-	select CPU_HAS_ASID if MMU
 	select CPU_COPY_V6 if MMU
+	select CPU_CP15_MMU if MMU
+	select CPU_CP15_MPU if !MMU
+	select CPU_HAS_ASID if MMU
+	select CPU_PABRT_V7
 	select CPU_TLB_V7 if MMU
 
+# ARMv7M
+config CPU_V7M
+	bool
+	select CPU_32v7M
+	select CPU_ABRT_NOMMU
+	select CPU_CACHE_NOP
+	select CPU_PABRT_LEGACY
+	select CPU_THUMBONLY
+
+config CPU_THUMBONLY
+	bool
+	# There are no CPUs available with MMU that don't implement an ARM ISA:
+	depends on !MMU
+	help
+	  Select this if your CPU doesn't support the 32 bit ARM instructions.
+
 # Figure out what processor architecture version we should be using.
 # This defines the compiler instruction set which depends on the machine type.
 config CPU_32v3
 	bool
-	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
+	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v4
 	bool
-	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
+	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v4T
 	bool
-	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
+	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v5
 	bool
-	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
+	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v6
 	bool
 	select TLS_REG_EMUL if !CPU_32v6K && !MMU
 
+config CPU_32v6K
+	bool
+
 config CPU_32v7
 	bool
 
+config CPU_32v7M
+	bool
+
 # The abort model
 config CPU_ABRT_NOMMU
 	bool
@@ -436,10 +482,16 @@ config CPU_ABRT_EV6
 config CPU_ABRT_EV7
 	bool
 
-# The cache model
-config CPU_CACHE_V3
+config CPU_PABRT_LEGACY
+	bool
+
+config CPU_PABRT_V6
 	bool
 
+config CPU_PABRT_V7
+	bool
+
+# The cache model
 config CPU_CACHE_V4
 	bool
 
@@ -455,32 +507,36 @@ config CPU_CACHE_V6
 config CPU_CACHE_V7
 	bool
 
+config CPU_CACHE_NOP
+	bool
+
 config CPU_CACHE_VIVT
 	bool
 
 config CPU_CACHE_VIPT
 	bool
 
-if MMU
-# The copy-page model
-config CPU_COPY_V3
+config CPU_CACHE_FA
 	bool
 
+if MMU
+# The copy-page model
 config CPU_COPY_V4WT
 	bool
 
 config CPU_COPY_V4WB
 	bool
 
-config CPU_COPY_V6
+config CPU_COPY_FEROCEON
 	bool
 
-# This selects the TLB model
-config CPU_TLB_V3
+config CPU_COPY_FA
+	bool
+
+config CPU_COPY_V6
 	bool
-	help
-	  ARM Architecture Version 3 TLB.
 
+# This selects the TLB model
 config CPU_TLB_V4WT
 	bool
 	help
@@ -497,12 +553,26 @@ config CPU_TLB_V4WBI
 	  ARM Architecture Version 4 TLB with writeback cache and invalidate
 	  instruction cache entry.
 
+config CPU_TLB_FEROCEON
+	bool
+	help
+	  Feroceon TLB (v4wbi with non-outer-cacheable page table walks).
+
+config CPU_TLB_FA
+	bool
+	help
+	  Faraday ARM FA526 architecture, unified TLB with writeback cache
+	  and invalidate instruction cache entry. Branch target buffer is
+	  also supported.
+
 config CPU_TLB_V6
 	bool
 
 config CPU_TLB_V7
 	bool
 
+config VERIFY_PERMISSION_FAULT
+	bool
 endif
 
 config CPU_HAS_ASID
@@ -528,6 +598,12 @@ config CPU_CP15_MPU
 	help
 	  Processor has the CP15 register, which has MPU related registers.
 
+config CPU_USE_DOMAINS
+	bool
+	help
+	  This option enables or disables the use of domain switching
+	  via the set_fs() function.
+
 #
 # CPU supports 36-bit I/O
 #
@@ -536,9 +612,31 @@ config IO_36
 
 comment "Processor Features"
 
+config ARM_LPAE
+	bool "Support for the Large Physical Address Extension"
+	depends on MMU && CPU_32v7 && !CPU_32v6 && !CPU_32v5 && \
+		!CPU_32v4 && !CPU_32v3
+	help
+	  Say Y if you have an ARMv7 processor supporting the LPAE page
+	  table format and you would like to access memory beyond the
+	  4GB limit. The resulting kernel image will not run on
+	  processors without the LPA extension.
+
+	  If unsure, say N.
+
+config ARCH_PHYS_ADDR_T_64BIT
+	def_bool ARM_LPAE
+
+config ARCH_DMA_ADDR_T_64BIT
+	bool
+
 config ARM_THUMB
-	bool "Support Thumb user binaries"
-	depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || CPU_XSCALE || CPU_XSC3 || CPU_V6 || CPU_V7
+	bool "Support Thumb user binaries" if !CPU_THUMBONLY
+	depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || \
+		CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || \
+		CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || \
+		CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || \
+		CPU_V7 || CPU_FEROCEON || CPU_V7M
 	default y
 	help
 	  Say Y if you want to include kernel support for running user space
@@ -550,6 +648,53 @@ config ARM_THUMB
 
 	  If you don't know what this all is, saying Y is a safe choice.
 
+config ARM_THUMBEE
+	bool "Enable ThumbEE CPU extension"
+	depends on CPU_V7
+	help
+	  Say Y here if you have a CPU with the ThumbEE extension and code to
+	  make use of it. Say N for code that can run on CPUs without ThumbEE.
+
+config ARM_VIRT_EXT
+	bool
+	depends on MMU
+	default y if CPU_V7
+	help
+	  Enable the kernel to make use of the ARM Virtualization
+	  Extensions to install hypervisors without run-time firmware
+	  assistance.
+
+	  A compliant bootloader is required in order to make maximum
+	  use of this feature.  Refer to Documentation/arm/Booting for
+	  details.
+
+config SWP_EMULATE
+	bool "Emulate SWP/SWPB instructions"
+	depends on CPU_V7
+	default y if SMP
+	select HAVE_PROC_CPU if PROC_FS
+	help
+	  ARMv6 architecture deprecates use of the SWP/SWPB instructions.
+	  ARMv7 multiprocessing extensions introduce the ability to disable
+	  these instructions, triggering an undefined instruction exception
+	  when executed. Say Y here to enable software emulation of these
+	  instructions for userspace (not kernel) using LDREX/STREX.
+	  Also creates /proc/cpu/swp_emulation for statistics.
+
+	  In some older versions of glibc [<=2.8] SWP is used during futex
+	  trylock() operations with the assumption that the code will not
+	  be preempted. This invalid assumption may be more likely to fail
+	  with SWP emulation enabled, leading to deadlock of the user
+	  application.
+
+	  NOTE: when accessing uncached shared regions, LDREX/STREX rely
+	  on an external transaction monitoring block called a global
+	  monitor to maintain update atomicity. If your system does not
+	  implement a global monitor, this option can cause programs that
+	  perform SWP operations to uncached memory to deadlock.
+
+	  If unsure, say Y.
+
 config CPU_BIG_ENDIAN
 	bool "Build big-endian kernel"
 	depends on ARCH_SUPPORTS_BIG_ENDIAN
@@ -559,13 +704,26 @@ config CPU_BIG_ENDIAN
 	  port must properly enable any big-endian related features
 	  of your chipset/board/processor.
 
+config CPU_ENDIAN_BE8
+	bool
+	depends on CPU_BIG_ENDIAN
+	default CPU_V6 || CPU_V6K || CPU_V7
+	help
+	  Support for the BE-8 (big-endian) mode on ARMv6 and ARMv7 processors.
+
+config CPU_ENDIAN_BE32
+	bool
+	depends on CPU_BIG_ENDIAN
+	default !CPU_ENDIAN_BE8
+	help
+	  Support for the BE-32 (big-endian) mode on pre-ARMv6 processors.
+
 config CPU_HIGH_VECTOR
 	depends on !MMU && CPU_CP15 && !CPU_ARM740T
 	bool "Select the High exception vector"
-	default n
 	help
 	  Say Y here to select high exception vector(0xFFFF0000~).
-	  The exception vector can be vary depending on the platform
+	  The exception vector can vary depending on the platform
 	  design in nommu mode. If your platform needs to select
 	  high exception vector, say Y.
 	  Otherwise or if you are unsure, say N, and the low exception
@@ -573,7 +731,7 @@ config CPU_HIGH_VECTOR
 
 config CPU_ICACHE_DISABLE
 	bool "Disable I-Cache (I-bit)"
-	depends on CPU_CP15 && !(CPU_ARM610 || CPU_ARM710 || CPU_ARM720T || CPU_ARM740T || CPU_XSCALE || CPU_XSC3)
+	depends on CPU_CP15 && !(CPU_ARM720T || CPU_ARM740T || CPU_XSCALE || CPU_XSC3)
 	help
 	  Say Y here to disable the processor instruction cache. Unless
 	  you have a reason not to or are unsure, say N.
@@ -600,7 +758,7 @@ config CPU_DCACHE_SIZE
 
 config CPU_DCACHE_WRITETHROUGH
 	bool "Force write through D-cache"
-	depends on (CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || CPU_ARM1020) && !CPU_DCACHE_DISABLE
+	depends on (CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || CPU_ARM1020 || CPU_FA526) && !CPU_DCACHE_DISABLE
 	default y if CPU_ARM925T
 	help
 	  Say Y here to use the data cache in writethrough mode. Unless you
@@ -615,39 +773,237 @@ config CPU_CACHE_ROUND_ROBIN
 
 config CPU_BPREDICT_DISABLE
 	bool "Disable branch prediction"
-	depends on CPU_ARM1020 || CPU_V6 || CPU_XSC3 || CPU_V7
+	depends on CPU_ARM1020 || CPU_V6 || CPU_V6K || CPU_MOHAWK || CPU_XSC3 || CPU_V7 || CPU_FA526
 	help
 	  Say Y here to disable branch prediction.  If unsure, say N.
 
 config TLS_REG_EMUL
 	bool
+	select NEED_KUSER_HELPERS
 	help
 	  An SMP system using a pre-ARMv6 processor (there are apparently
 	  a few prototypes like that in existence) and therefore access to
 	  that required register must be emulated.
 
-config HAS_TLS_REG
-	bool
-	depends on !TLS_REG_EMUL
-	default y if SMP || CPU_32v7
-	help
-	  This selects support for the CP15 thread register.
-	  It is defined to be available on some ARMv6 processors (including
-	  all SMP capable ARMv6's) or later processors.  User space may
-	  assume directly accessing that register and always obtain the
-	  expected value only on ARMv7 and above.
-
 config NEEDS_SYSCALL_FOR_CMPXCHG
 	bool
+	select NEED_KUSER_HELPERS
 	help
 	  SMP on a pre-ARMv6 processor?  Well OK then.
 	  Forget about fast user space cmpxchg support.
 	  It is just not possible.
 
+config NEED_KUSER_HELPERS
+	bool
+
+config KUSER_HELPERS
+	bool "Enable kuser helpers in vector page" if !NEED_KUSER_HELPERS
+	default y
+	help
+	  Warning: disabling this option may break user programs.
+
+	  Provide kuser helpers in the vector page.  The kernel provides
+	  helper code to userspace in read only form at a fixed location
+	  in the high vector page to allow userspace to be independent of
+	  the CPU type fitted to the system.  This permits binaries to be
+	  run on ARMv4 through to ARMv7 without modification.
+
+	  See Documentation/arm/kernel_user_helpers.txt for details.
+
+	  However, the fixed address nature of these helpers can be used
+	  by ROP (return-oriented programming) authors when creating
+	  exploits.
+
+	  If all of the binaries and libraries which run on your platform
+	  are built specifically for your platform, and make no use of
+	  these helpers, then you can turn this option off to hinder
+	  such exploits. However, in that case, if a binary or library
+	  relying on those helpers is run, it will receive a SIGILL signal,
+	  which will terminate the program.
+
+	  Say N here only if you are absolutely certain that you do not
+	  need these helpers; otherwise, the safe option is to say Y.
+
+config DMA_CACHE_RWFO
+	bool "Enable read/write for ownership DMA cache maintenance"
+	depends on CPU_V6K && SMP
+	default y
+	help
+	  The Snoop Control Unit on ARM11MPCore does not detect the
+	  cache maintenance operations and the dma_{map,unmap}_area()
+	  functions may leave stale cache entries on other CPUs. By
+	  enabling this option, Read or Write For Ownership in the ARMv6
+	  DMA cache maintenance functions is performed. These LDR/STR
+	  instructions change the cache line state to shared or modified
+	  so that the cache operation has the desired effect.
+
+	  Note that the workaround is only valid on processors that do
+	  not perform speculative loads into the D-cache. For such
+	  processors, if cache maintenance operations are not broadcast
+	  in hardware, other workarounds are needed (e.g. cache
+	  maintenance broadcasting in software via FIQ).
+
 config OUTER_CACHE
 	bool
-	default n
+
+config OUTER_CACHE_SYNC
+	bool
+	help
+	  The outer cache has an outer_cache_fns.sync function pointer
+	  that can be used to drain the write buffer of the outer cache.
+
+config CACHE_FEROCEON_L2
+	bool "Enable the Feroceon L2 cache controller"
+	depends on ARCH_KIRKWOOD || ARCH_MV78XX0 || ARCH_MVEBU
+	default y
+	select OUTER_CACHE
+	help
+	  This option enables the Feroceon L2 cache controller.
+
+config CACHE_FEROCEON_L2_WRITETHROUGH
+	bool "Force Feroceon L2 cache write through"
+	depends on CACHE_FEROCEON_L2
+	help
+	  Say Y here to use the Feroceon L2 cache in writethrough mode.
+	  Unless you specifically require this, say N for writeback mode.
+
+config MIGHT_HAVE_CACHE_L2X0
+	bool
+	help
+	  This option should be selected by machines which have a L2x0
+	  or PL310 cache controller, but where its use is optional.
+
+	  The only effect of this option is to make CACHE_L2X0 and
+	  related options available to the user for configuration.
+
+	  Boards or SoCs which always require the cache controller
+	  support to be present should select CACHE_L2X0 directly
+	  instead of this option, thus preventing the user from
+	  inadvertently configuring a broken kernel.
 
 config CACHE_L2X0
+	bool "Enable the L2x0 outer cache controller" if MIGHT_HAVE_CACHE_L2X0
+	default MIGHT_HAVE_CACHE_L2X0
+	select OUTER_CACHE
+	select OUTER_CACHE_SYNC
+	help
+	  This option enables the L2x0 PrimeCell.
+
+if CACHE_L2X0
+
+config CACHE_PL310
 	bool
+	default y if CPU_V7 && !(CPU_V6 || CPU_V6K)
+	help
+	  This option enables optimisations for the PL310 cache
+	  controller.
+
+config PL310_ERRATA_588369
+	bool "PL310 errata: Clean & Invalidate maintenance operations do not invalidate clean lines"
+	help
+	   The PL310 L2 cache controller implements three types of Clean &
+	   Invalidate maintenance operations: by Physical Address
+	   (offset 0x7F0), by Index/Way (0x7F8) and by Way (0x7FC).
+	   They are architecturally defined to behave as the execution of a
+	   clean operation followed immediately by an invalidate operation,
+	   both performing to the same memory location. This functionality
+	   is not correctly implemented in PL310 as clean lines are not
+	   invalidated as a result of these operations.
+
+config PL310_ERRATA_727915
+	bool "PL310 errata: Background Clean & Invalidate by Way operation can cause data corruption"
+	help
+	  PL310 implements the Clean & Invalidate by Way L2 cache maintenance
+	  operation (offset 0x7FC). This operation runs in background so that
+	  PL310 can handle normal accesses while it is in progress. Under very
+	  rare circumstances, due to this erratum, write data can be lost when
+	  PL310 treats a cacheable write transaction during a Clean &
+	  Invalidate by Way operation.
+
+config PL310_ERRATA_753970
+	bool "PL310 errata: cache sync operation may be faulty"
+	help
+	  This option enables the workaround for the 753970 PL310 (r3p0) erratum.
+
+	  Under some condition the effect of cache sync operation on
+	  the store buffer still remains when the operation completes.
+	  This means that the store buffer is always asked to drain and
+	  this prevents it from merging any further writes. The workaround
+	  is to replace the normal offset of cache sync operation (0x730)
+	  by another offset targeting an unmapped PL310 register 0x740.
+	  This has the same effect as the cache sync operation: store buffer
+	  drain and waiting for all buffers empty.
+
+config PL310_ERRATA_769419
+	bool "PL310 errata: no automatic Store Buffer drain"
+	help
+	  On revisions of the PL310 prior to r3p2, the Store Buffer does
+	  not automatically drain. This can cause normal, non-cacheable
+	  writes to be retained when the memory system is idle, leading
+	  to suboptimal I/O performance for drivers using coherent DMA.
+	  This option adds a write barrier to the cpu_idle loop so that,
+	  on systems with an outer cache, the store buffer is drained
+	  explicitly.
+
+endif
+
+config CACHE_TAUROS2
+	bool "Enable the Tauros2 L2 cache controller"
+	depends on (ARCH_DOVE || ARCH_MMP || CPU_PJ4)
+	default y
 	select OUTER_CACHE
+	help
+	  This option enables the Tauros2 L2 cache controller (as
+	  found on PJ1/PJ4).
+
+config CACHE_XSC3L2
+	bool "Enable the L2 cache on XScale3"
+	depends on CPU_XSC3
+	default y
+	select OUTER_CACHE
+	help
+	  This option enables the L2 cache on XScale3.
+
+config ARM_L1_CACHE_SHIFT_6
+	bool
+	default y if CPU_V7
+	help
+	  Setting ARM L1 cache line size to 64 Bytes.
+
+config ARM_L1_CACHE_SHIFT
+	int
+	default 6 if ARM_L1_CACHE_SHIFT_6
+	default 5
+
+config ARM_DMA_MEM_BUFFERABLE
+	bool "Use non-cacheable memory for DMA" if (CPU_V6 || CPU_V6K) && !CPU_V7
+	depends on !(MACH_REALVIEW_PB1176 || REALVIEW_EB_ARM11MP || \
+		     MACH_REALVIEW_PB11MP)
+	default y if CPU_V6 || CPU_V6K || CPU_V7
+	help
+	  Historically, the kernel has used strongly ordered mappings to
+	  provide DMA coherent memory.  With the advent of ARMv7, mapping
+	  memory with differing types results in unpredictable behaviour,
+	  so on these CPUs, this option is forced on.
+
+	  Multiple mappings with differing attributes is also unpredictable
+	  on ARMv6 CPUs, but since they do not have aggressive speculative
+	  prefetch, no harm appears to occur.
+
+	  However, drivers may be missing the necessary barriers for ARMv6,
+	  and therefore turning this on may result in unpredictable driver
+	  behaviour.  Therefore, we offer this as an option.
+
+	  You are recommended to say 'Y' here and debug any affected drivers.
+
+config ARCH_HAS_BARRIERS
+	bool
+	help
+	  This option allows the use of custom mandatory barriers
+	  included via the mach/barriers.h file.
+
+config ARCH_SUPPORTS_BIG_ENDIAN
+	bool
+	help
+	  This option specifies the architecture can support big endian
+	  operation.
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 762702765fc..91da64de440 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -2,20 +2,22 @@
 # Makefile for the linux arm-specific parts of the memory manager.
 #
 
-obj-y				:= consistent.o extable.o fault.o init.o \
+obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   iomap.o
 
-obj-$(CONFIG_MMU)		+= fault-armv.o flush.o ioremap.o mmap.o \
-				   pgd.o mmu.o
+obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
+				   mmap.o pgd.o mmu.o
 
 ifneq ($(CONFIG_MMU),y)
 obj-y				+= nommu.o
 endif
 
+obj-$(CONFIG_ARM_PTDUMP)	+= dump.o
 obj-$(CONFIG_MODULES)		+= proc-syms.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)	+= alignment.o
-obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
+obj-$(CONFIG_HIGHMEM)		+= highmem.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 
 obj-$(CONFIG_CPU_ABRT_NOMMU)	+= abort-nommu.o
 obj-$(CONFIG_CPU_ABRT_EV4)	+= abort-ev4.o
@@ -26,30 +28,44 @@ obj-$(CONFIG_CPU_ABRT_EV5TJ)	+= abort-ev5tj.o
 obj-$(CONFIG_CPU_ABRT_EV6)	+= abort-ev6.o
 obj-$(CONFIG_CPU_ABRT_EV7)	+= abort-ev7.o
 
-obj-$(CONFIG_CPU_CACHE_V3)	+= cache-v3.o
+AFLAGS_abort-ev6.o	:=-Wa,-march=armv6k
+AFLAGS_abort-ev7.o	:=-Wa,-march=armv7-a
+
+obj-$(CONFIG_CPU_PABRT_LEGACY)	+= pabort-legacy.o
+obj-$(CONFIG_CPU_PABRT_V6)	+= pabort-v6.o
+obj-$(CONFIG_CPU_PABRT_V7)	+= pabort-v7.o
+
 obj-$(CONFIG_CPU_CACHE_V4)	+= cache-v4.o
 obj-$(CONFIG_CPU_CACHE_V4WT)	+= cache-v4wt.o
 obj-$(CONFIG_CPU_CACHE_V4WB)	+= cache-v4wb.o
 obj-$(CONFIG_CPU_CACHE_V6)	+= cache-v6.o
 obj-$(CONFIG_CPU_CACHE_V7)	+= cache-v7.o
+obj-$(CONFIG_CPU_CACHE_FA)	+= cache-fa.o
+obj-$(CONFIG_CPU_CACHE_NOP)	+= cache-nop.o
+
+AFLAGS_cache-v6.o	:=-Wa,-march=armv6
+AFLAGS_cache-v7.o	:=-Wa,-march=armv7-a
 
-obj-$(CONFIG_CPU_COPY_V3)	+= copypage-v3.o
 obj-$(CONFIG_CPU_COPY_V4WT)	+= copypage-v4wt.o
 obj-$(CONFIG_CPU_COPY_V4WB)	+= copypage-v4wb.o
+obj-$(CONFIG_CPU_COPY_FEROCEON)	+= copypage-feroceon.o
 obj-$(CONFIG_CPU_COPY_V6)	+= copypage-v6.o context.o
 obj-$(CONFIG_CPU_SA1100)	+= copypage-v4mc.o
 obj-$(CONFIG_CPU_XSCALE)	+= copypage-xscale.o
 obj-$(CONFIG_CPU_XSC3)		+= copypage-xsc3.o
+obj-$(CONFIG_CPU_COPY_FA)	+= copypage-fa.o
 
-obj-$(CONFIG_CPU_TLB_V3)	+= tlb-v3.o
 obj-$(CONFIG_CPU_TLB_V4WT)	+= tlb-v4.o
 obj-$(CONFIG_CPU_TLB_V4WB)	+= tlb-v4wb.o
 obj-$(CONFIG_CPU_TLB_V4WBI)	+= tlb-v4wbi.o
+obj-$(CONFIG_CPU_TLB_FEROCEON)	+= tlb-v4wbi.o	# reuse v4wbi TLB functions
 obj-$(CONFIG_CPU_TLB_V6)	+= tlb-v6.o
 obj-$(CONFIG_CPU_TLB_V7)	+= tlb-v7.o
+obj-$(CONFIG_CPU_TLB_FA)	+= tlb-fa.o
+
+AFLAGS_tlb-v6.o		:=-Wa,-march=armv6
+AFLAGS_tlb-v7.o		:=-Wa,-march=armv7-a
 
-obj-$(CONFIG_CPU_ARM610)	+= proc-arm6_7.o
-obj-$(CONFIG_CPU_ARM710)	+= proc-arm6_7.o
 obj-$(CONFIG_CPU_ARM7TDMI)	+= proc-arm7tdmi.o
 obj-$(CONFIG_CPU_ARM720T)	+= proc-arm720.o
 obj-$(CONFIG_CPU_ARM740T)	+= proc-arm740.o
@@ -60,6 +76,7 @@ obj-$(CONFIG_CPU_ARM925T)	+= proc-arm925.o
 obj-$(CONFIG_CPU_ARM926T)	+= proc-arm926.o
 obj-$(CONFIG_CPU_ARM940T)	+= proc-arm940.o
 obj-$(CONFIG_CPU_ARM946E)	+= proc-arm946.o
+obj-$(CONFIG_CPU_FA526)		+= proc-fa526.o
 obj-$(CONFIG_CPU_ARM1020)	+= proc-arm1020.o
 obj-$(CONFIG_CPU_ARM1020E)	+= proc-arm1020e.o
 obj-$(CONFIG_CPU_ARM1022)	+= proc-arm1022.o
@@ -68,7 +85,18 @@ obj-$(CONFIG_CPU_SA110)		+= proc-sa110.o
 obj-$(CONFIG_CPU_SA1100)	+= proc-sa1100.o
 obj-$(CONFIG_CPU_XSCALE)	+= proc-xscale.o
 obj-$(CONFIG_CPU_XSC3)		+= proc-xsc3.o
+obj-$(CONFIG_CPU_MOHAWK)	+= proc-mohawk.o
+obj-$(CONFIG_CPU_FEROCEON)	+= proc-feroceon.o
 obj-$(CONFIG_CPU_V6)		+= proc-v6.o
+obj-$(CONFIG_CPU_V6K)		+= proc-v6.o
 obj-$(CONFIG_CPU_V7)		+= proc-v7.o
+obj-$(CONFIG_CPU_V7M)		+= proc-v7m.o
+
+AFLAGS_proc-v6.o	:=-Wa,-march=armv6
+AFLAGS_proc-v7.o	:=-Wa,-march=armv7-a
 
-obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o
+obj-$(CONFIG_OUTER_CACHE)	+= l2c-common.o
+obj-$(CONFIG_CACHE_FEROCEON_L2)	+= cache-feroceon-l2.o
+obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o l2c-l2x0-resume.o
+obj-$(CONFIG_CACHE_XSC3L2)	+= cache-xsc3l2.o
+obj-$(CONFIG_CACHE_TAUROS2)	+= cache-tauros2.o
diff --git a/arch/arm/mm/abort-ev4.S b/arch/arm/mm/abort-ev4.S
index 4f18f9e87ba..54473cd4aba 100644
--- a/arch/arm/mm/abort-ev4.S
+++ b/arch/arm/mm/abort-ev4.S
@@ -3,14 +3,11 @@
 /*
  * Function: v4_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4 - r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -21,10 +18,8 @@
 ENTRY(v4_early_abort)
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
-	ldr	r3, [r2]			@ read aborted ARM instruction
+	ldr	r3, [r4]			@ read aborted ARM instruction
 	bic	r1, r1, #1 << 11 | 1 << 10	@ clear bits 11 and 10 of FSR
 	tst	r3, #1 << 20			@ L = 1 -> write?
 	orreq	r1, r1, #1 << 11		@ yes.
-	mov	pc, lr
-
-
+	b	do_DataAbort
diff --git a/arch/arm/mm/abort-ev4t.S b/arch/arm/mm/abort-ev4t.S
index b6282548f92..9da704e7b86 100644
--- a/arch/arm/mm/abort-ev4t.S
+++ b/arch/arm/mm/abort-ev4t.S
@@ -4,14 +4,11 @@
 /*
  * Function: v4t_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4 - r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -22,9 +19,9 @@
 ENTRY(v4t_early_abort)
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
-	do_thumb_abort
-	ldreq	r3, [r2]			@ read aborted ARM instruction
+	do_thumb_abort fsr=r1, pc=r4, psr=r5, tmp=r3
+	ldreq	r3, [r4]			@ read aborted ARM instruction
 	bic	r1, r1, #1 << 11 | 1 << 10	@ clear bits 11 and 10 of FSR
 	tst	r3, #1 << 20			@ check write
 	orreq	r1, r1, #1 << 11
-	mov	pc, lr
+	b	do_DataAbort
diff --git a/arch/arm/mm/abort-ev5t.S b/arch/arm/mm/abort-ev5t.S
index 02251b526c0..a0908d4653a 100644
--- a/arch/arm/mm/abort-ev5t.S
+++ b/arch/arm/mm/abort-ev5t.S
@@ -4,14 +4,11 @@
 /*
  * Function: v5t_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4 - r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -22,10 +19,10 @@
 ENTRY(v5t_early_abort)
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
-	do_thumb_abort
-	ldreq	r3, [r2]			@ read aborted ARM instruction
+	do_thumb_abort fsr=r1, pc=r4, psr=r5, tmp=r3
+	ldreq	r3, [r4]			@ read aborted ARM instruction
 	bic	r1, r1, #1 << 11		@ clear bits 11 of FSR
-	do_ldrd_abort
+	do_ldrd_abort tmp=ip, insn=r3
 	tst	r3, #1 << 20			@ check write
 	orreq	r1, r1, #1 << 11
-	mov	pc, lr
+	b	do_DataAbort
diff --git a/arch/arm/mm/abort-ev5tj.S b/arch/arm/mm/abort-ev5tj.S
index bce68d601c8..4006b7a6126 100644
--- a/arch/arm/mm/abort-ev5tj.S
+++ b/arch/arm/mm/abort-ev5tj.S
@@ -4,14 +4,11 @@
 /*
  * Function: v5tj_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4 - r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -23,13 +20,11 @@ ENTRY(v5tj_early_abort)
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
 	bic	r1, r1, #1 << 11 | 1 << 10	@ clear bits 11 and 10 of FSR
-	tst	r3, #PSR_J_BIT			@ Java?
-	movne	pc, lr
-	do_thumb_abort
-	ldreq	r3, [r2]			@ read aborted ARM instruction
-	do_ldrd_abort
+	tst	r5, #PSR_J_BIT			@ Java?
+	bne	do_DataAbort
+	do_thumb_abort fsr=r1, pc=r4, psr=r5, tmp=r3
+	ldreq	r3, [r4]			@ read aborted ARM instruction
+	do_ldrd_abort tmp=ip, insn=r3
 	tst	r3, #1 << 20			@ L = 0 -> write
 	orreq	r1, r1, #1 << 11		@ yes.
-	mov	pc, lr
-
-
+	b	do_DataAbort
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 8a7f65ba14b..3815a8262af 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -4,14 +4,11 @@
 /*
  * Function: v6_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4 - r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -20,25 +17,31 @@
  */
 	.align	5
 ENTRY(v6_early_abort)
-#ifdef CONFIG_CPU_32v6K
+#ifdef CONFIG_CPU_V6
+	sub	r1, sp, #4			@ Get unused stack location
+	strex	r0, r1, [r1]			@ Clear the exclusive monitor
+#elif defined(CONFIG_CPU_32v6K)
 	clrex
-#else
-	strex	r0, r1, [sp]			@ Clear the exclusive monitor
 #endif
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
 /*
  * Faulty SWP instruction on 1136 doesn't set bit 11 in DFSR.
- * The test below covers all the write situations, including Java bytecodes
  */
-	bic	r1, r1, #1 << 11 | 1 << 10	@ clear bits 11 and 10 of FSR
-	tst	r3, #PSR_J_BIT			@ Java?
-	movne	pc, lr
-	do_thumb_abort
-	ldreq	r3, [r2]			@ read aborted ARM instruction
-	do_ldrd_abort
+#ifdef CONFIG_ARM_ERRATA_326103
+	ldr	ip, =0x4107b36
+	mrc	p15, 0, r3, c0, c0, 0		@ get processor id
+	teq	ip, r3, lsr #4			@ r0 ARM1136?
+	bne	do_DataAbort
+	tst	r5, #PSR_J_BIT			@ Java?
+	tsteq	r5, #PSR_T_BIT			@ Thumb?
+	bne	do_DataAbort
+	bic	r1, r1, #1 << 11		@ clear bit 11 of FSR
+	ldr	r3, [r4]			@ read aborted ARM instruction
+ ARM_BE8(rev	r3, r3)
+
+	do_ldrd_abort tmp=ip, insn=r3
 	tst	r3, #1 << 20			@ L = 0 -> write
 	orreq	r1, r1, #1 << 11		@ yes.
-	mov	pc, lr
-
-
+#endif
+	b	do_DataAbort
diff --git a/arch/arm/mm/abort-ev7.S b/arch/arm/mm/abort-ev7.S
index eb90bce38e1..703375277ba 100644
--- a/arch/arm/mm/abort-ev7.S
+++ b/arch/arm/mm/abort-ev7.S
@@ -3,14 +3,11 @@
 /*
  * Function: v7_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4 - r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  */
@@ -29,4 +26,26 @@ ENTRY(v7_early_abort)
 	 * V6 code adjusts the returned DFSR.
 	 * New designs should not need to patch up faults.
 	 */
-	mov	pc, lr
+
+#if defined(CONFIG_VERIFY_PERMISSION_FAULT)
+	/*
+	 * Detect erroneous permission failures and fix
+	 */
+	ldr	r3, =0x40d			@ On permission fault
+	and	r3, r1, r3
+	cmp	r3, #0x0d
+	bne	do_DataAbort
+
+	mcr	p15, 0, r0, c7, c8, 0   	@ Retranslate FAR
+	isb
+	mrc	p15, 0, ip, c7, c4, 0   	@ Read the PAR
+	and	r3, ip, #0x7b   		@ On translation fault
+	cmp	r3, #0x0b
+	bne	do_DataAbort
+	bic	r1, r1, #0xf			@ Fix up FSR FS[5:0]
+	and	ip, ip, #0x7e
+	orr	r1, r1, ip, LSR #1
+#endif
+
+	b	do_DataAbort
+ENDPROC(v7_early_abort)
diff --git a/arch/arm/mm/abort-lv4t.S b/arch/arm/mm/abort-lv4t.S
index 9fb7b0e25ea..f3982580c27 100644
--- a/arch/arm/mm/abort-lv4t.S
+++ b/arch/arm/mm/abort-lv4t.S
@@ -3,14 +3,11 @@
 /*
  * Function: v4t_late_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = address of abort
- *	   : r1 = FSR, bit 11 = write
- *	   : r2-r8 = corrupted
- *	   : r9 = preserved
- *	   : sp = pointer to registers
+ * Returns : r4-r5, r10-r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -18,7 +15,7 @@
  * picture.  Unfortunately, this does happen.  We live with it.
  */
 ENTRY(v4t_late_abort)
-	tst	r3, #PSR_T_BIT			@ check for thumb mode
+	tst	r5, #PSR_T_BIT			@ check for thumb mode
 #ifdef CONFIG_CPU_CP15_MMU
 	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
 	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
@@ -28,7 +25,7 @@ ENTRY(v4t_late_abort)
 	mov	r1, #0
 #endif
 	bne	.data_thumb_abort
-	ldr	r8, [r2]			@ read arm instruction
+	ldr	r8, [r4]			@ read arm instruction
 	tst	r8, #1 << 20			@ L = 1 -> write?
 	orreq	r1, r1, #1 << 11		@ yes.
 	and	r7, r8, #15 << 24
@@ -47,86 +44,84 @@ ENTRY(v4t_late_abort)
 /* 9 */	b	.data_arm_ldmstm		@ ldm*b	rn, <rlist>
 /* a */	b	.data_unknown
 /* b */	b	.data_unknown
-/* c */	mov	pc, lr				@ ldc	rd, [rn], #m	@ Same as ldr	rd, [rn], #m
-/* d */	mov	pc, lr				@ ldc	rd, [rn, #m]
+/* c */	b	do_DataAbort			@ ldc	rd, [rn], #m	@ Same as ldr	rd, [rn], #m
+/* d */	b	do_DataAbort			@ ldc	rd, [rn, #m]
 /* e */	b	.data_unknown
 /* f */
 .data_unknown:	@ Part of jumptable
-	mov	r0, r2
+	mov	r0, r4
 	mov	r1, r8
-	mov	r2, sp
-	bl	baddataabort
-	b	ret_from_exception
+	b	baddataabort
 
 .data_arm_ldmstm:
 	tst	r8, #1 << 21			@ check writeback bit
-	moveq	pc, lr				@ no writeback -> no fixup
+	beq	do_DataAbort			@ no writeback -> no fixup
 	mov	r7, #0x11
 	orr	r7, r7, #0x1100
 	and	r6, r8, r7
-	and	r2, r8, r7, lsl #1
-	add	r6, r6, r2, lsr #1
-	and	r2, r8, r7, lsl #2
-	add	r6, r6, r2, lsr #2
-	and	r2, r8, r7, lsl #3
-	add	r6, r6, r2, lsr #3
+	and	r9, r8, r7, lsl #1
+	add	r6, r6, r9, lsr #1
+	and	r9, r8, r7, lsl #2
+	add	r6, r6, r9, lsr #2
+	and	r9, r8, r7, lsl #3
+	add	r6, r6, r9, lsr #3
 	add	r6, r6, r6, lsr #8
 	add	r6, r6, r6, lsr #4
 	and	r6, r6, #15			@ r6 = no. of registers to transfer.
-	and	r5, r8, #15 << 16		@ Extract 'n' from instruction
-	ldr	r7, [sp, r5, lsr #14]		@ Get register 'Rn'
+	and	r9, r8, #15 << 16		@ Extract 'n' from instruction
+	ldr	r7, [r2, r9, lsr #14]		@ Get register 'Rn'
 	tst	r8, #1 << 23			@ Check U bit
 	subne	r7, r7, r6, lsl #2		@ Undo increment
 	addeq	r7, r7, r6, lsl #2		@ Undo decrement
-	str	r7, [sp, r5, lsr #14]		@ Put register 'Rn'
-	mov	pc, lr
+	str	r7, [r2, r9, lsr #14]		@ Put register 'Rn'
+	b	do_DataAbort
 
 .data_arm_lateldrhpre:
 	tst	r8, #1 << 21			@ Check writeback bit
-	moveq	pc, lr				@ No writeback -> no fixup
+	beq	do_DataAbort			@ No writeback -> no fixup
 .data_arm_lateldrhpost:
-	and	r5, r8, #0x00f			@ get Rm / low nibble of immediate value
+	and	r9, r8, #0x00f			@ get Rm / low nibble of immediate value
 	tst	r8, #1 << 22			@ if (immediate offset)
 	andne	r6, r8, #0xf00			@ { immediate high nibble
-	orrne	r6, r5, r6, lsr #4		@   combine nibbles } else
-	ldreq	r6, [sp, r5, lsl #2]		@ { load Rm value }
+	orrne	r6, r9, r6, lsr #4		@   combine nibbles } else
+	ldreq	r6, [r2, r9, lsl #2]		@ { load Rm value }
 .data_arm_apply_r6_and_rn:
-	and	r5, r8, #15 << 16		@ Extract 'n' from instruction
-	ldr	r7, [sp, r5, lsr #14]		@ Get register 'Rn'
+	and	r9, r8, #15 << 16		@ Extract 'n' from instruction
+	ldr	r7, [r2, r9, lsr #14]		@ Get register 'Rn'
 	tst	r8, #1 << 23			@ Check U bit
 	subne	r7, r7, r6			@ Undo incrmenet
 	addeq	r7, r7, r6			@ Undo decrement
-	str	r7, [sp, r5, lsr #14]		@ Put register 'Rn'
-	mov	pc, lr
+	str	r7, [r2, r9, lsr #14]		@ Put register 'Rn'
+	b	do_DataAbort
 
 .data_arm_lateldrpreconst:
 	tst	r8, #1 << 21			@ check writeback bit
-	moveq	pc, lr				@ no writeback -> no fixup
+	beq	do_DataAbort			@ no writeback -> no fixup
 .data_arm_lateldrpostconst:
-	movs	r2, r8, lsl #20			@ Get offset
-	moveq	pc, lr				@ zero -> no fixup
-	and	r5, r8, #15 << 16		@ Extract 'n' from instruction
-	ldr	r7, [sp, r5, lsr #14]		@ Get register 'Rn'
+	movs	r6, r8, lsl #20			@ Get offset
+	beq	do_DataAbort			@ zero -> no fixup
+	and	r9, r8, #15 << 16		@ Extract 'n' from instruction
+	ldr	r7, [r2, r9, lsr #14]		@ Get register 'Rn'
 	tst	r8, #1 << 23			@ Check U bit
-	subne	r7, r7, r2, lsr #20		@ Undo increment
-	addeq	r7, r7, r2, lsr #20		@ Undo decrement
-	str	r7, [sp, r5, lsr #14]		@ Put register 'Rn'
-	mov	pc, lr
+	subne	r7, r7, r6, lsr #20		@ Undo increment
+	addeq	r7, r7, r6, lsr #20		@ Undo decrement
+	str	r7, [r2, r9, lsr #14]		@ Put register 'Rn'
+	b	do_DataAbort
 
 .data_arm_lateldrprereg:
 	tst	r8, #1 << 21			@ check writeback bit
-	moveq	pc, lr				@ no writeback -> no fixup
+	beq	do_DataAbort			@ no writeback -> no fixup
 .data_arm_lateldrpostreg:
 	and	r7, r8, #15			@ Extract 'm' from instruction
-	ldr	r6, [sp, r7, lsl #2]		@ Get register 'Rm'
-	mov	r5, r8, lsr #7			@ get shift count
-	ands	r5, r5, #31
+	ldr	r6, [r2, r7, lsl #2]		@ Get register 'Rm'
+	mov	r9, r8, lsr #7			@ get shift count
+	ands	r9, r9, #31
 	and	r7, r8, #0x70			@ get shift type
 	orreq	r7, r7, #8			@ shift count = 0
 	add	pc, pc, r7
 	nop
 
-	mov	r6, r6, lsl r5			@ 0: LSL #!0
+	mov	r6, r6, lsl r9			@ 0: LSL #!0
 	b	.data_arm_apply_r6_and_rn
 	b	.data_arm_apply_r6_and_rn	@ 1: LSL #0
 	nop
@@ -134,7 +129,7 @@ ENTRY(v4t_late_abort)
 	nop
 	b	.data_unknown			@ 3: MUL?
 	nop
-	mov	r6, r6, lsr r5			@ 4: LSR #!0
+	mov	r6, r6, lsr r9			@ 4: LSR #!0
 	b	.data_arm_apply_r6_and_rn
 	mov	r6, r6, lsr #32			@ 5: LSR #32
 	b	.data_arm_apply_r6_and_rn
@@ -142,7 +137,7 @@ ENTRY(v4t_late_abort)
 	nop
 	b	.data_unknown			@ 7: MUL?
 	nop
-	mov	r6, r6, asr r5			@ 8: ASR #!0
+	mov	r6, r6, asr r9			@ 8: ASR #!0
 	b	.data_arm_apply_r6_and_rn
 	mov	r6, r6, asr #32			@ 9: ASR #32
 	b	.data_arm_apply_r6_and_rn
@@ -150,7 +145,7 @@ ENTRY(v4t_late_abort)
 	nop
 	b	.data_unknown			@ B: MUL?
 	nop
-	mov	r6, r6, ror r5			@ C: ROR #!0
+	mov	r6, r6, ror r9			@ C: ROR #!0
 	b	.data_arm_apply_r6_and_rn
 	mov	r6, r6, rrx			@ D: RRX
 	b	.data_arm_apply_r6_and_rn
@@ -159,7 +154,7 @@ ENTRY(v4t_late_abort)
 	b	.data_unknown			@ F: MUL?
 
 .data_thumb_abort:
-	ldrh	r8, [r2]			@ read instruction
+	ldrh	r8, [r4]			@ read instruction
 	tst	r8, #1 << 11			@ L = 1 -> write?
 	orreq	r1, r1, #1 << 8			@ yes
 	and	r7, r8, #15 << 12
@@ -172,10 +167,10 @@ ENTRY(v4t_late_abort)
 /* 3 */	b	.data_unknown
 /* 4 */	b	.data_unknown
 /* 5 */	b	.data_thumb_reg
-/* 6 */	mov	pc, lr
-/* 7 */	mov	pc, lr
-/* 8 */	mov	pc, lr
-/* 9 */	mov	pc, lr
+/* 6 */	b	do_DataAbort
+/* 7 */	b	do_DataAbort
+/* 8 */	b	do_DataAbort
+/* 9 */	b	do_DataAbort
 /* A */	b	.data_unknown
 /* B */	b	.data_thumb_pushpop
 /* C */	b	.data_thumb_ldmstm
@@ -185,41 +180,41 @@ ENTRY(v4t_late_abort)
 
 .data_thumb_reg:
 	tst	r8, #1 << 9
-	moveq	pc, lr
+	beq	do_DataAbort
 	tst	r8, #1 << 10			@ If 'S' (signed) bit is set
 	movne	r1, #0				@ it must be a load instr
-	mov	pc, lr
+	b	do_DataAbort
 
 .data_thumb_pushpop:
 	tst	r8, #1 << 10
 	beq	.data_unknown
 	and	r6, r8, #0x55			@ hweight8(r8) + R bit
-	and	r2, r8, #0xaa
-	add	r6, r6, r2, lsr #1
-	and	r2, r6, #0xcc
+	and	r9, r8, #0xaa
+	add	r6, r6, r9, lsr #1
+	and	r9, r6, #0xcc
 	and	r6, r6, #0x33
-	add	r6, r6, r2, lsr #2
+	add	r6, r6, r9, lsr #2
 	movs	r7, r8, lsr #9			@ C = r8 bit 8 (R bit)
 	adc	r6, r6, r6, lsr #4		@ high + low nibble + R bit
 	and	r6, r6, #15			@ number of regs to transfer
-	ldr	r7, [sp, #13 << 2]
+	ldr	r7, [r2, #13 << 2]
 	tst	r8, #1 << 11
 	addeq	r7, r7, r6, lsl #2		@ increment SP if PUSH
 	subne	r7, r7, r6, lsl #2		@ decrement SP if POP
-	str	r7, [sp, #13 << 2]
-	mov	pc, lr
+	str	r7, [r2, #13 << 2]
+	b	do_DataAbort
 
 .data_thumb_ldmstm:
 	and	r6, r8, #0x55			@ hweight8(r8)
-	and	r2, r8, #0xaa
-	add	r6, r6, r2, lsr #1
-	and	r2, r6, #0xcc
+	and	r9, r8, #0xaa
+	add	r6, r6, r9, lsr #1
+	and	r9, r6, #0xcc
 	and	r6, r6, #0x33
-	add	r6, r6, r2, lsr #2
+	add	r6, r6, r9, lsr #2
 	add	r6, r6, r6, lsr #4
-	and	r5, r8, #7 << 8
-	ldr	r7, [sp, r5, lsr #6]
+	and	r9, r8, #7 << 8
+	ldr	r7, [r2, r9, lsr #6]
 	and	r6, r6, #15			@ number of regs to transfer
 	sub	r7, r7, r6, lsl #2		@ always decrement
-	str	r7, [sp, r5, lsr #6]
-	mov	pc, lr
+	str	r7, [r2, r9, lsr #6]
+	b	do_DataAbort
diff --git a/arch/arm/mm/abort-macro.S b/arch/arm/mm/abort-macro.S
index d7cb1bfa51a..2cbf68ef0e8 100644
--- a/arch/arm/mm/abort-macro.S
+++ b/arch/arm/mm/abort-macro.S
@@ -9,34 +9,32 @@
  *
  */
 
-	.macro	do_thumb_abort
-	tst	r3, #PSR_T_BIT
+	.macro	do_thumb_abort, fsr, pc, psr, tmp
+	tst	\psr, #PSR_T_BIT
 	beq	not_thumb
-	ldrh	r3, [r2]			@ Read aborted Thumb instruction
-	and	r3, r3, # 0xfe00		@ Mask opcode field
-	cmp	r3, # 0x5600			@ Is it ldrsb?
-	orreq	r3, r3, #1 << 11		@ Set L-bit if yes
-	tst	r3, #1 << 11			@ L = 0 -> write
-	orreq	r1, r1, #1 << 11		@ yes.
-	mov	pc, lr
+	ldrh	\tmp, [\pc]			@ Read aborted Thumb instruction
+	and	\tmp, \tmp, # 0xfe00		@ Mask opcode field
+	cmp	\tmp, # 0x5600			@ Is it ldrsb?
+	orreq	\tmp, \tmp, #1 << 11		@ Set L-bit if yes
+	tst	\tmp, #1 << 11			@ L = 0 -> write
+	orreq	\fsr, \fsr, #1 << 11		@ yes.
+	b	do_DataAbort
 not_thumb:
 	.endm
 
 /*
- * We check for the following insturction encoding for LDRD.
+ * We check for the following instruction encoding for LDRD.
  *
- * [27:25] == 0
+ * [27:25] == 000
  *   [7:4] == 1101
  *    [20] == 0
  */
- 	.macro	do_ldrd_abort
- 	tst	r3, #0x0e000000			@ [27:25] == 0
+	.macro	do_ldrd_abort, tmp, insn
+	tst	\insn, #0x0e100000		@ [27:25,20] == 0
 	bne	not_ldrd
-	and	r2, r3, #0x000000f0		@ [7:4] == 1101
-	cmp	r2, #0x000000d0
-	bne	not_ldrd
-	tst	r3, #1 << 20			@ [20] == 0
-	moveq	pc, lr
+	and	\tmp, \insn, #0x000000f0	@ [7:4] == 1101
+	cmp	\tmp, #0x000000d0
+	beq	do_DataAbort
 not_ldrd:
 	.endm
 
diff --git a/arch/arm/mm/abort-nommu.S b/arch/arm/mm/abort-nommu.S
index a7cc7f9ee45..119cb479c2a 100644
--- a/arch/arm/mm/abort-nommu.S
+++ b/arch/arm/mm/abort-nommu.S
@@ -3,11 +3,11 @@
 /*
  * Function: nommu_early_abort
  *
- * Params  : r2 = address of aborted instruction
- *         : r3 = saved SPSR
+ * Params  : r2 = pt_regs
+ *	   : r4 = aborted context pc
+ *	   : r5 = aborted context psr
  *
- * Returns : r0 = 0 (abort address)
- *	   : r1 = 0 (FSR)
+ * Returns : r4 - r11, r13 preserved
  *
  * Note: There is no FSR/FAR on !CPU_CP15_MMU cores.
  *       Just fill zero into the registers.
@@ -16,4 +16,5 @@
 ENTRY(nommu_early_abort)
 	mov	r0, #0				@ clear r0, r1 (no FSR/FAR)
 	mov	r1, #0
-	mov	pc, lr
+	b	do_DataAbort
+ENDPROC(nommu_early_abort)
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 074b7cb0774..b8cb1a2688a 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -11,17 +11,24 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/moduleparam.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
+#include <asm/cp15.h>
+#include <asm/system_info.h>
 #include <asm/unaligned.h>
+#include <asm/opcodes.h>
 
 #include "fault.h"
+#include "mm.h"
 
 /*
  * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998
@@ -61,6 +68,12 @@
 #define SHIFT_ASR	0x40
 #define SHIFT_RORRRX	0x60
 
+#define BAD_INSTR 	0xdeadc0de
+
+/* Thumb-2 32 bit format per ARMv7 DDI0406A A6.3, either e800h,f000h,f800h */
+#define IS_T32(hi16) \
+	(((hi16) & 0xe000) == 0xe000 && ((hi16) & 0x1800))
+
 static unsigned long ai_user;
 static unsigned long ai_sys;
 static unsigned long ai_skipped;
@@ -69,6 +82,40 @@ static unsigned long ai_word;
 static unsigned long ai_dword;
 static unsigned long ai_multi;
 static int ai_usermode;
+static unsigned long cr_no_alignment;
+
+core_param(alignment, ai_usermode, int, 0600);
+
+#define UM_WARN		(1 << 0)
+#define UM_FIXUP	(1 << 1)
+#define UM_SIGNAL	(1 << 2)
+
+/* Return true if and only if the ARMv6 unaligned access model is in use. */
+static bool cpu_is_v6_unaligned(void)
+{
+	return cpu_architecture() >= CPU_ARCH_ARMv6 && get_cr() & CR_U;
+}
+
+static int safe_usermode(int new_usermode, bool warn)
+{
+	/*
+	 * ARMv6 and later CPUs can perform unaligned accesses for
+	 * most single load and store instructions up to word size.
+	 * LDM, STM, LDRD and STRD still need to be handled.
+	 *
+	 * Ignoring the alignment fault is not an option on these
+	 * CPUs since we spin re-faulting the instruction without
+	 * making any progress.
+	 */
+	if (cpu_is_v6_unaligned() && !(new_usermode & (UM_FIXUP | UM_SIGNAL))) {
+		new_usermode |= UM_FIXUP;
+
+		if (warn)
+			printk(KERN_WARNING "alignment: ignoring faults is unsafe on this CPU.  Defaulting to fixup mode.\n");
+	}
+
+	return new_usermode;
+}
 
 #ifdef CONFIG_PROC_FS
 static const char *usermode_action[] = {
@@ -80,36 +127,29 @@ static const char *usermode_action[] = {
 	"signal+warn"
 };
 
-static int
-proc_alignment_read(char *page, char **start, off_t off, int count, int *eof,
-		    void *data)
+static int alignment_proc_show(struct seq_file *m, void *v)
 {
-	char *p = page;
-	int len;
-
-	p += sprintf(p, "User:\t\t%lu\n", ai_user);
-	p += sprintf(p, "System:\t\t%lu\n", ai_sys);
-	p += sprintf(p, "Skipped:\t%lu\n", ai_skipped);
-	p += sprintf(p, "Half:\t\t%lu\n", ai_half);
-	p += sprintf(p, "Word:\t\t%lu\n", ai_word);
+	seq_printf(m, "User:\t\t%lu\n", ai_user);
+	seq_printf(m, "System:\t\t%lu\n", ai_sys);
+	seq_printf(m, "Skipped:\t%lu\n", ai_skipped);
+	seq_printf(m, "Half:\t\t%lu\n", ai_half);
+	seq_printf(m, "Word:\t\t%lu\n", ai_word);
 	if (cpu_architecture() >= CPU_ARCH_ARMv5TE)
-		p += sprintf(p, "DWord:\t\t%lu\n", ai_dword);
-	p += sprintf(p, "Multi:\t\t%lu\n", ai_multi);
-	p += sprintf(p, "User faults:\t%i (%s)\n", ai_usermode,
+		seq_printf(m, "DWord:\t\t%lu\n", ai_dword);
+	seq_printf(m, "Multi:\t\t%lu\n", ai_multi);
+	seq_printf(m, "User faults:\t%i (%s)\n", ai_usermode,
 			usermode_action[ai_usermode]);
 
-	len = (p - page) - off;
-	if (len < 0)
-		len = 0;
-
-	*eof = (len <= count) ? 1 : 0;
-	*start = page + off;
+	return 0;
+}
 
-	return len;
+static int alignment_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, alignment_proc_show, NULL);
 }
 
-static int proc_alignment_write(struct file *file, const char __user *buffer,
-				unsigned long count, void *data)
+static ssize_t alignment_proc_write(struct file *file, const char __user *buffer,
+				    size_t count, loff_t *pos)
 {
 	char mode;
 
@@ -117,11 +157,18 @@ static int proc_alignment_write(struct file *file, const char __user *buffer,
 		if (get_user(mode, buffer))
 			return -EFAULT;
 		if (mode >= '0' && mode <= '5')
-			ai_usermode = mode - '0';
+			ai_usermode = safe_usermode(mode - '0', true);
 	}
 	return count;
 }
 
+static const struct file_operations alignment_proc_fops = {
+	.open		= alignment_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= alignment_proc_write,
+};
 #endif /* CONFIG_PROC_FS */
 
 union offset_union {
@@ -148,17 +195,19 @@ union offset_union {
 
 #define __get8_unaligned_check(ins,val,addr,err)	\
 	__asm__(					\
-	"1:	"ins"	%1, [%2], #1\n"			\
+ ARM(	"1:	"ins"	%1, [%2], #1\n"	)		\
+ THUMB(	"1:	"ins"	%1, [%2]\n"	)		\
+ THUMB(	"	add	%2, %2, #1\n"	)		\
 	"2:\n"						\
-	"	.section .fixup,\"ax\"\n"		\
+	"	.pushsection .fixup,\"ax\"\n"		\
 	"	.align	2\n"				\
 	"3:	mov	%0, #1\n"			\
 	"	b	2b\n"				\
-	"	.previous\n"				\
-	"	.section __ex_table,\"a\"\n"		\
+	"	.popsection\n"				\
+	"	.pushsection __ex_table,\"a\"\n"	\
 	"	.align	3\n"				\
 	"	.long	1b, 3b\n"			\
-	"	.previous\n"				\
+	"	.popsection\n"				\
 	: "=r" (err), "=&r" (val), "=r" (addr)		\
 	: "0" (err), "2" (addr))
 
@@ -204,20 +253,22 @@ union offset_union {
 	do {							\
 		unsigned int err = 0, v = val, a = addr;	\
 		__asm__( FIRST_BYTE_16				\
-		"1:	"ins"	%1, [%2], #1\n"			\
+	 ARM(	"1:	"ins"	%1, [%2], #1\n"	)		\
+	 THUMB(	"1:	"ins"	%1, [%2]\n"	)		\
+	 THUMB(	"	add	%2, %2, #1\n"	)		\
 		"	mov	%1, %1, "NEXT_BYTE"\n"		\
 		"2:	"ins"	%1, [%2]\n"			\
 		"3:\n"						\
-		"	.section .fixup,\"ax\"\n"		\
+		"	.pushsection .fixup,\"ax\"\n"		\
 		"	.align	2\n"				\
 		"4:	mov	%0, #1\n"			\
 		"	b	3b\n"				\
-		"	.previous\n"				\
-		"	.section __ex_table,\"a\"\n"		\
+		"	.popsection\n"				\
+		"	.pushsection __ex_table,\"a\"\n"	\
 		"	.align	3\n"				\
 		"	.long	1b, 4b\n"			\
 		"	.long	2b, 4b\n"			\
-		"	.previous\n"				\
+		"	.popsection\n"				\
 		: "=r" (err), "=&r" (v), "=&r" (a)		\
 		: "0" (err), "1" (v), "2" (a));			\
 		if (err)					\
@@ -234,26 +285,32 @@ union offset_union {
 	do {							\
 		unsigned int err = 0, v = val, a = addr;	\
 		__asm__( FIRST_BYTE_32				\
-		"1:	"ins"	%1, [%2], #1\n"			\
+	 ARM(	"1:	"ins"	%1, [%2], #1\n"	)		\
+	 THUMB(	"1:	"ins"	%1, [%2]\n"	)		\
+	 THUMB(	"	add	%2, %2, #1\n"	)		\
 		"	mov	%1, %1, "NEXT_BYTE"\n"		\
-		"2:	"ins"	%1, [%2], #1\n"			\
+	 ARM(	"2:	"ins"	%1, [%2], #1\n"	)		\
+	 THUMB(	"2:	"ins"	%1, [%2]\n"	)		\
+	 THUMB(	"	add	%2, %2, #1\n"	)		\
 		"	mov	%1, %1, "NEXT_BYTE"\n"		\
-		"3:	"ins"	%1, [%2], #1\n"			\
+	 ARM(	"3:	"ins"	%1, [%2], #1\n"	)		\
+	 THUMB(	"3:	"ins"	%1, [%2]\n"	)		\
+	 THUMB(	"	add	%2, %2, #1\n"	)		\
 		"	mov	%1, %1, "NEXT_BYTE"\n"		\
 		"4:	"ins"	%1, [%2]\n"			\
 		"5:\n"						\
-		"	.section .fixup,\"ax\"\n"		\
+		"	.pushsection .fixup,\"ax\"\n"		\
 		"	.align	2\n"				\
 		"6:	mov	%0, #1\n"			\
 		"	b	5b\n"				\
-		"	.previous\n"				\
-		"	.section __ex_table,\"a\"\n"		\
+		"	.popsection\n"				\
+		"	.pushsection __ex_table,\"a\"\n"	\
 		"	.align	3\n"				\
 		"	.long	1b, 6b\n"			\
 		"	.long	2b, 6b\n"			\
 		"	.long	3b, 6b\n"			\
 		"	.long	4b, 6b\n"			\
-		"	.previous\n"				\
+		"	.popsection\n"				\
 		: "=r" (err), "=&r" (v), "=&r" (a)		\
 		: "0" (err), "1" (v), "2" (a));			\
 		if (err)					\
@@ -327,38 +384,48 @@ do_alignment_ldrdstrd(unsigned long addr, unsigned long instr,
 		      struct pt_regs *regs)
 {
 	unsigned int rd = RD_BITS(instr);
-
-	if (((rd & 1) == 1) || (rd == 14))
+	unsigned int rd2;
+	int load;
+
+	if ((instr & 0xfe000000) == 0xe8000000) {
+		/* ARMv7 Thumb-2 32-bit LDRD/STRD */
+		rd2 = (instr >> 8) & 0xf;
+		load = !!(LDST_L_BIT(instr));
+	} else if (((rd & 1) == 1) || (rd == 14))
 		goto bad;
+	else {
+		load = ((instr & 0xf0) == 0xd0);
+		rd2 = rd + 1;
+	}
 
 	ai_dword += 1;
 
 	if (user_mode(regs))
 		goto user;
 
-	if ((instr & 0xf0) == 0xd0) {
+	if (load) {
 		unsigned long val;
 		get32_unaligned_check(val, addr);
 		regs->uregs[rd] = val;
 		get32_unaligned_check(val, addr + 4);
-		regs->uregs[rd + 1] = val;
+		regs->uregs[rd2] = val;
 	} else {
 		put32_unaligned_check(regs->uregs[rd], addr);
-		put32_unaligned_check(regs->uregs[rd + 1], addr + 4);
+		put32_unaligned_check(regs->uregs[rd2], addr + 4);
 	}
 
 	return TYPE_LDST;
 
  user:
-	if ((instr & 0xf0) == 0xd0) {
+	if (load) {
 		unsigned long val;
 		get32t_unaligned_check(val, addr);
 		regs->uregs[rd] = val;
 		get32t_unaligned_check(val, addr + 4);
-		regs->uregs[rd + 1] = val;
+		regs->uregs[rd2] = val;
 	} else {
 		put32t_unaligned_check(regs->uregs[rd], addr);
-		put32t_unaligned_check(regs->uregs[rd + 1], addr + 4);
+		put32t_unaligned_check(regs->uregs[rd2], addr + 4);
 	}
 
 	return TYPE_LDST;
@@ -611,32 +678,112 @@ thumb2arm(u16 tinstr)
 		/* Else fall through for illegal instruction case */
 
 	default:
-		return 0xdeadc0de;
+		return BAD_INSTR;
 	}
 }
 
+/*
+ * Convert Thumb-2 32 bit LDM, STM, LDRD, STRD to an equivalent instruction
+ * that the ARM alignment handlers can process, and find the matching handler,
+ * so that we can reuse ARM userland alignment fault fixups for Thumb.
+ * Returns that handler, or NULL when the instruction is not handled here.
+ * @pinstr: original Thumb-2 instruction; rewritten only for PUSH/POP T3
+ * @regs: register context (not referenced by this function)
+ * @poffset: ->un is set for the first four LDRD/STRD cases, for later writeback
+ *
+ * NOTES:
+ * 1. Comments below refer to ARMv7 DDI0406A Thumb Instruction sections.
+ * 2. Register name Rt from ARMv7 is same as Rd from ARMv6 (Rd is Rt)
+ */
+static void *
+do_alignment_t32_to_handler(unsigned long *pinstr, struct pt_regs *regs,
+			    union offset_union *poffset)
+{
+	unsigned long instr = *pinstr;
+	u16 tinst1 = (instr >> 16) & 0xffff;
+	u16 tinst2 = instr & 0xffff;
+
+	switch (tinst1 & 0xffe0) {
+	/* A6.3.5 Load/Store multiple */
+	case 0xe880:		/* STM/STMIA/STMEA,LDM/LDMIA, PUSH/POP T2 */
+	case 0xe8a0:		/* ...above writeback version */
+	case 0xe900:		/* STMDB/STMFD, LDMDB/LDMEA */
+	case 0xe920:		/* ...above writeback version */
+		/* no need offset decision since handler calculates it */
+		return do_alignment_ldmstm;
+
+	case 0xf840:		/* POP/PUSH T3 (single register) */
+		if (RN_BITS(instr) == 13 && (tinst2 & 0x09ff) == 0x0904) {
+			u32 L = !!(LDST_L_BIT(instr));
+			const u32 subset[2] = {
+				0xe92d0000,	/* STMDB sp!,{registers} */
+				0xe8bd0000,	/* LDMIA sp!,{registers} */
+			};
+			*pinstr = subset[L] | (1<<RD_BITS(instr));
+			return do_alignment_ldmstm;
+		}
+		/* Not a PUSH/POP T3 encoding: leave the switch, NULL is returned */
+		break;
+
+	/* A6.3.6 Load/store double, STRD/LDRD(immed, lit, reg) */
+	case 0xe860:
+	case 0xe960:
+	case 0xe8e0:
+	case 0xe9e0:
+		poffset->un = (tinst2 & 0xff) << 2;	/* fall through */
+	case 0xe940:
+	case 0xe9c0:
+		return do_alignment_ldrdstrd;
+
+	/*
+	 * No need to handle load/store instructions up to word size
+	 * since ARMv6 and later CPUs can perform unaligned accesses.
+	 */
+	default:
+		break;
+	}
+	return NULL;
+}
+
 static int
 do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
-	union offset_union offset;
+	union offset_union uninitialized_var(offset);
 	unsigned long instr = 0, instrptr;
 	int (*handler)(unsigned long addr, unsigned long instr, struct pt_regs *regs);
 	unsigned int type;
-	mm_segment_t fs;
 	unsigned int fault;
 	u16 tinstr = 0;
+	int isize = 4;
+	int thumb2_32b = 0;
+
+	if (interrupts_enabled(regs))
+		local_irq_enable();
 
 	instrptr = instruction_pointer(regs);
 
-	fs = get_fs();
-	set_fs(KERNEL_DS);
 	if (thumb_mode(regs)) {
-		fault = __get_user(tinstr, (u16 *)(instrptr & ~1));
-		if (!(fault))
-			instr = thumb2arm(tinstr);
-	} else
-		fault = __get_user(instr, (u32 *)instrptr);
-	set_fs(fs);
+		u16 *ptr = (u16 *)(instrptr & ~1);
+		fault = probe_kernel_address(ptr, tinstr);
+		tinstr = __mem_to_opcode_thumb16(tinstr);
+		if (!fault) {
+			if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
+			    IS_T32(tinstr)) {
+				/* Thumb-2 32-bit */
+				u16 tinst2 = 0;
+				fault = probe_kernel_address(ptr + 1, tinst2);
+				tinst2 = __mem_to_opcode_thumb16(tinst2);
+				instr = __opcode_thumb32_compose(tinstr, tinst2);
+				thumb2_32b = 1;
+			} else {
+				isize = 2;
+				instr = thumb2arm(tinstr);
+			}
+		}
+	} else {
+		fault = probe_kernel_address(instrptr, instr);
+		instr = __mem_to_opcode_arm(instr);
+	}
 
 	if (fault) {
 		type = TYPE_FAULT;
@@ -650,7 +797,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 
  fixup:
 
-	regs->ARM_pc += thumb_mode(regs) ? 2 : 4;
+	regs->ARM_pc += isize;
 
 	switch (CODING_BITS(instr)) {
 	case 0x00000000:	/* 3.13.4 load/store instruction extensions */
@@ -709,18 +856,28 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		handler = do_alignment_ldrstr;
 		break;
 
-	case 0x08000000:	/* ldm or stm */
-		handler = do_alignment_ldmstm;
+	case 0x08000000:	/* ldm or stm, or thumb-2 32bit instruction */
+		if (thumb2_32b) {
+			offset.un = 0;
+			handler = do_alignment_t32_to_handler(&instr, regs, &offset);
+		} else {
+			offset.un = 0;
+			handler = do_alignment_ldmstm;
+		}
 		break;
 
 	default:
 		goto bad;
 	}
 
+	if (!handler)
+		goto bad;
 	type = handler(addr, instr, regs);
 
-	if (type == TYPE_ERROR || type == TYPE_FAULT)
+	if (type == TYPE_ERROR || type == TYPE_FAULT) {
+		regs->ARM_pc -= isize;
 		goto bad_or_fault;
+	}
 
 	if (type == TYPE_LDST)
 		do_alignment_finish_ldst(addr, instr, regs, offset);
@@ -730,7 +887,6 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
  bad_or_fault:
 	if (type == TYPE_ERROR)
 		goto bad;
-	regs->ARM_pc -= thumb_mode(regs) ? 2 : 4;
 	/*
 	 * We got a fault - fix it up, or die.
 	 */
@@ -746,33 +902,62 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 */
 	printk(KERN_ERR "Alignment trap: not handling instruction "
 		"%0*lx at [<%08lx>]\n",
-		thumb_mode(regs) ? 4 : 8,
-		thumb_mode(regs) ? tinstr : instr, instrptr);
+		isize << 1,
+		isize == 2 ? tinstr : instr, instrptr);
 	ai_skipped += 1;
 	return 1;
 
  user:
 	ai_user += 1;
 
-	if (ai_usermode & 1)
+	if (ai_usermode & UM_WARN)
 		printk("Alignment trap: %s (%d) PC=0x%08lx Instr=0x%0*lx "
 		       "Address=0x%08lx FSR 0x%03x\n", current->comm,
-			current->pid, instrptr,
-		        thumb_mode(regs) ? 4 : 8,
-		        thumb_mode(regs) ? tinstr : instr,
+			task_pid_nr(current), instrptr,
+			isize << 1,
+			isize == 2 ? tinstr : instr,
 		        addr, fsr);
 
-	if (ai_usermode & 2)
+	if (ai_usermode & UM_FIXUP)
 		goto fixup;
 
-	if (ai_usermode & 4)
-		force_sig(SIGBUS, current);
-	else
-		set_cr(cr_no_alignment);
+	if (ai_usermode & UM_SIGNAL) {
+		siginfo_t si;
+
+		si.si_signo = SIGBUS;
+		si.si_errno = 0;
+		si.si_code = BUS_ADRALN;
+		si.si_addr = (void __user *)addr;
+
+		force_sig_info(si.si_signo, &si, current);
+	} else {
+		/*
+		 * We're about to disable the alignment trap and return to
+		 * user space.  But if an interrupt occurs before actually
+		 * reaching user space, then the IRQ vector entry code will
+		 * notice that we were still in kernel space and therefore
+		 * the alignment trap won't be re-enabled in that case as it
+		 * is presumed to be always on from kernel space.
+		 * Let's prevent that race by disabling interrupts here (they
+		 * are disabled on the way back to user space anyway in
+		 * entry-common.S) and disable the alignment trap only if
+		 * there is no work pending for this thread.
+		 */
+		raw_local_irq_disable();
+		if (!(current_thread_info()->flags & _TIF_WORK_MASK))
+			set_cr(cr_no_alignment);
+	}
 
 	return 0;
 }
 
+static int __init noalign_setup(char *__unused)
+{
+	set_cr(__clear_cr(CR_A));
+	return 1;
+}
+__setup("noalign", noalign_setup);
+
 /*
  * This needs to be done after sysctl_init, otherwise sys/ will be
  * overwritten.  Actually, this shouldn't be in sys/ at all since
@@ -784,20 +969,33 @@ static int __init alignment_init(void)
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *res;
 
-	res = proc_mkdir("cpu", NULL);
+	res = proc_create("cpu/alignment", S_IWUSR | S_IRUGO, NULL,
+			  &alignment_proc_fops);
 	if (!res)
 		return -ENOMEM;
+#endif
 
-	res = create_proc_entry("alignment", S_IWUSR | S_IRUGO, res);
-	if (!res)
-		return -ENOMEM;
+	if (cpu_is_v6_unaligned()) {
+		set_cr(__clear_cr(CR_A));
+		ai_usermode = safe_usermode(ai_usermode, false);
+	}
 
-	res->read_proc = proc_alignment_read;
-	res->write_proc = proc_alignment_write;
-#endif
+	cr_no_alignment = get_cr() & ~CR_A;
 
-	hook_fault_code(1, do_alignment, SIGILL, "alignment exception");
-	hook_fault_code(3, do_alignment, SIGILL, "alignment exception");
+	hook_fault_code(FAULT_CODE_ALIGNMENT, do_alignment, SIGBUS, BUS_ADRALN,
+			"alignment exception");
+
+	/*
+	 * ARMv6K and ARMv7 use fault status 3 (0b00011) as Access Flag section
+	 * fault, not as alignment error.
+	 *
+	 * TODO: handle ARMv6K properly. Runtime check for 'K' extension is
+	 * needed.
+	 */
+	if (cpu_architecture() <= CPU_ARCH_ARMv6) {
+		hook_fault_code(3, do_alignment, SIGBUS, BUS_ADRALN,
+				"alignment exception");
+	}
 
 	return 0;
 }
diff --git a/arch/arm/mm/cache-aurora-l2.h b/arch/arm/mm/cache-aurora-l2.h
new file mode 100644
index 00000000000..c8612476983
--- /dev/null
+++ b/arch/arm/mm/cache-aurora-l2.h
@@ -0,0 +1,55 @@
+/*
+ * AURORA shared L2 cache controller support
+ *
+ * Copyright (C) 2012 Marvell
+ *
+ * Yehuda Yitschak <yehuday@marvell.com>
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef __ASM_ARM_HARDWARE_AURORA_L2_H
+#define __ASM_ARM_HARDWARE_AURORA_L2_H
+
+#define AURORA_SYNC_REG		    0x700
+#define AURORA_RANGE_BASE_ADDR_REG  0x720
+#define AURORA_FLUSH_PHY_ADDR_REG   0x7f0
+#define AURORA_INVAL_RANGE_REG	    0x774
+#define AURORA_CLEAN_RANGE_REG	    0x7b4
+#define AURORA_FLUSH_RANGE_REG	    0x7f4
+
+#define AURORA_ACR_REPLACEMENT_OFFSET	    27
+#define AURORA_ACR_REPLACEMENT_MASK	     \
+	(0x3 << AURORA_ACR_REPLACEMENT_OFFSET)
+#define AURORA_ACR_REPLACEMENT_TYPE_WAYRR    \
+	(0 << AURORA_ACR_REPLACEMENT_OFFSET)
+#define AURORA_ACR_REPLACEMENT_TYPE_LFSR     \
+	(1 << AURORA_ACR_REPLACEMENT_OFFSET)
+#define AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU \
+	(3 << AURORA_ACR_REPLACEMENT_OFFSET)
+
+#define AURORA_ACR_FORCE_WRITE_POLICY_OFFSET	0
+#define AURORA_ACR_FORCE_WRITE_POLICY_MASK	\
+	(0x3 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+#define AURORA_ACR_FORCE_WRITE_POLICY_DIS	\
+	(0 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+#define AURORA_ACR_FORCE_WRITE_BACK_POLICY	\
+	(1 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+#define AURORA_ACR_FORCE_WRITE_THRO_POLICY	\
+	(2 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+
+#define MAX_RANGE_SIZE		1024
+
+#define AURORA_WAY_SIZE_SHIFT	2
+
+#define AURORA_CTRL_FW		0x100
+
+/* choose a number outside L2X0_CACHE_ID_PART_MASK to be sure to make
+ * the distinction between a number coming from hardware and a number
+ * coming from the device tree */
+#define AURORA_CACHE_ID	       0x100
+
+#endif /* __ASM_ARM_HARDWARE_AURORA_L2_H */
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
new file mode 100644
index 00000000000..e505befe51b
--- /dev/null
+++ b/arch/arm/mm/cache-fa.S
@@ -0,0 +1,249 @@
+/*
+ *  linux/arch/arm/mm/cache-fa.S
+ *
+ *  Copyright (C) 2005 Faraday Corp.
+ *  Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on cache-v4wb.S:
+ *  Copyright (C) 1997-2002 Russell king
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Processors: FA520 FA526 FA626	
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+
+#include "proc-macros.S"
+
+/*
+ * The size of one data cache line.
+ */
+#define CACHE_DLINESIZE	16
+
+/*
+ * The total size of the data cache.
+ */
+#ifdef CONFIG_ARCH_GEMINI
+#define CACHE_DSIZE	8192
+#else
+#define CACHE_DSIZE	16384 
+#endif 
+
+/* FIXME: put optimal value here. Current one is just estimation */
+#define CACHE_DLIMIT	(CACHE_DSIZE * 2)
+
+/*
+ *	flush_icache_all()
+ *
+ *	Unconditionally clean and invalidate the entire icache.
+ */
+ENTRY(fa_flush_icache_all)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
+	mov	pc, lr
+ENDPROC(fa_flush_icache_all)
+
+/*
+ *	flush_user_cache_all()
+ *
+ *	Clean and invalidate all cache entries in a particular address
+ *	space.
+ */
+ENTRY(fa_flush_user_cache_all)
+	/* FALLTHROUGH */
+/*
+ *	flush_kern_cache_all()
+ *
+ *	Clean and invalidate the entire cache.
+ */
+ENTRY(fa_flush_kern_cache_all)
+	mov	ip, #0
+	mov	r2, #VM_EXEC
+__flush_whole_cache:
+	mcr	p15, 0, ip, c7, c14, 0		@ clean/invalidate D cache
+	tst	r2, #VM_EXEC
+	mcrne	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+	mcrne	p15, 0, ip, c7, c5, 6		@ invalidate BTB
+	mcrne	p15, 0, ip, c7, c10, 4		@ drain write buffer
+	mcrne	p15, 0, ip, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+/*
+ *	flush_user_cache_range(start, end, flags)
+ *
+ *	Clean and invalidate a range of D-cache entries in the specified
+ *	address space (plus the matching I-cache lines when VM_EXEC is set).
+ *
+ *	- start - start address (inclusive, page aligned)
+ *	- end	- end address (exclusive, page aligned)
+ *	- flags	- vm_area_struct flags describing address space
+ */
+ENTRY(fa_flush_user_cache_range)
+	mov	ip, #0
+	sub	r3, r1, r0			@ calculate total size
+	cmp	r3, #CACHE_DLIMIT		@ total size >= limit?
+	bhs	__flush_whole_cache		@ flush whole D cache
+
+1:	tst	r2, #VM_EXEC
+	mcrne	p15, 0, r0, c7, c5, 1		@ invalidate I line
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	tst	r2, #VM_EXEC
+	mcrne	p15, 0, ip, c7, c5, 6		@ invalidate BTB
+	mcrne	p15, 0, ip, c7, c10, 4		@ data write barrier
+	mcrne	p15, 0, ip, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+/*
+ *	coherent_kern_range(start, end)
+ *
+ *	Ensure coherency between the Icache and the Dcache in the
+ *	region described by start.  If you have non-snooping
+ *	Harvard caches, you need to implement this function.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+ENTRY(fa_coherent_kern_range)
+	/* fall through */
+
+/*
+ *	coherent_user_range(start, end)
+ *
+ *	Ensure coherency between the Icache and the Dcache in the
+ *	region described by start.  If you have non-snooping
+ *	Harvard caches, you need to implement this function.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+ENTRY(fa_coherent_user_range)
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D entry
+	mcr	p15, 0, r0, c7, c5, 1		@ invalidate I entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 6		@ invalidate BTB
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mcr	p15, 0, r0, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+/*
+ *	flush_kern_dcache_area(void *addr, size_t size)
+ *
+ *	Ensure that the data held in the page kaddr is written back
+ *	to the page in question.
+ *
+ *	- addr	- kernel address
+ *	- size	- size of region
+ */
+ENTRY(fa_flush_kern_dcache_area)
+	add	r1, r0, r1
+1:	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_inv_range(start, end)
+ *
+ *	Invalidate (discard) the specified virtual address range.
+ *	May not write back any entries.  If 'start' or 'end'
+ *	are not cache line aligned, those lines must be written
+ *	back.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+fa_dma_inv_range:
+	tst	r0, #CACHE_DLINESIZE - 1
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
+	tst	r1, #CACHE_DLINESIZE - 1
+	bic	r1, r1, #CACHE_DLINESIZE - 1
+	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D entry
+1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_clean_range(start, end)
+ *
+ *	Clean (write back) the specified virtual address range.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+fa_dma_clean_range:
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0	
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_flush_range(start,end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+ENTRY(fa_dma_flush_range)
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0	
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(fa_dma_map_area)
+	add	r1, r1, r0
+	cmp	r2, #DMA_TO_DEVICE
+	beq	fa_dma_clean_range
+	bcs	fa_dma_inv_range
+	b	fa_dma_flush_range
+ENDPROC(fa_dma_map_area)
+
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(fa_dma_unmap_area)
+	mov	pc, lr
+ENDPROC(fa_dma_unmap_area)
+
+	.globl	fa_flush_kern_cache_louis
+	.equ	fa_flush_kern_cache_louis, fa_flush_kern_cache_all
+
+	__INITDATA
+
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions fa
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
new file mode 100644
index 00000000000..e028a7f2ebc
--- /dev/null
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -0,0 +1,396 @@
+/*
+ * arch/arm/mm/cache-feroceon-l2.c - Feroceon L2 cache controller support
+ *
+ * Copyright (C) 2008 Marvell Semiconductor
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ * References:
+ * - Unified Layer 2 Cache for Feroceon CPU Cores,
+ *   Document ID MV-S104858-00, Rev. A, October 23 2007.
+ */
+
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <asm/cacheflush.h>
+#include <asm/cp15.h>
+#include <asm/hardware/cache-feroceon-l2.h>
+
+#define L2_WRITETHROUGH_KIRKWOOD	BIT(4)
+
+/*
+ * Low-level cache maintenance operations.
+ *
+ * As well as the regular 'clean/invalidate/flush L2 cache line by
+ * MVA' instructions, the Feroceon L2 cache controller also features
+ * 'clean/invalidate L2 range by MVA' operations.
+ *
+ * Cache range operations are initiated by writing the start and
+ * end addresses to successive cp15 registers, and process every
+ * cache line whose first byte address lies in the inclusive range
+ * [start:end].
+ *
+ * The cache range operations stall the CPU pipeline until completion.
+ *
+ * The range operations require two successive cp15 writes, in
+ * between which we don't want to be preempted.
+ */
+
+static inline unsigned long l2_get_va(unsigned long paddr)
+{
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * Because range ops can't be done on physical addresses,
+	 * we simply install a virtual mapping for it only for the
+	 * TLB lookup to occur, hence no need to flush the untouched
+	 * memory mapping afterwards (note: a cache flush may happen
+	 * in some circumstances depending on the path taken in kunmap_atomic).
+	 */
+	void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT);
+	return (unsigned long)vaddr + (paddr & ~PAGE_MASK);
+#else
+	return __phys_to_virt(paddr);
+#endif
+}
+
+static inline void l2_put_va(unsigned long vaddr)
+{
+#ifdef CONFIG_HIGHMEM
+	kunmap_atomic((void *)vaddr);
+#endif
+}
+
+static inline void l2_clean_pa(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c15, c9, 3" : : "r" (addr));
+}
+
+static inline void l2_clean_pa_range(unsigned long start, unsigned long end)
+{
+	unsigned long va_start, va_end, flags;
+
+	/*
+	 * Clean the lines from 'start' to 'end' (inclusive) with one range
+	 * operation.  Both must reference the same page, as L2 is PIPT and
+	 * range operations only do a TLB lookup on the start address.
+	 */
+	BUG_ON((start ^ end) >> PAGE_SHIFT);
+
+	va_start = l2_get_va(start);
+	va_end = va_start + (end - start);
+	raw_local_irq_save(flags);	/* keep both cp15 writes together */
+	__asm__("mcr p15, 1, %0, c15, c9, 4\n\t"
+		"mcr p15, 1, %1, c15, c9, 5"
+		: : "r" (va_start), "r" (va_end));
+	raw_local_irq_restore(flags);
+	l2_put_va(va_start);
+}
+
+static inline void l2_clean_inv_pa(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c15, c10, 3" : : "r" (addr));
+}
+
+static inline void l2_inv_pa(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c15, c11, 3" : : "r" (addr));
+}
+
+static inline void l2_inv_pa_range(unsigned long start, unsigned long end)
+{
+	unsigned long va_start, va_end, flags;
+
+	/*
+	 * Invalidate the lines from 'start' to 'end' (inclusive) with one
+	 * range operation.  Both must reference the same page, as L2 is PIPT
+	 * and range operations only do a TLB lookup on the start address.
+	 */
+	BUG_ON((start ^ end) >> PAGE_SHIFT);
+
+	va_start = l2_get_va(start);
+	va_end = va_start + (end - start);
+	raw_local_irq_save(flags);	/* keep both cp15 writes together */
+	__asm__("mcr p15, 1, %0, c15, c11, 4\n\t"
+		"mcr p15, 1, %1, c15, c11, 5"
+		: : "r" (va_start), "r" (va_end));
+	raw_local_irq_restore(flags);
+	l2_put_va(va_start);
+}
+
+static inline void l2_inv_all(void)
+{
+	__asm__("mcr p15, 1, %0, c15, c11, 0" : : "r" (0));
+}
+
+/*
+ * Linux primitives.
+ *
+ * Note that the end addresses passed to Linux primitives are
+ * noninclusive, while the hardware cache range operations use
+ * inclusive start and end addresses.
+ */
+#define CACHE_LINE_SIZE		32
+#define MAX_RANGE_SIZE		1024
+
+static int l2_wt_override;
+
+/*
+ * Return the exclusive end of the next chunk to pass to a range operation:
+ * no further than 'end', no more than MAX_RANGE_SIZE bytes past 'start',
+ * and not beyond the end of the page that contains 'start'.
+ * Callers subtract CACHE_LINE_SIZE to get the inclusive hardware end.
+ *
+ * Both 'start' and 'end' must be cache line aligned (BUG otherwise).
+ */
+static unsigned long calc_range_end(unsigned long start, unsigned long end)
+{
+	const unsigned long line_mask = CACHE_LINE_SIZE - 1;
+	/* Range operations stall the CPU pipeline, so bound each chunk. */
+	const unsigned long size_cap = start + MAX_RANGE_SIZE;
+	/* Range operations can't straddle a page boundary. */
+	const unsigned long page_cap = (start | (PAGE_SIZE - 1)) + 1;
+	unsigned long stop = end;
+
+	BUG_ON(start & line_mask);
+	BUG_ON(end & line_mask);
+
+	if (stop > size_cap)
+		stop = size_cap;
+
+	if (stop > page_cap)
+		stop = page_cap;
+
+	return stop;
+}
+
+/*
+ * Invalidate [start, end) in the L2: partly covered edge lines are cleaned
+ * and invalidated, full lines in between are invalidated chunk by chunk.
+ */
+static void feroceon_l2_inv_range(unsigned long start, unsigned long end)
+{
+	const unsigned long line_mask = CACHE_LINE_SIZE - 1;
+	unsigned long chunk_end;
+
+	/* Partial first line: clean+invalidate it, then step to the next line. */
+	if (start & line_mask) {
+		l2_clean_inv_pa(start & ~line_mask);
+		start = (start | line_mask) + 1;
+	}
+
+	/* Partial last line (if anything is left): same, then trim 'end'. */
+	if (start < end && (end & line_mask)) {
+		l2_clean_inv_pa(end & ~line_mask);
+		end &= ~line_mask;
+	}
+
+	/* Whole lines; the range operation takes an inclusive last line. */
+	for (; start < end; start = chunk_end) {
+		chunk_end = calc_range_end(start, end);
+		l2_inv_pa_range(start, chunk_end - CACHE_LINE_SIZE);
+	}
+
+	dsb();
+}
+
+/* Clean [start, end), rounded out to whole lines; a WT L2 is always clean. */
+static void feroceon_l2_clean_range(unsigned long start, unsigned long end)
+{
+	const unsigned long line_mask = CACHE_LINE_SIZE - 1;
+	unsigned long chunk_end;
+
+	/* With the L2 forced to write-through only the barrier is needed. */
+	if (!l2_wt_override) {
+		start &= ~line_mask;
+		end = (end + line_mask) & ~line_mask;
+		for (; start != end; start = chunk_end) {
+			chunk_end = calc_range_end(start, end);
+			l2_clean_pa_range(start, chunk_end - CACHE_LINE_SIZE);
+		}
+	}
+
+	dsb();
+}
+
+/* Clean (unless the L2 is forced WT) then invalidate the lines of [start, end). */
+static void feroceon_l2_flush_range(unsigned long start, unsigned long end)
+{
+	unsigned long chunk_end;
+
+	start &= ~(CACHE_LINE_SIZE - 1);
+	end = (end + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
+	for (; start != end; start = chunk_end) {
+		chunk_end = calc_range_end(start, end);
+		if (!l2_wt_override)
+			l2_clean_pa_range(start, chunk_end - CACHE_LINE_SIZE);
+		l2_inv_pa_range(start, chunk_end - CACHE_LINE_SIZE);
+	}
+	dsb();
+}
+
+/*
+ * Routines to disable and re-enable the D-cache and I-cache at run
+ * time.  These are necessary because the L2 cache can only be enabled
+ * or disabled while the L1 Dcache and Icache are both disabled.
+ */
+/* If CR_C is set: flush all caches and clear it with IRQs off, return 1. */
+static int __init flush_and_disable_dcache(void)
+{
+	const u32 ctrl = get_cr();
+	unsigned long irq_state;
+
+	if (!(ctrl & CR_C))
+		return 0;	/* already clear: nothing for the caller to restore */
+
+	raw_local_irq_save(irq_state);
+	flush_cache_all();
+	set_cr(ctrl & ~CR_C);
+	raw_local_irq_restore(irq_state);
+
+	return 1;
+}
+
+/* Set CR_C in the control register again (undoes flush_and_disable_dcache). */
+static void __init enable_dcache(void)
+{
+	const u32 ctrl = get_cr();
+
+	set_cr(ctrl | CR_C);
+}
+
+static void __init __invalidate_icache(void)
+{
+	__asm__("mcr p15, 0, %0, c7, c5, 0" : : "r" (0));
+}
+
+/* If CR_I is set: clear it, invalidate the I-cache and return 1; else 0. */
+static int __init invalidate_and_disable_icache(void)
+{
+	const u32 ctrl = get_cr();
+
+	if (!(ctrl & CR_I))
+		return 0;
+
+	set_cr(ctrl & ~CR_I);
+	__invalidate_icache();
+	return 1;
+}
+
+/* Set CR_I in the control register again (undoes the I-cache disable). */
+static void __init enable_icache(void)
+{
+	const u32 ctrl = get_cr();
+
+	set_cr(ctrl | CR_I);
+}
+
+/* Read CPU Extra Features; volatile stops reuse of an earlier (stale) read. */
+static inline u32 read_extra_features(void)
+{
+	u32 u;
+
+	__asm__ __volatile__("mrc p15, 1, %0, c15, c1, 0" : "=r" (u));
+	return u;
+}
+
+static inline void write_extra_features(u32 u)
+{
+	__asm__("mcr p15, 1, %0, c15, c1, 0" : : "r" (u));
+}
+
+/*
+ * Set the Disable L2 Prefetch bit of CPU Extra Features if it is clear.
+ */
+static void __init disable_l2_prefetch(void)
+{
+	const u32 prefetch_off = 0x01000000;	/* Disable L2 Prefetch bit */
+	const u32 features = read_extra_features();
+
+	if (features & prefetch_off)
+		return;
+
+	printk(KERN_INFO "Feroceon L2: Disabling L2 prefetch.\n");
+	write_extra_features(features | prefetch_off);
+}
+
+/* Enable the L2 (Extra Features bit 0x00400000) while the L1 caches are off. */
+static void __init enable_l2(void)
+{
+	const u32 l2_on = 0x00400000;
+	const u32 features = read_extra_features();
+	int icache_was_on, dcache_was_on;
+
+	if (features & l2_on) {
+		pr_err(FW_BUG
+		       "Feroceon L2: bootloader left the L2 cache on!\n");
+		return;
+	}
+	printk(KERN_INFO "Feroceon L2: Enabling L2\n");
+	dcache_was_on = flush_and_disable_dcache();
+	icache_was_on = invalidate_and_disable_icache();
+	l2_inv_all();
+	write_extra_features(features | l2_on);
+	if (icache_was_on)
+		enable_icache();
+	if (dcache_was_on)
+		enable_dcache();
+}
+
+/* One-time L2 setup: record WT override, prefetch off, hook ops, enable. */
+void __init feroceon_l2_init(int __l2_wt_override)
+{
+	const char *mode = __l2_wt_override ? ", in WT override mode" : "";
+
+	l2_wt_override = __l2_wt_override;
+	disable_l2_prefetch();
+
+	outer_cache.inv_range = feroceon_l2_inv_range;
+	outer_cache.clean_range = feroceon_l2_clean_range;
+	outer_cache.flush_range = feroceon_l2_flush_range;
+	enable_l2();
+
+	printk(KERN_INFO "Feroceon L2: Cache support initialised%s.\n", mode);
+}
+#ifdef CONFIG_OF
+static const struct of_device_id feroceon_ids[] __initconst = {
+	{ .compatible = "marvell,kirkwood-cache"},
+	{ .compatible = "marvell,feroceon-cache"},
+	{}
+};
+
+/* DT probe: set the Kirkwood L2 write policy if that node exists, then init L2. */
+int __init feroceon_of_init(void)
+{
+	struct device_node *node = of_find_matching_node(NULL, feroceon_ids);
+	void __iomem *base;
+	bool wt_override = false;	/* distinct name: does not shadow l2_wt_override */
+	struct resource res;
+	int ret = -ENODEV;
+#if defined(CONFIG_CACHE_FEROCEON_L2_WRITETHROUGH)
+	wt_override = true;
+#endif
+	if (node && of_device_is_compatible(node, "marvell,kirkwood-cache")) {
+		if (of_address_to_resource(node, 0, &res))
+			goto out;
+		ret = -ENOMEM;
+		base = ioremap(res.start, resource_size(&res));
+		if (!base)
+			goto out;
+		if (wt_override)
+			writel(readl(base) | L2_WRITETHROUGH_KIRKWOOD, base);
+		else
+			writel(readl(base) & ~L2_WRITETHROUGH_KIRKWOOD, base);
+		iounmap(base);		/* policy programmed; mapping no longer needed */
+	}
+	feroceon_l2_init(wt_override);
+	ret = 0;
+out:
+	of_node_put(node);		/* NULL-safe; drops the reference taken by the lookup */
+	return ret;
+}
+#endif
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index 76b800a9519..7c3fb41a462 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -16,105 +16,1532 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
+#include <linux/cpu.h>
+#include <linux/err.h>
 #include <linux/init.h>
+#include <linux/smp.h>
 #include <linux/spinlock.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 
 #include <asm/cacheflush.h>
-#include <asm/io.h>
+#include <asm/cp15.h>
+#include <asm/cputype.h>
 #include <asm/hardware/cache-l2x0.h>
+#include "cache-tauros3.h"
+#include "cache-aurora-l2.h"
+
+/*
+ * Per-controller-type description consumed by __l2c_init().
+ */
+struct l2c_init_data {
+	/* controller name used in the boot log messages */
+	const char *type;
+	/* way size in bytes that a way-size field value of zero would mean */
+	unsigned way_size_0;
+	/* lockdown register count, passed to ->enable() */
+	unsigned num_lock;
+	/*
+	 * optional device-tree parser taking (node, &aux_val, &aux_mask);
+	 * NOTE(review): its caller is outside this part of the file
+	 */
+	void (*of_parse)(const struct device_node *, u32 *, u32 *);
+	/* called as (base, aux, num_lock) when the controller is not enabled */
+	void (*enable)(void __iomem *, u32, unsigned);
+	/* optional; may adjust the outer_cache_fns copy before it is installed */
+	void (*fixup)(void __iomem *, u32, struct outer_cache_fns *);
+	/* optional; called with the register base after the enable step */
+	void (*save)(void __iomem *);
+	/* template copied into the global outer_cache by __l2c_init() */
+	struct outer_cache_fns outer_cache;
+};
 
 #define CACHE_LINE_SIZE		32
 
 static void __iomem *l2x0_base;
-static DEFINE_SPINLOCK(l2x0_lock);
+static DEFINE_RAW_SPINLOCK(l2x0_lock);	/* taken around multi-step controller operations */
+static u32 l2x0_way_mask;	/* Bitmask of active ways */
+static u32 l2x0_size;		/* ways * way size, computed in __l2c_init() */
+/* Register written to request a sync; l2c310_fixup() may redirect it. */
+static unsigned long sync_reg_offset = L2X0_CACHE_SYNC;
+
+/* Register values captured by the ->save() hooks and read by the resume hooks. */
+struct l2x0_regs l2x0_saved_regs;
+
+/*
+ * Common code for all cache controllers.
+ */
+/*
+ * Spin, calling cpu_relax() between polls, until none of the @mask bits
+ * read back as set from @reg, i.e. until the cache operation by line or
+ * way has completed.
+ */
+static inline void l2c_wait_mask(void __iomem *reg, unsigned long mask)
+{
+	for (;;) {
+		if (!(readl_relaxed(reg) & mask))
+			break;
+		cpu_relax();
+	}
+}
 
-static inline void sync_writel(unsigned long val, unsigned long reg,
-			       unsigned long complete_mask)
+/*
+ * Write @val to the secure register at @base + @reg.
+ *
+ * The write goes straight to the register unless the platform has
+ * installed outer_cache.write_sec, which platforms running non-secure
+ * must do.  Nothing is written when the register already reads as @val.
+ */
+static void l2c_write_sec(unsigned long val, void __iomem *base, unsigned reg)
+{
+	void __iomem *addr = base + reg;
+
+	if (readl_relaxed(addr) == val)
+		return;
+
+	if (!outer_cache.write_sec) {
+		writel_relaxed(val, addr);
+		return;
+	}
+
+	/* The platform hook takes the register offset, not the address. */
+	outer_cache.write_sec(val, reg);
+}
+
+/*
+ * Write @val to the debug control register (L2X0_DEBUG_CTRL) of the
+ * controller at @base, going through l2c_write_sec().
+ *
+ * This should only be called when we have a requirement that the
+ * register be written due to a work-around, as platforms running
+ * in non-secure mode may not be able to access this register.
+ */
+static inline void l2c_set_debug(void __iomem *base, unsigned long val)
+{
+	l2c_write_sec(val, base, L2X0_DEBUG_CTRL);
+}
+
+/*
+ * Write the active-way bitmask to the by-way operation register @reg and
+ * spin until none of those bits read back as set.
+ */
+static void __l2c_op_way(void __iomem *reg)
+{
+	const u32 ways = l2x0_way_mask;
+
+	writel_relaxed(ways, reg);
+	l2c_wait_mask(reg, ways);
+}
+
+/*
+ * Write zero to the data and instruction lockdown registers of the first
+ * @num lockdown slots; consecutive slots are L2X0_LOCKDOWN_STRIDE apart.
+ */
+static inline void l2c_unlock(void __iomem *base, unsigned num)
+{
+	void __iomem *d_reg = base + L2X0_LOCKDOWN_WAY_D_BASE;
+	void __iomem *i_reg = base + L2X0_LOCKDOWN_WAY_I_BASE;
+
+	while (num--) {
+		writel_relaxed(0, d_reg);
+		writel_relaxed(0, i_reg);
+		d_reg += L2X0_LOCKDOWN_STRIDE;
+		i_reg += L2X0_LOCKDOWN_STRIDE;
+	}
+}
+
+/*
+ * Enable the L2 cache controller.  This function must only be
+ * called when the cache controller is known to be disabled.
+ *
+ * @base:     controller register base
+ * @aux:      value written to the auxiliary control register
+ * @num_lock: number of lockdown slots handed to l2c_unlock()
+ */
+static void l2c_enable(void __iomem *base, u32 aux, unsigned num_lock)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&l2x0_lock, flags);
-	writel(val, l2x0_base + reg);
-	/* wait for the operation to complete */
-	while (readl(l2x0_base + reg) & complete_mask)
-		;
-	spin_unlock_irqrestore(&l2x0_lock, flags);
+	l2c_write_sec(aux, base, L2X0_AUX_CTRL);
+
+	l2c_unlock(base, num_lock);
+
+	/* Invalidate all ways, then request a sync and wait, with IRQs off. */
+	local_irq_save(flags);
+	__l2c_op_way(base + L2X0_INV_WAY);
+	writel_relaxed(0, base + sync_reg_offset);
+	l2c_wait_mask(base + sync_reg_offset, 1);
+	local_irq_restore(flags);
+
+	/* Last step: set the enable bit in the control register. */
+	l2c_write_sec(L2X0_CTRL_EN, base, L2X0_CTRL);
+}
+
+/*
+ * Generic disable hook: call the installed outer_cache.flush_all(), write
+ * zero to the control register, then issue a store barrier.
+ */
+static void l2c_disable(void)
+{
+	void __iomem *base = l2x0_base;
+
+	outer_cache.flush_all();
+	l2c_write_sec(0, base, L2X0_CTRL);
+	dsb(st);
 }
 
+/*
+ * cache_wait() is the wait step used by cache_sync().  With
+ * CONFIG_CACHE_PL310 it is an empty function; otherwise it is an alias
+ * for l2c_wait_mask().
+ */
+#ifdef CONFIG_CACHE_PL310
+static inline void cache_wait(void __iomem *reg, unsigned long mask)
+{
+	/* cache operations by line are atomic on PL310 */
+}
+#else
+#define cache_wait	l2c_wait_mask
+#endif
+
+/*
+ * Request a cache sync by writing zero to the sync register, then run
+ * cache_wait() on bit 0 of L2X0_CACHE_SYNC.  No locking is done here.
+ *
+ * NOTE(review): the write uses sync_reg_offset but the wait polls
+ * L2X0_CACHE_SYNC; the two differ once l2c310_fixup() has switched
+ * sync_reg_offset to L2X0_DUMMY_REG - confirm this is intended.
+ */
 static inline void cache_sync(void)
 {
-	sync_writel(0, L2X0_CACHE_SYNC, 1);
+	void __iomem *base = l2x0_base;
+
+	writel_relaxed(0, base + sync_reg_offset);
+	cache_wait(base + L2X0_CACHE_SYNC, 1);
+}
+
+/*
+ * debug_writel(): write @val to the debug control register of l2x0_base.
+ * Only does anything when the 588369 or 727915 erratum work-around is
+ * configured in.
+ */
+#if defined(CONFIG_PL310_ERRATA_588369) || defined(CONFIG_PL310_ERRATA_727915)
+static inline void debug_writel(unsigned long val)
+{
+	l2c_set_debug(l2x0_base, val);
+}
+#else
+/* Optimised out for non-errata case */
+static inline void debug_writel(unsigned long val)
+{
 }
+#endif
 
-static inline void l2x0_inv_all(void)
+/* Run cache_sync() with l2x0_lock held and interrupts disabled. */
+static void l2x0_cache_sync(void)
 {
-	/* invalidate all ways */
-	sync_writel(0xff, L2X0_INV_WAY, 0xff);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
 	cache_sync();
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * Clean and invalidate all ways, then sync.  The operation is bracketed
+ * by debug_writel(0x03) / debug_writel(0x00), which are no-ops unless an
+ * erratum work-around is configured.  This function takes no lock itself;
+ * both callers in view hold l2x0_lock.
+ */
+static void __l2x0_flush_all(void)
+{
+	debug_writel(0x03);
+	__l2c_op_way(l2x0_base + L2X0_CLEAN_INV_WAY);
+	cache_sync();
+	debug_writel(0x00);
+}
+
+/* Locked wrapper around __l2x0_flush_all(). */
+static void l2x0_flush_all(void)
+{
+	unsigned long flags;
+
+	/* clean and invalidate all ways */
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	__l2x0_flush_all();
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2x0_inv_range(unsigned long start, unsigned long end)
+/*
+ * Flush the whole cache and then clear the control register, all under
+ * l2x0_lock; a store barrier follows the register write.
+ */
+static void l2x0_disable(void)
 {
-	unsigned long addr;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	__l2x0_flush_all();
+	l2c_write_sec(0, l2x0_base, L2X0_CTRL);
+	dsb(st);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * Record the auxiliary control register of the controller at @base in
+ * l2x0_saved_regs.aux_ctrl, from where the resume hooks reprogram it.
+ */
+static void l2c_save(void __iomem *base)
+{
+	/* Honour the caller's @base instead of reading the l2x0_base global. */
+	l2x0_saved_regs.aux_ctrl = readl_relaxed(base + L2X0_AUX_CTRL);
+}
+
+/*
+ * L2C-210 specific code.
+ *
+ * The L2C-2x0 PA, set/way and sync operations are atomic, but we must
+ * ensure that no background operation is running.  The way operations
+ * are all background tasks.
+ *
+ * While a background operation is in progress, any new operation is
+ * ignored (unspecified whether this causes an error.)  Thankfully, not
+ * used on SMP.
+ *
+ * Never has a different sync register other than L2X0_CACHE_SYNC, but
+ * we use sync_reg_offset here so we can share some of this with L2C-310.
+ */
+/* Request a sync by writing zero to the sync register; does not wait. */
+static void __l2c210_cache_sync(void __iomem *base)
+{
+	writel_relaxed(0, base + sync_reg_offset);
+}
+
+/*
+ * Write every address from @start up to, but not including, @end to the
+ * by-PA operation register @reg, stepping by CACHE_LINE_SIZE.
+ */
+static void __l2c210_op_pa_range(void __iomem *reg, unsigned long start,
+	unsigned long end)
+{
+	unsigned long pa;
+
+	for (pa = start; pa < end; pa += CACHE_LINE_SIZE)
+		writel_relaxed(pa, reg);
+}
+
+/*
+ * Invalidate [start, end).  A cache line that is only partly covered at
+ * either edge is cleaned and invalidated instead of just invalidated;
+ * the fully covered lines in between are invalidated, then a sync is
+ * requested.
+ */
+static void l2c210_inv_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
 
 	if (start & (CACHE_LINE_SIZE - 1)) {
 		start &= ~(CACHE_LINE_SIZE - 1);
-		sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1);
+		writel_relaxed(start, base + L2X0_CLEAN_INV_LINE_PA);
 		start += CACHE_LINE_SIZE;
 	}
 
 	if (end & (CACHE_LINE_SIZE - 1)) {
 		end &= ~(CACHE_LINE_SIZE - 1);
-		sync_writel(end, L2X0_CLEAN_INV_LINE_PA, 1);
+		writel_relaxed(end, base + L2X0_CLEAN_INV_LINE_PA);
 	}
 
-	for (addr = start; addr < end; addr += CACHE_LINE_SIZE)
-		sync_writel(addr, L2X0_INV_LINE_PA, 1);
-	cache_sync();
+	__l2c210_op_pa_range(base + L2X0_INV_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
 }
 
-static void l2x0_clean_range(unsigned long start, unsigned long end)
+/*
+ * Clean every cache line overlapping [start, end): @start is rounded down
+ * to a line boundary, each line is cleaned by PA, then a sync is requested.
+ */
+static void l2c210_clean_range(unsigned long start, unsigned long end)
 {
-	unsigned long addr;
+	void __iomem *base = l2x0_base;
 
 	start &= ~(CACHE_LINE_SIZE - 1);
-	for (addr = start; addr < end; addr += CACHE_LINE_SIZE)
-		sync_writel(addr, L2X0_CLEAN_LINE_PA, 1);
-	cache_sync();
+	__l2c210_op_pa_range(base + L2X0_CLEAN_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
 }
 
-static void l2x0_flush_range(unsigned long start, unsigned long end)
+/*
+ * Clean and invalidate every cache line overlapping [start, end): @start
+ * is rounded down to a line boundary, each line is cleaned+invalidated by
+ * PA, then a sync is requested.
+ */
+static void l2c210_flush_range(unsigned long start, unsigned long end)
 {
-	unsigned long addr;
+	void __iomem *base = l2x0_base;
 
 	start &= ~(CACHE_LINE_SIZE - 1);
-	for (addr = start; addr < end; addr += CACHE_LINE_SIZE)
-		sync_writel(addr, L2X0_CLEAN_INV_LINE_PA, 1);
-	cache_sync();
+	__l2c210_op_pa_range(base + L2X0_CLEAN_INV_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
 }
 
-void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
+/*
+ * Clean and invalidate the whole cache by way, then request a sync.
+ * Must be called with interrupts disabled; this is enforced by BUG_ON().
+ */
+static void l2c210_flush_all(void)
 {
-	__u32 aux;
+	void __iomem *base = l2x0_base;
 
-	l2x0_base = base;
+	BUG_ON(!irqs_disabled());
+
+	__l2c_op_way(base + L2X0_CLEAN_INV_WAY);
+	__l2c210_cache_sync(base);
+}
+
+/* outer_cache .sync hook: request a sync on the controller at l2x0_base. */
+static void l2c210_sync(void)
+{
+	__l2c210_cache_sync(l2x0_base);
+}
+
+/*
+ * Resume hook: when the control register shows the controller disabled,
+ * re-enable it with the saved auxiliary control value and one lockdown
+ * slot.  An already enabled controller is left alone.
+ */
+static void l2c210_resume(void)
+{
+	void __iomem *base = l2x0_base;
+	u32 ctrl = readl_relaxed(base + L2X0_CTRL);
+
+	if (ctrl & L2X0_CTRL_EN)
+		return;
+
+	l2c_enable(base, l2x0_saved_regs.aux_ctrl, 1);
+}
+
+/*
+ * Init data for L2C-210.  l2x0_init() also uses it as the fallback for
+ * part numbers it does not recognise.
+ */
+static const struct l2c_init_data l2c210_data __initconst = {
+	.type = "L2C-210",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.enable = l2c_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all = l2c210_flush_all,
+		.disable = l2c_disable,
+		.sync = l2c210_sync,
+		.resume = l2c210_resume,
+	},
+};
+
+/*
+ * L2C-220 specific code.
+ *
+ * All operations are background operations: they have to be waited for.
+ * Conflicting requests generate a slave error (which will cause an
+ * imprecise abort.)  Never uses sync_reg_offset, so we hard-code the
+ * sync register here.
+ *
+ * However, we can re-use the l2c210_resume call.
+ */
+/* Request a sync via L2X0_CACHE_SYNC and spin until bit 0 reads as clear. */
+static inline void __l2c220_cache_sync(void __iomem *base)
+{
+	writel_relaxed(0, base + L2X0_CACHE_SYNC);
+	l2c_wait_mask(base + L2X0_CACHE_SYNC, 1);
+}
+
+/*
+ * Run the by-way operation at register offset @reg on all active ways and
+ * follow it with a sync, all under l2x0_lock with interrupts disabled.
+ */
+static void l2c220_op_way(void __iomem *base, unsigned reg)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	__l2c_op_way(base + reg);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * Issue the by-PA operation behind @reg for each cache line in
+ * [start, end).
+ *
+ * The callers in view enter with l2x0_lock held and pass the @flags they
+ * got from taking it.  The range is processed in chunks of at most 4096
+ * bytes; between chunks the lock is released and immediately retaken.
+ * Returns the flags value from the most recent lock acquisition, which
+ * the caller must use for its final unlock.
+ */
+static unsigned long l2c220_op_pa_range(void __iomem *reg, unsigned long start,
+	unsigned long end, unsigned long flags)
+{
+	raw_spinlock_t *lock = &l2x0_lock;
+
+	while (start < end) {
+		unsigned long blk_end = start + min(end - start, 4096UL);
+
+		while (start < blk_end) {
+			/* wait for bit 0 of @reg to clear before each write */
+			l2c_wait_mask(reg, 1);
+			writel_relaxed(start, reg);
+			start += CACHE_LINE_SIZE;
+		}
+
+		if (blk_end < end) {
+			/* more chunks follow: briefly drop the lock */
+			raw_spin_unlock_irqrestore(lock, flags);
+			raw_spin_lock_irqsave(lock, flags);
+		}
+	}
+
+	return flags;
+}
+
+/*
+ * Invalidate [start, end) on L2C-220.  A cache line that is only partly
+ * covered at either edge is cleaned and invalidated; the fully covered
+ * lines in between are invalidated.  The function waits for the last
+ * operation and for a sync before releasing l2x0_lock.
+ */
+static void l2c220_inv_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	if ((start | end) & (CACHE_LINE_SIZE - 1)) {
+		if (start & (CACHE_LINE_SIZE - 1)) {
+			start &= ~(CACHE_LINE_SIZE - 1);
+			writel_relaxed(start, base + L2X0_CLEAN_INV_LINE_PA);
+			start += CACHE_LINE_SIZE;
+		}
+
+		if (end & (CACHE_LINE_SIZE - 1)) {
+			end &= ~(CACHE_LINE_SIZE - 1);
+			/* a write for the start line may have just been issued */
+			l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1);
+			writel_relaxed(end, base + L2X0_CLEAN_INV_LINE_PA);
+		}
+	}
+
+	/* the helper may drop and retake the lock, so refresh flags */
+	flags = l2c220_op_pa_range(base + L2X0_INV_LINE_PA,
+				   start, end, flags);
+	l2c_wait_mask(base + L2X0_INV_LINE_PA, 1);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * Clean every cache line overlapping [start, end) on L2C-220.  When the
+ * range is at least as large as the whole cache (l2x0_size), a clean by
+ * way is used instead of per-line operations.
+ */
+static void l2c220_clean_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+	unsigned long flags;
+
+	start &= ~(CACHE_LINE_SIZE - 1);
+	if ((end - start) >= l2x0_size) {
+		l2c220_op_way(base, L2X0_CLEAN_WAY);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	flags = l2c220_op_pa_range(base + L2X0_CLEAN_LINE_PA,
+				   start, end, flags);
+	/*
+	 * NOTE(review): the operations above target L2X0_CLEAN_LINE_PA but
+	 * this wait polls L2X0_CLEAN_INV_LINE_PA - confirm this is intended.
+	 */
+	l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * Clean and invalidate every cache line overlapping [start, end) on
+ * L2C-220.  When the range is at least as large as the whole cache
+ * (l2x0_size), a clean+invalidate by way is used instead.
+ */
+static void l2c220_flush_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+	unsigned long flags;
+
+	start &= ~(CACHE_LINE_SIZE - 1);
+	if ((end - start) >= l2x0_size) {
+		l2c220_op_way(base, L2X0_CLEAN_INV_WAY);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	/* the helper may drop and retake the lock, so refresh flags */
+	flags = l2c220_op_pa_range(base + L2X0_CLEAN_INV_LINE_PA,
+				   start, end, flags);
+	l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/* Clean and invalidate the whole cache by way (locking is in the helper). */
+static void l2c220_flush_all(void)
+{
+	l2c220_op_way(l2x0_base, L2X0_CLEAN_INV_WAY);
+}
+
+/* outer_cache .sync hook: request a sync and wait for it under l2x0_lock. */
+static void l2c220_sync(void)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	__l2c220_cache_sync(l2x0_base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * L2C-220 enable hook: l2c_enable() with L220_AUX_CTRL_NS_LOCKDOWN forced
+ * on in @aux.
+ */
+static void l2c220_enable(void __iomem *base, u32 aux, unsigned num_lock)
+{
+	/*
+	 * Always enable non-secure access to the lockdown registers -
+	 * we write to them as part of the L2C enable sequence so they
+	 * need to be accessible.
+	 */
+	aux |= L220_AUX_CTRL_NS_LOCKDOWN;
+
+	l2c_enable(base, aux, num_lock);
+}
+
+/*
+ * Init data for L2C-220, selected by l2x0_init().  It is only referenced
+ * from __init code in view, so it is placed in __initconst like the other
+ * l2c_init_data tables.
+ */
+static const struct l2c_init_data l2c220_data __initconst = {
+	.type = "L2C-220",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.enable = l2c220_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range = l2c220_inv_range,
+		.clean_range = l2c220_clean_range,
+		.flush_range = l2c220_flush_range,
+		.flush_all = l2c220_flush_all,
+		.disable = l2c_disable,
+		.sync = l2c220_sync,
+		.resume = l2c210_resume,
+	},
+};
+
+/*
+ * L2C-310 specific code.
+ *
+ * Very similar to L2C-210, the PA, set/way and sync operations are atomic,
+ * and the way operations are all background tasks.  However, issuing an
+ * operation while a background operation is in progress results in a
+ * SLVERR response.  We can reuse:
+ *
+ *  __l2c210_cache_sync (using sync_reg_offset)
+ *  l2c210_sync
+ *  l2c210_inv_range (if 588369 is not applicable)
+ *  l2c210_clean_range
+ *  l2c210_flush_range (if 588369 is not applicable)
+ *  l2c210_flush_all (if 727915 is not applicable)
+ *
+ * Errata:
+ * 588369: PL310 R0P0->R1P0, fixed R2P0.
+ *	Affects: all clean+invalidate operations
+ *	clean and invalidate skips the invalidate step, so we need to issue
+ *	separate operations.  We also require the above debug workaround
+ *	enclosing this code fragment on affected parts.  On unaffected parts,
+ *	we must not use this workaround without the debug register writes
+ *	to avoid exposing a problem similar to 727915.
+ *
+ * 727915: PL310 R2P0->R3P0, fixed R3P1.
+ *	Affects: clean+invalidate by way
+ *	clean and invalidate by way runs in the background, and a store can
+ *	hit the line between the clean operation and invalidate operation,
+ *	resulting in the store being lost.
+ *
+ * 752271: PL310 R3P0->R3P1-50REL0, fixed R3P2.
+ *	Affects: 8x64-bit (double fill) line fetches
+ *	double fill line fetches can fail to cause dirty data to be evicted
+ *	from the cache before the new data overwrites the second line.
+ *
+ * 753970: PL310 R3P0, fixed R3P1.
+ *	Affects: sync
+ *	prevents merging writes after the sync operation, until another L2C
+ *	operation is performed (or a number of other conditions.)
+ *
+ * 769419: PL310 R0P0->R3P1, fixed R3P2.
+ *	Affects: store buffer
+ *	store buffer is not automatically drained.
+ */
+/*
+ * Invalidate [start, end) with the erratum 588369 work-around.  A cache
+ * line only partly covered at either edge gets a separate clean followed
+ * by an invalidate, bracketed by debug register writes under l2x0_lock.
+ * The fully covered lines are then invalidated without the lock and a
+ * sync is requested.
+ */
+static void l2c310_inv_range_erratum(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+
+	if ((start | end) & (CACHE_LINE_SIZE - 1)) {
+		unsigned long flags;
+
+		/* Erratum 588369 for both clean+invalidate operations */
+		raw_spin_lock_irqsave(&l2x0_lock, flags);
+		l2c_set_debug(base, 0x03);
+
+		if (start & (CACHE_LINE_SIZE - 1)) {
+			start &= ~(CACHE_LINE_SIZE - 1);
+			writel_relaxed(start, base + L2X0_CLEAN_LINE_PA);
+			writel_relaxed(start, base + L2X0_INV_LINE_PA);
+			start += CACHE_LINE_SIZE;
+		}
+
+		if (end & (CACHE_LINE_SIZE - 1)) {
+			end &= ~(CACHE_LINE_SIZE - 1);
+			writel_relaxed(end, base + L2X0_CLEAN_LINE_PA);
+			writel_relaxed(end, base + L2X0_INV_LINE_PA);
+		}
+
+		l2c_set_debug(base, 0x00);
+		raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+	}
+
+	__l2c210_op_pa_range(base + L2X0_INV_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
+}
+
+/*
+ * Clean and invalidate [start, end) with the erratum 588369 work-around:
+ * each line gets a separate clean then invalidate.  The range is handled
+ * in chunks of at most 4096 bytes, each bracketed by debug register
+ * writes; l2x0_lock is briefly released between chunks.  A sync is
+ * requested after the lock is dropped.
+ *
+ * NOTE(review): unlike l2c210_flush_range(), @start is not rounded down
+ * to a cache line boundary here - confirm callers pass aligned addresses.
+ */
+static void l2c310_flush_range_erratum(unsigned long start, unsigned long end)
+{
+	raw_spinlock_t *lock = &l2x0_lock;
+	unsigned long flags;
+	void __iomem *base = l2x0_base;
+
+	raw_spin_lock_irqsave(lock, flags);
+	while (start < end) {
+		unsigned long blk_end = start + min(end - start, 4096UL);
+
+		l2c_set_debug(base, 0x03);
+		while (start < blk_end) {
+			writel_relaxed(start, base + L2X0_CLEAN_LINE_PA);
+			writel_relaxed(start, base + L2X0_INV_LINE_PA);
+			start += CACHE_LINE_SIZE;
+		}
+		l2c_set_debug(base, 0x00);
+
+		if (blk_end < end) {
+			/* more chunks follow: briefly drop the lock */
+			raw_spin_unlock_irqrestore(lock, flags);
+			raw_spin_lock_irqsave(lock, flags);
+		}
+	}
+	raw_spin_unlock_irqrestore(lock, flags);
+	__l2c210_cache_sync(base);
+}
+
+/*
+ * Flush-all variant installed by l2c310_fixup() for erratum 727915: the
+ * clean+invalidate by way is bracketed by debug register writes, and the
+ * whole sequence including the sync request runs under l2x0_lock.
+ */
+static void l2c310_flush_all_erratum(void)
+{
+	void __iomem *base = l2x0_base;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	l2c_set_debug(base, 0x03);
+	__l2c_op_way(base + L2X0_CLEAN_INV_WAY);
+	l2c_set_debug(base, 0x00);
+	__l2c210_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+/*
+ * Save hook for L2C-310: besides the auxiliary control register (via
+ * l2c_save()), capture the tag/data latency and address filter registers
+ * and, depending on the RTL revision, the prefetch and power control
+ * registers into l2x0_saved_regs.
+ */
+static void __init l2c310_save(void __iomem *base)
+{
+	unsigned revision;
+
+	l2c_save(base);
+
+	l2x0_saved_regs.tag_latency = readl_relaxed(base +
+		L310_TAG_LATENCY_CTRL);
+	l2x0_saved_regs.data_latency = readl_relaxed(base +
+		L310_DATA_LATENCY_CTRL);
+	l2x0_saved_regs.filter_end = readl_relaxed(base +
+		L310_ADDR_FILTER_END);
+	l2x0_saved_regs.filter_start = readl_relaxed(base +
+		L310_ADDR_FILTER_START);
+
+	revision = readl_relaxed(base + L2X0_CACHE_ID) &
+			L2X0_CACHE_ID_RTL_MASK;
+
+	/* From r2p0, there is Prefetch offset/control register */
+	if (revision >= L310_CACHE_ID_RTL_R2P0)
+		l2x0_saved_regs.prefetch_ctrl = readl_relaxed(base +
+							L310_PREFETCH_CTRL);
+
+	/* From r3p0, there is Power control register */
+	if (revision >= L310_CACHE_ID_RTL_R3P0)
+		l2x0_saved_regs.pwr_ctrl = readl_relaxed(base +
+							L310_POWER_CTRL);
+}
+
+/*
+ * Resume hook for L2C-310.  Does nothing while the controller is still
+ * enabled; otherwise the saved latency, address filter, prefetch and
+ * power registers are written back before the cache is re-enabled with
+ * the saved auxiliary control value.
+ */
+static void l2c310_resume(void)
+{
+	void __iomem *base = l2x0_base;
+	unsigned rtl;
+
+	if (readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)
+		return;
+
+	/* restore pl310 setup */
+	writel_relaxed(l2x0_saved_regs.tag_latency,
+		       base + L310_TAG_LATENCY_CTRL);
+	writel_relaxed(l2x0_saved_regs.data_latency,
+		       base + L310_DATA_LATENCY_CTRL);
+	writel_relaxed(l2x0_saved_regs.filter_end,
+		       base + L310_ADDR_FILTER_END);
+	writel_relaxed(l2x0_saved_regs.filter_start,
+		       base + L310_ADDR_FILTER_START);
+
+	rtl = readl_relaxed(base + L2X0_CACHE_ID) & L2X0_CACHE_ID_RTL_MASK;
+
+	/* Same revision gates as the save side: r2p0 and r3p0. */
+	if (rtl >= L310_CACHE_ID_RTL_R2P0)
+		l2c_write_sec(l2x0_saved_regs.prefetch_ctrl, base,
+			      L310_PREFETCH_CTRL);
+	if (rtl >= L310_CACHE_ID_RTL_R3P0)
+		l2c_write_sec(l2x0_saved_regs.pwr_ctrl, base,
+			      L310_POWER_CTRL);
+
+	l2c_enable(base, l2x0_saved_regs.aux_ctrl, 8);
+
+	/* Re-enable full-line-of-zeros for Cortex-A9 */
+	if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_FULL_LINE_ZERO)
+		set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1));
+}
+
+/*
+ * CPU notifier: set bits 3, 2 and 1 of the auxiliary control register
+ * when a CPU is starting and clear them when it is dying.  The
+ * CPU_TASKS_FROZEN modifier is ignored; other events change nothing.
+ * Always returns NOTIFY_OK.
+ */
+static int l2c310_cpu_enable_flz(struct notifier_block *nb, unsigned long act, void *data)
+{
+	const unsigned long flz_bits = BIT(3) | BIT(2) | BIT(1);
+	unsigned long event = act & ~CPU_TASKS_FROZEN;
+
+	if (event == CPU_STARTING)
+		set_auxcr(get_auxcr() | flz_bits);
+	else if (event == CPU_DYING)
+		set_auxcr(get_auxcr() & ~flz_bits);
+
+	return NOTIFY_OK;
+}
+
+/*
+ * L2C-310 enable hook.  Adjusts the Cortex-A9 specific bits of @aux
+ * (early BRESP, full line of zeros) according to the CPU type and RTL
+ * revision, logs the prefetch setup, programs the power control register
+ * on r3p0 and later, forces L310_AUX_CTRL_NS_LOCKDOWN on and then calls
+ * l2c_enable().  If full line of zeros ends up enabled, the matching bits
+ * are also set in the CPU auxiliary control register and a CPU notifier
+ * is registered.
+ */
+static void __init l2c310_enable(void __iomem *base, u32 aux, unsigned num_lock)
+{
+	unsigned rev = readl_relaxed(base + L2X0_CACHE_ID) & L2X0_CACHE_ID_RTL_MASK;
+	bool cortex_a9 = read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9;
+
+	if (rev >= L310_CACHE_ID_RTL_R2P0) {
+		if (cortex_a9) {
+			aux |= L310_AUX_CTRL_EARLY_BRESP;
+			pr_info("L2C-310 enabling early BRESP for Cortex-A9\n");
+		} else if (aux & L310_AUX_CTRL_EARLY_BRESP) {
+			pr_warn("L2C-310 early BRESP only supported with Cortex-A9\n");
+			aux &= ~L310_AUX_CTRL_EARLY_BRESP;
+		}
+	}
+
+	if (cortex_a9) {
+		u32 aux_cur = readl_relaxed(base + L2X0_AUX_CTRL);
+		u32 acr = get_auxcr();
+
+		pr_debug("Cortex-A9 ACR=0x%08x\n", acr);
+
+		if (acr & BIT(3) && !(aux_cur & L310_AUX_CTRL_FULL_LINE_ZERO))
+			pr_err("L2C-310: full line of zeros enabled in Cortex-A9 but not L2C-310 - invalid\n");
+
+		if (aux & L310_AUX_CTRL_FULL_LINE_ZERO && !(acr & BIT(3)))
+			pr_err("L2C-310: enabling full line of zeros but not enabled in Cortex-A9\n");
+
+		/* only turned on by default when no write_sec hook is installed */
+		if (!(aux & L310_AUX_CTRL_FULL_LINE_ZERO) && !outer_cache.write_sec) {
+			aux |= L310_AUX_CTRL_FULL_LINE_ZERO;
+			pr_info("L2C-310 full line of zeros enabled for Cortex-A9\n");
+		}
+	} else if (aux & (L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP)) {
+		pr_err("L2C-310: disabling Cortex-A9 specific feature bits\n");
+		aux &= ~(L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP);
+	}
+
+	if (aux & (L310_AUX_CTRL_DATA_PREFETCH | L310_AUX_CTRL_INSTR_PREFETCH)) {
+		u32 prefetch = readl_relaxed(base + L310_PREFETCH_CTRL);
+
+		pr_info("L2C-310 %s%s prefetch enabled, offset %u lines\n",
+			aux & L310_AUX_CTRL_INSTR_PREFETCH ? "I" : "",
+			aux & L310_AUX_CTRL_DATA_PREFETCH ? "D" : "",
+			1 + (prefetch & L310_PREFETCH_CTRL_OFFSET_MASK));
+	}
+
+	/* r3p0 or later has power control register */
+	if (rev >= L310_CACHE_ID_RTL_R3P0) {
+		u32 power_ctrl;
+
+		l2c_write_sec(L310_DYNAMIC_CLK_GATING_EN | L310_STNDBY_MODE_EN,
+			      base, L310_POWER_CTRL);
+		/* read back so the log shows what the register actually holds */
+		power_ctrl = readl_relaxed(base + L310_POWER_CTRL);
+		pr_info("L2C-310 dynamic clock gating %sabled, standby mode %sabled\n",
+			power_ctrl & L310_DYNAMIC_CLK_GATING_EN ? "en" : "dis",
+			power_ctrl & L310_STNDBY_MODE_EN ? "en" : "dis");
+	}
+
+	/*
+	 * Always enable non-secure access to the lockdown registers -
+	 * we write to them as part of the L2C enable sequence so they
+	 * need to be accessible.
+	 */
+	aux |= L310_AUX_CTRL_NS_LOCKDOWN;
+
+	l2c_enable(base, aux, num_lock);
+
+	if (aux & L310_AUX_CTRL_FULL_LINE_ZERO) {
+		set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1));
+		cpu_notifier(l2c310_cpu_enable_flz, 0);
+	}
+}
+
+/*
+ * L2C-310 fixup hook: apply the erratum work-arounds that match the RTL
+ * revision in @cache_id by swapping handlers in @fns, adjusting the
+ * prefetch control register or redirecting sync_reg_offset, then log the
+ * list of errata that were applied.
+ */
+static void __init l2c310_fixup(void __iomem *base, u32 cache_id,
+	struct outer_cache_fns *fns)
+{
+	unsigned revision = cache_id & L2X0_CACHE_ID_RTL_MASK;
+	const char *errata[8];
+	unsigned n = 0;
 
-	/* disable L2X0 */
-	writel(0, l2x0_base + L2X0_CTRL);
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_588369) &&
+	    revision < L310_CACHE_ID_RTL_R2P0 &&
+	    /* For bcm compatibility */
+	    fns->inv_range == l2c210_inv_range) {
+		fns->inv_range = l2c310_inv_range_erratum;
+		fns->flush_range = l2c310_flush_range_erratum;
+		errata[n++] = "588369";
+	}
+
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_727915) &&
+	    revision >= L310_CACHE_ID_RTL_R2P0 &&
+	    revision < L310_CACHE_ID_RTL_R3P1) {
+		fns->flush_all = l2c310_flush_all_erratum;
+		errata[n++] = "727915";
+	}
+
+	/* 752271 is applied regardless of kernel configuration */
+	if (revision >= L310_CACHE_ID_RTL_R3P0 &&
+	    revision < L310_CACHE_ID_RTL_R3P2) {
+		u32 val = readl_relaxed(base + L310_PREFETCH_CTRL);
+		/* I don't think bit23 is required here... but iMX6 does so */
+		if (val & (BIT(30) | BIT(23))) {
+			val &= ~(BIT(30) | BIT(23));
+			l2c_write_sec(val, base, L310_PREFETCH_CTRL);
+			errata[n++] = "752271";
+		}
+	}
+
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_753970) &&
+	    revision == L310_CACHE_ID_RTL_R3P0) {
+		sync_reg_offset = L2X0_DUMMY_REG;
+		errata[n++] = "753970";
+	}
+
+	/* 769419 changes nothing here; it is only reported in the log */
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_769419))
+		errata[n++] = "769419";
+
+	if (n) {
+		unsigned i;
+
+		pr_info("L2C-310 errat%s", n > 1 ? "a" : "um");
+		for (i = 0; i < n; i++)
+			pr_cont(" %s", errata[i]);
+		pr_cont(" enabled\n");
+	}
+}
+
+/* L2C-310 disable hook: l2c_disable() preceded by the step described below. */
+static void l2c310_disable(void)
+{
+	/*
+	 * If full-line-of-zeros is enabled, we must first disable it in the
+	 * Cortex-A9 auxiliary control register before disabling the L2 cache.
+	 */
+	if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_FULL_LINE_ZERO)
+		set_auxcr(get_auxcr() & ~(BIT(3) | BIT(2) | BIT(1)));
+
+	l2c_disable();
+}
 
-	aux = readl(l2x0_base + L2X0_AUX_CTRL);
+/*
+ * Init data for L2C-310, selected by l2x0_init().  The L2C-210 handlers
+ * listed here may be replaced by l2c310_fixup() on affected revisions.
+ */
+static const struct l2c_init_data l2c310_init_fns __initconst = {
+	.type = "L2C-310",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.enable = l2c310_enable,
+	.fixup = l2c310_fixup,
+	.save = l2c310_save,
+	.outer_cache = {
+		.inv_range = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all = l2c210_flush_all,
+		.disable = l2c310_disable,
+		.sync = l2c210_sync,
+		.resume = l2c310_resume,
+	},
+};
+
+/*
+ * Common initialisation for every controller type.
+ *
+ * Computes the new auxiliary control value from the current register
+ * contents, @aux_mask (bits kept) and @aux_val (bits set); derives the
+ * way count, way mask and cache size; runs the optional ->fixup() hook on
+ * a copy of the handler table; enables the controller if it is not
+ * already enabled; installs the handlers in outer_cache; and runs the
+ * optional ->save() hook.
+ *
+ * @data:     per-type description
+ * @aux_val:  auxiliary control bits to set
+ * @aux_mask: auxiliary control bits to preserve from the hardware value
+ * @cache_id: cache ID value used to pick the way-count decoding
+ */
+static void __init __l2c_init(const struct l2c_init_data *data,
+	u32 aux_val, u32 aux_mask, u32 cache_id)
+{
+	struct outer_cache_fns fns;
+	unsigned way_size_bits, ways;
+	u32 aux, old_aux;
+
+	/*
+	 * Sanity check the aux values.  aux_mask is the bits we preserve
+	 * from reading the hardware register, and aux_val is the bits we
+	 * set.
+	 */
+	if (aux_val & aux_mask)
+		pr_alert("L2C: platform provided aux values permit register corruption.\n");
+
+	old_aux = aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
 	aux &= aux_mask;
 	aux |= aux_val;
-	writel(aux, l2x0_base + L2X0_AUX_CTRL);
 
-	l2x0_inv_all();
+	if (old_aux != aux)
+		pr_warn("L2C: DT/platform modifies aux control register: 0x%08x -> 0x%08x\n",
+		        old_aux, aux);
+
+	/* Determine the number of ways */
+	switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
+	case L2X0_CACHE_ID_PART_L310:
+		if ((aux_val | ~aux_mask) & (L2C_AUX_CTRL_WAY_SIZE_MASK | L310_AUX_CTRL_ASSOCIATIVITY_16))
+			pr_warn("L2C: DT/platform tries to modify or specify cache size\n");
+		/* NOTE(review): presumably the L310_AUX_CTRL_ASSOCIATIVITY_16 bit - confirm */
+		if (aux & (1 << 16))
+			ways = 16;
+		else
+			ways = 8;
+		break;
+
+	case L2X0_CACHE_ID_PART_L210:
+	case L2X0_CACHE_ID_PART_L220:
+		ways = (aux >> 13) & 0xf;
+		break;
+
+	case AURORA_CACHE_ID:
+		ways = (aux >> 13) & 0xf;
+		ways = 2 << ((ways + 1) >> 2);
+		break;
+
+	default:
+		/* Assume unknown chips have 8 ways */
+		ways = 8;
+		break;
+	}
+
+	l2x0_way_mask = (1 << ways) - 1;
+
+	/*
+	 * way_size_0 is the size that a way_size value of zero would be
+	 * given the calculation: way_size = way_size_0 << way_size_bits.
+	 * So, if way_size_bits=0 is reserved, but way_size_bits=1 is 16k,
+	 * then way_size_0 would be 8k.
+	 *
+	 * L2 cache size = number of ways * way size.
+	 */
+	way_size_bits = (aux & L2C_AUX_CTRL_WAY_SIZE_MASK) >>
+			L2C_AUX_CTRL_WAY_SIZE_SHIFT;
+	l2x0_size = ways * (data->way_size_0 << way_size_bits);
+
+	/* work on a copy, keeping any write_sec hook already installed */
+	fns = data->outer_cache;
+	fns.write_sec = outer_cache.write_sec;
+	if (data->fixup)
+		data->fixup(l2x0_base, cache_id, &fns);
+
+	/*
+	 * Check if l2x0 controller is already enabled.  If we are booting
+	 * in non-secure mode accessing the below registers will fault.
+	 */
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN))
+		data->enable(l2x0_base, aux, data->num_lock);
+
+	outer_cache = fns;
+
+	/*
+	 * It is strange to save the register state before initialisation,
+	 * but hey, this is what the DT implementations decided to do.
+	 */
+	if (data->save)
+		data->save(l2x0_base);
+
+	/* Re-read it in case some bits are reserved. */
+	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+
+	/* NOTE(review): ways and l2x0_size are unsigned but printed with %d */
+	pr_info("%s cache controller enabled, %d ways, %d kB\n",
+		data->type, ways, l2x0_size >> 10);
+	pr_info("%s: CACHE_ID 0x%08x, AUX_CTRL 0x%08x\n",
+		data->type, cache_id, aux);
+}
+
+/*
+ * Non-DT entry point: remember @base as the controller base, read the
+ * cache ID register, pick the init data matching its part number and hand
+ * over to __l2c_init().
+ */
+void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
+{
+	const struct l2c_init_data *data;
+	u32 cache_id, part;
+
+	l2x0_base = base;
+
+	cache_id = readl_relaxed(base + L2X0_CACHE_ID);
+	part = cache_id & L2X0_CACHE_ID_PART_MASK;
+
+	if (part == L2X0_CACHE_ID_PART_L220)
+		data = &l2c220_data;
+	else if (part == L2X0_CACHE_ID_PART_L310)
+		data = &l2c310_init_fns;
+	else
+		data = &l2c210_data;	/* L2C-210 and any unrecognised part */
+
+	__l2c_init(data, aux_val, aux_mask, cache_id);
+}
+
+#ifdef CONFIG_OF
+static int l2_wt_override;
+
+/* Aurora doesn't have the cache ID register available, so we have to
+ * pass it through the device tree */
+static u32 cache_id_part_number_from_dt;
+
+/*
+ * Device-tree parser for L2C-210/220: read "arm,tag-latency",
+ * "arm,data-latency" (two cells) and "arm,dirty-latency" from @np.  Each
+ * latency that is present and non-zero is stored, minus one, in its
+ * auxiliary control field.  The affected fields are replaced in *@aux_val
+ * and cleared from *@aux_mask.
+ */
+static void __init l2x0_of_parse(const struct device_node *np,
+				 u32 *aux_val, u32 *aux_mask)
+{
+	u32 data[2] = { 0, 0 };
+	u32 tag = 0;
+	u32 dirty = 0;
+	u32 val = 0, mask = 0;
+
+	of_property_read_u32(np, "arm,tag-latency", &tag);
+	if (tag) {
+		mask |= L2X0_AUX_CTRL_TAG_LATENCY_MASK;
+		val |= (tag - 1) << L2X0_AUX_CTRL_TAG_LATENCY_SHIFT;
+	}
+
+	of_property_read_u32_array(np, "arm,data-latency",
+				   data, ARRAY_SIZE(data));
+	/* both the read and the write latency must be given */
+	if (data[0] && data[1]) {
+		mask |= L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK |
+			L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK;
+		val |= ((data[0] - 1) << L2X0_AUX_CTRL_DATA_RD_LATENCY_SHIFT) |
+		       ((data[1] - 1) << L2X0_AUX_CTRL_DATA_WR_LATENCY_SHIFT);
+	}
+
+	of_property_read_u32(np, "arm,dirty-latency", &dirty);
+	if (dirty) {
+		mask |= L2X0_AUX_CTRL_DIRTY_LATENCY_MASK;
+		val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT;
+	}
+
+	*aux_val &= ~mask;
+	*aux_val |= val;
+	*aux_mask &= ~mask;
+}
+
+/*
+ * Device-tree variant of the L2C-210 init data: the same hooks as
+ * l2c210_data plus an .of_parse callback.
+ */
+static const struct l2c_init_data of_l2c210_data __initconst = {
+	.type = "L2C-210",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.of_parse = l2x0_of_parse,
+	.enable = l2c_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range   = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c_disable,
+		.sync        = l2c210_sync,
+		.resume      = l2c210_resume,
+	},
+};
+
+/*
+ * Device-tree variant of the L2C-220 init data: the same hooks as
+ * l2c220_data plus an .of_parse callback.
+ */
+static const struct l2c_init_data of_l2c220_data __initconst = {
+	.type = "L2C-220",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.of_parse = l2x0_of_parse,
+	.enable = l2c220_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range   = l2c220_inv_range,
+		.clean_range = l2c220_clean_range,
+		.flush_range = l2c220_flush_range,
+		.flush_all   = l2c220_flush_all,
+		.disable     = l2c_disable,
+		.sync        = l2c220_sync,
+		.resume      = l2c210_resume,
+	},
+};
+
+/*
+ * Device-tree parser for L2C-310.  Unlike l2x0_of_parse() it programs the
+ * controller registers at l2x0_base directly and leaves @aux_val and
+ * @aux_mask untouched:
+ *
+ *  - "arm,tag-latency" and "arm,data-latency" (three cells each: read,
+ *    write, setup) are written, each minus one, to the tag/data latency
+ *    registers when all three cells are non-zero;
+ *  - "arm,filter-ranges" (start, size) programs the address filter when
+ *    the size is non-zero.
+ */
+static void __init l2c310_of_parse(const struct device_node *np,
+	u32 *aux_val, u32 *aux_mask)
+{
+	u32 data[3] = { 0, 0, 0 };
+	u32 tag[3] = { 0, 0, 0 };
+	u32 filter[2] = { 0, 0 };
+
+	of_property_read_u32_array(np, "arm,tag-latency", tag, ARRAY_SIZE(tag));
+	if (tag[0] && tag[1] && tag[2])
+		writel_relaxed(
+			L310_LATENCY_CTRL_RD(tag[0] - 1) |
+			L310_LATENCY_CTRL_WR(tag[1] - 1) |
+			L310_LATENCY_CTRL_SETUP(tag[2] - 1),
+			l2x0_base + L310_TAG_LATENCY_CTRL);
+
+	of_property_read_u32_array(np, "arm,data-latency",
+				   data, ARRAY_SIZE(data));
+	if (data[0] && data[1] && data[2])
+		writel_relaxed(
+			L310_LATENCY_CTRL_RD(data[0] - 1) |
+			L310_LATENCY_CTRL_WR(data[1] - 1) |
+			L310_LATENCY_CTRL_SETUP(data[2] - 1),
+			l2x0_base + L310_DATA_LATENCY_CTRL);
+
+	of_property_read_u32_array(np, "arm,filter-ranges",
+				   filter, ARRAY_SIZE(filter));
+	if (filter[1]) {
+		/* end is rounded up and start rounded down to a 1 MiB boundary */
+		writel_relaxed(ALIGN(filter[0] + filter[1], SZ_1M),
+			       l2x0_base + L310_ADDR_FILTER_END);
+		writel_relaxed((filter[0] & ~(SZ_1M - 1)) | L310_ADDR_FILTER_EN,
+			       l2x0_base + L310_ADDR_FILTER_START);
+	}
+}
+
+/*
+ * Device-tree variant of the L2C-310 init data: the same hooks as
+ * l2c310_init_fns plus an .of_parse callback.
+ */
+static const struct l2c_init_data of_l2c310_data __initconst = {
+	.type = "L2C-310",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.of_parse = l2c310_of_parse,
+	.enable = l2c310_enable,
+	.fixup = l2c310_fixup,
+	.save  = l2c310_save,
+	.outer_cache = {
+		.inv_range   = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c310_disable,
+		.sync        = l2c210_sync,
+		.resume      = l2c310_resume,
+	},
+};
+
+/*
+ * This is a variant of the of_l2c310_data with .sync set to
+ * NULL. Outer sync operations are not needed when the system is I/O
+ * coherent, and potentially harmful in certain situations (PCIe/PL310
+ * deadlock on Armada 375/38x due to hardware I/O coherency). The
+ * other operations are kept because they are infrequent (therefore do
+ * not cause the deadlock in practice) and needed for secondary CPU
+ * boot and other power management activities.
+ */
+static const struct l2c_init_data of_l2c310_coherent_data __initconst = {
+	.type = "L2C-310 Coherent",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.of_parse = l2c310_of_parse,
+	.enable = l2c310_enable,
+	.fixup = l2c310_fixup,
+	.save  = l2c310_save,
+	.outer_cache = {
+		.inv_range   = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c310_disable,
+		/* .sync is deliberately left unset (NULL) for this variant */
+		.resume      = l2c310_resume,
+	},
+};
+
+/*
+ * Note that the end addresses passed to Linux primitives are
+ * noninclusive, while the hardware cache range operations use
+ * inclusive start and end addresses.
+ */
+static unsigned long calc_range_end(unsigned long start, unsigned long end)
+{
+	/*
+	 * Cache range operations stall the CPU pipeline until they
+	 * complete, so cap the amount handled by a single operation.
+	 */
+	unsigned long size_limit = start + MAX_RANGE_SIZE;
+	/*
+	 * A single cache range operation can't straddle a page
+	 * boundary, so it must also stop at PAGE_ALIGN(start+1).
+	 */
+	unsigned long page_limit = PAGE_ALIGN(start+1);
+	unsigned long limit;
+
+	/* The tighter of the two limits wins; 'end' may be tighter still. */
+	limit = size_limit < page_limit ? size_limit : page_limit;
+	return end < limit ? end : limit;
+}
+
+/*
+ * Make sure 'start' and 'end' reference the same page, as L2 is PIPT
+ * and range operations only do a TLB lookup on the start address.
+ */
+static void aurora_pa_range(unsigned long start, unsigned long end,
+			unsigned long offset)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	writel_relaxed(start, l2x0_base + AURORA_RANGE_BASE_ADDR_REG);
+	writel_relaxed(end, l2x0_base + offset);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+
+	cache_sync();
+}
+
+static void aurora_inv_range(unsigned long start, unsigned long end)
+{
+	/*
+	 * round start down and end up to cache line boundaries
+	 */
+	start &= ~(CACHE_LINE_SIZE - 1);
+	end = ALIGN(end, CACHE_LINE_SIZE);
+
+	/*
+	 * Invalidate all full cache lines between 'start' and 'end'.
+	 */
+	while (start < end) {
+		unsigned long range_end = calc_range_end(start, end);
+		aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+				AURORA_INVAL_RANGE_REG);
+		start = range_end;
+	}
+}
+
+static void aurora_clean_range(unsigned long start, unsigned long end)
+{
+	/*
+	 * If L2 is forced to WT, the L2 will always be clean and we
+	 * don't need to do anything here.
+	 */
+	if (!l2_wt_override) {
+		start &= ~(CACHE_LINE_SIZE - 1);
+		end = ALIGN(end, CACHE_LINE_SIZE);
+		while (start != end) {
+			unsigned long range_end = calc_range_end(start, end);
+			aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+					AURORA_CLEAN_RANGE_REG);
+			start = range_end;
+		}
+	}
+}
+
+static void aurora_flush_range(unsigned long start, unsigned long end)
+{
+	start &= ~(CACHE_LINE_SIZE - 1);
+	end = ALIGN(end, CACHE_LINE_SIZE);
+	while (start != end) {
+		unsigned long range_end = calc_range_end(start, end);
+		/*
+		 * If L2 is forced to WT, the L2 will always be clean and we
+		 * just need to invalidate.
+		 */
+		if (l2_wt_override)
+			aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+							AURORA_INVAL_RANGE_REG);
+		else
+			aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+							AURORA_FLUSH_RANGE_REG);
+		start = range_end;
+	}
+}
+
+static void aurora_save(void __iomem *base)
+{
+	l2x0_saved_regs.ctrl = readl_relaxed(base + L2X0_CTRL);
+	l2x0_saved_regs.aux_ctrl = readl_relaxed(base + L2X0_AUX_CTRL);
+}
+
+static void aurora_resume(void)
+{
+	void __iomem *base = l2x0_base;
+
+	if (!(readl(base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		writel_relaxed(l2x0_saved_regs.aux_ctrl, base + L2X0_AUX_CTRL);
+		writel_relaxed(l2x0_saved_regs.ctrl, base + L2X0_CTRL);
+	}
+}
+
+/*
+ * For Aurora cache in no outer mode, enable via the CP15 coprocessor
+ * broadcasting of cache commands to L2.
+ */
+static void __init aurora_enable_no_outer(void __iomem *base, u32 aux,
+	unsigned num_lock)
+{
+	u32 u;
+
+	asm volatile("mrc p15, 1, %0, c15, c2, 0" : "=r" (u));
+	u |= AURORA_CTRL_FW;		/* Set the FW bit */
+	asm volatile("mcr p15, 1, %0, c15, c2, 0" : : "r" (u));
+
+	isb();
+
+	l2c_enable(base, aux, num_lock);
+}
+
+static void __init aurora_fixup(void __iomem *base, u32 cache_id,
+	struct outer_cache_fns *fns)
+{
+	sync_reg_offset = AURORA_SYNC_REG;
+}
+
+static void __init aurora_of_parse(const struct device_node *np,
+				u32 *aux_val, u32 *aux_mask)
+{
+	u32 val = AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU;
+	u32 mask =  AURORA_ACR_REPLACEMENT_MASK;
+
+	of_property_read_u32(np, "cache-id-part",
+			&cache_id_part_number_from_dt);
+
+	/* Determine and save the write policy */
+	l2_wt_override = of_property_read_bool(np, "wt-override");
+
+	if (l2_wt_override) {
+		val |= AURORA_ACR_FORCE_WRITE_THRO_POLICY;
+		mask |= AURORA_ACR_FORCE_WRITE_POLICY_MASK;
+	}
+
+	*aux_val &= ~mask;
+	*aux_val |= val;
+	*aux_mask &= ~mask;
+}
+
+static const struct l2c_init_data of_aurora_with_outer_data __initconst = {
+	.type = "Aurora",
+	.way_size_0 = SZ_4K,
+	.num_lock = 4,
+	.of_parse = aurora_of_parse,
+	.enable = l2c_enable,
+	.fixup = aurora_fixup,
+	.save  = aurora_save,
+	.outer_cache = {
+		.inv_range   = aurora_inv_range,
+		.clean_range = aurora_clean_range,
+		.flush_range = aurora_flush_range,
+		.flush_all   = l2x0_flush_all,
+		.disable     = l2x0_disable,
+		.sync        = l2x0_cache_sync,
+		.resume      = aurora_resume,
+	},
+};
+
+static const struct l2c_init_data of_aurora_no_outer_data __initconst = {
+	.type = "Aurora",
+	.way_size_0 = SZ_4K,
+	.num_lock = 4,
+	.of_parse = aurora_of_parse,
+	.enable = aurora_enable_no_outer,
+	.fixup = aurora_fixup,
+	.save  = aurora_save,
+	.outer_cache = {
+		.resume      = aurora_resume,
+	},
+};
+
+/*
+ * For certain Broadcom SoCs, depending on the address range, different offsets
+ * need to be added to the address before passing it to L2 for
+ * invalidation/clean/flush
+ *
+ * Section Address Range              Offset        EMI
+ *   1     0x00000000 - 0x3FFFFFFF    0x80000000    VC
+ *   2     0x40000000 - 0xBFFFFFFF    0x40000000    SYS
+ *   3     0xC0000000 - 0xFFFFFFFF    0x80000000    VC
+ *
+ * When the start and end addresses have crossed two different sections, we
+ * need to break the L2 operation into two, each within its own section.
+ * For example, to invalidate the range that starts at 0xBFFF0000 and ends
+ * at 0xC0001000, we need to invalidate 1) 0xBFFF0000 - 0xBFFFFFFF and 2)
+ * 0xC0000000 - 0xC0001000
+ *
+ * Note 1:
+ * By breaking a single L2 operation into two, we may potentially suffer some
+ * performance hit, but keep in mind the cross section case is very rare
+ *
+ * Note 2:
+ * We do not need to handle the case when the start address is in
+ * Section 1 and the end address is in Section 3, since it is not a valid use
+ * case
+ *
+ * Note 3:
+ * Section 1 in practical terms can no longer be used on rev A2. Because of
+ * that the code does not need to handle section 1 at all.
+ *
+ */
+#define BCM_SYS_EMI_START_ADDR        0x40000000UL
+#define BCM_VC_EMI_SEC3_START_ADDR    0xC0000000UL
+
+#define BCM_SYS_EMI_OFFSET            0x40000000UL
+#define BCM_VC_EMI_OFFSET             0x80000000UL
+
+static inline int bcm_addr_is_sys_emi(unsigned long addr)
+{
+	return (addr >= BCM_SYS_EMI_START_ADDR) &&
+		(addr < BCM_VC_EMI_SEC3_START_ADDR);
+}
+
+static inline unsigned long bcm_l2_phys_addr(unsigned long addr)
+{
+	/* The offset to apply depends on the EMI section holding 'addr' */
+	if (!bcm_addr_is_sys_emi(addr))
+		return addr + BCM_VC_EMI_OFFSET;
+	return addr + BCM_SYS_EMI_OFFSET;
+}
+
+static void bcm_inv_range(unsigned long start, unsigned long end)
+{
+	unsigned long new_start, new_end;
+
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
+
+	if (unlikely(end <= start))
+		return;
+
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2c210_inv_range(new_start, new_end);
+		return;
+	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2c210_inv_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2c210_inv_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
+}
+
+static void bcm_clean_range(unsigned long start, unsigned long end)
+{
+	unsigned long new_start, new_end;
+
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
+
+	if (unlikely(end <= start))
+		return;
+
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2c210_clean_range(new_start, new_end);
+		return;
+	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2c210_clean_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2c210_clean_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
+}
+
+static void bcm_flush_range(unsigned long start, unsigned long end)
+{
+	unsigned long new_start, new_end;
+
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
+
+	if (unlikely(end <= start))
+		return;
+
+	if ((end - start) >= l2x0_size) {
+		outer_cache.flush_all();
+		return;
+	}
+
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2c210_flush_range(new_start, new_end);
+		return;
+	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2c210_flush_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2c210_flush_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
+}
+
+/* Broadcom L2C-310 start from ARMs R3P2 or later, and require no fixups */
+static const struct l2c_init_data of_bcm_l2x0_data __initconst = {
+	.type = "BCM-L2C-310",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.of_parse = l2c310_of_parse,
+	.enable = l2c310_enable,
+	.save  = l2c310_save,
+	.outer_cache = {
+		.inv_range   = bcm_inv_range,
+		.clean_range = bcm_clean_range,
+		.flush_range = bcm_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c310_disable,
+		.sync        = l2c210_sync,
+		.resume      = l2c310_resume,
+	},
+};
+
+static void __init tauros3_save(void __iomem *base)
+{
+	l2c_save(base);
+
+	l2x0_saved_regs.aux2_ctrl =
+		readl_relaxed(base + TAUROS3_AUX2_CTRL);
+	l2x0_saved_regs.prefetch_ctrl =
+		readl_relaxed(base + L310_PREFETCH_CTRL);
+}
+
+static void tauros3_resume(void)
+{
+	void __iomem *base = l2x0_base;
+
+	if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		writel_relaxed(l2x0_saved_regs.aux2_ctrl,
+			       base + TAUROS3_AUX2_CTRL);
+		writel_relaxed(l2x0_saved_regs.prefetch_ctrl,
+			       base + L310_PREFETCH_CTRL);
+
+		l2c_enable(base, l2x0_saved_regs.aux_ctrl, 8);
+	}
+}
+
+static const struct l2c_init_data of_tauros3_data __initconst = {
+	.type = "Tauros3",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.enable = l2c_enable,
+	.save  = tauros3_save,
+	/* Tauros3 broadcasts L1 cache operations to L2 */
+	.outer_cache = {
+		.resume      = tauros3_resume,
+	},
+};
+
+#define L2C_ID(name, fns) { .compatible = name, .data = (void *)&fns }
+static const struct of_device_id l2x0_ids[] __initconst = {
+	L2C_ID("arm,l210-cache", of_l2c210_data),
+	L2C_ID("arm,l220-cache", of_l2c220_data),
+	L2C_ID("arm,pl310-cache", of_l2c310_data),
+	L2C_ID("brcm,bcm11351-a2-pl310-cache", of_bcm_l2x0_data),
+	L2C_ID("marvell,aurora-outer-cache", of_aurora_with_outer_data),
+	L2C_ID("marvell,aurora-system-cache", of_aurora_no_outer_data),
+	L2C_ID("marvell,tauros3-cache", of_tauros3_data),
+	/* Deprecated IDs */
+	L2C_ID("bcm,bcm11351-a2-pl310-cache", of_bcm_l2x0_data),
+	{}
+};
+
+int __init l2x0_of_init(u32 aux_val, u32 aux_mask)
+{
+	const struct l2c_init_data *data;
+	struct device_node *np;
+	struct resource res;
+	u32 cache_id, old_aux;
+
+	np = of_find_matching_node(NULL, l2x0_ids);
+	if (!np)
+		return -ENODEV;
+
+	if (of_address_to_resource(np, 0, &res))
+		return -ENODEV;
+
+	l2x0_base = ioremap(res.start, resource_size(&res));
+	if (!l2x0_base)
+		return -ENOMEM;
+
+	l2x0_saved_regs.phy_base = res.start;
+
+	data = of_match_node(l2x0_ids, np)->data;
+
+	if (of_device_is_compatible(np, "arm,pl310-cache") &&
+	    of_property_read_bool(np, "arm,io-coherent"))
+		data = &of_l2c310_coherent_data;
+
+	old_aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+	if (old_aux != ((old_aux & aux_mask) | aux_val)) {
+		pr_warn("L2C: platform modifies aux control register: 0x%08x -> 0x%08x\n",
+		        old_aux, (old_aux & aux_mask) | aux_val);
+	} else if (aux_mask != ~0U && aux_val != 0) {
+		pr_alert("L2C: platform provided aux values match the hardware, so have no effect.  Please remove them.\n");
+	}
+
+	/* All L2 caches are unified, so this property should be specified */
+	if (!of_property_read_bool(np, "cache-unified"))
+		pr_err("L2C: device tree omits to specify unified cache\n");
+
+	/* L2 configuration can only be changed if the cache is disabled */
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN))
+		if (data->of_parse)
+			data->of_parse(np, &aux_val, &aux_mask);
 
-	/* enable L2X0 */
-	writel(1, l2x0_base + L2X0_CTRL);
+	if (cache_id_part_number_from_dt)
+		cache_id = cache_id_part_number_from_dt;
+	else
+		cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
 
-	outer_cache.inv_range = l2x0_inv_range;
-	outer_cache.clean_range = l2x0_clean_range;
-	outer_cache.flush_range = l2x0_flush_range;
+	__l2c_init(data, aux_val, aux_mask, cache_id);
 
-	printk(KERN_INFO "L2X0 cache controller enabled\n");
+	return 0;
 }
+#endif
diff --git a/arch/arm/mm/cache-nop.S b/arch/arm/mm/cache-nop.S
new file mode 100644
index 00000000000..8e12ddca003
--- /dev/null
+++ b/arch/arm/mm/cache-nop.S
@@ -0,0 +1,50 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+
+#include "proc-macros.S"
+
+ENTRY(nop_flush_icache_all)
+	mov	pc, lr
+ENDPROC(nop_flush_icache_all)
+
+	.globl nop_flush_kern_cache_all
+	.equ nop_flush_kern_cache_all, nop_flush_icache_all
+
+	.globl nop_flush_kern_cache_louis
+	.equ nop_flush_kern_cache_louis, nop_flush_icache_all
+
+	.globl nop_flush_user_cache_all
+	.equ nop_flush_user_cache_all, nop_flush_icache_all
+
+	.globl nop_flush_user_cache_range
+	.equ nop_flush_user_cache_range, nop_flush_icache_all
+
+	.globl nop_coherent_kern_range
+	.equ nop_coherent_kern_range, nop_flush_icache_all
+
+ENTRY(nop_coherent_user_range)
+	mov	r0, #0				@ return value: 0
+	mov	pc, lr
+ENDPROC(nop_coherent_user_range)
+
+	.globl nop_flush_kern_dcache_area
+	.equ nop_flush_kern_dcache_area, nop_flush_icache_all
+
+	.globl nop_dma_flush_range
+	.equ nop_dma_flush_range, nop_flush_icache_all
+
+	.globl nop_dma_map_area
+	.equ nop_dma_map_area, nop_flush_icache_all
+
+	.globl nop_dma_unmap_area
+	.equ nop_dma_unmap_area, nop_flush_icache_all
+
+	__INITDATA
+
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions nop
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
new file mode 100644
index 00000000000..b273739e635
--- /dev/null
+++ b/arch/arm/mm/cache-tauros2.c
@@ -0,0 +1,302 @@
+/*
+ * arch/arm/mm/cache-tauros2.c - Tauros2 L2 cache controller support
+ *
+ * Copyright (C) 2008 Marvell Semiconductor
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ * References:
+ * - PJ1 CPU Core Datasheet,
+ *   Document ID MV-S104837-01, Rev 0.7, January 24 2008.
+ * - PJ4 CPU Core Datasheet,
+ *   Document ID MV-S105190-00, Rev 0.7, March 14 2008.
+ */
+
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <asm/cacheflush.h>
+#include <asm/cp15.h>
+#include <asm/cputype.h>
+#include <asm/hardware/cache-tauros2.h>
+
+
+/*
+ * When Tauros2 is used on a CPU that supports the v7 hierarchical
+ * cache operations, the cache handling code in proc-v7.S takes care
+ * of everything, including handling DMA coherency.
+ *
+ * So, we only need to register outer cache operations here if we're
+ * being used on a pre-v7 CPU, and we only need to build support for
+ * outer cache operations into the kernel image if the kernel has been
+ * configured to support a pre-v7 CPU.
+ */
+#ifdef CONFIG_CPU_32v5
+/*
+ * Low-level cache maintenance operations.
+ */
+static inline void tauros2_clean_pa(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c7, c11, 3" : : "r" (addr));
+}
+
+static inline void tauros2_clean_inv_pa(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c7, c15, 3" : : "r" (addr));
+}
+
+static inline void tauros2_inv_pa(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c7, c7, 3" : : "r" (addr));
+}
+
+
+/*
+ * Linux primitives.
+ *
+ * Note that the end addresses passed to Linux primitives are
+ * noninclusive.
+ */
+#define CACHE_LINE_SIZE		32
+
+static void tauros2_inv_range(unsigned long start, unsigned long end)
+{
+	/*
+	 * Clean and invalidate partial first cache line.
+	 */
+	if (start & (CACHE_LINE_SIZE - 1)) {
+		tauros2_clean_inv_pa(start & ~(CACHE_LINE_SIZE - 1));
+		start = (start | (CACHE_LINE_SIZE - 1)) + 1;
+	}
+
+	/*
+	 * Clean and invalidate partial last cache line.
+	 */
+	if (end & (CACHE_LINE_SIZE - 1)) {
+		tauros2_clean_inv_pa(end & ~(CACHE_LINE_SIZE - 1));
+		end &= ~(CACHE_LINE_SIZE - 1);
+	}
+
+	/*
+	 * Invalidate all full cache lines between 'start' and 'end'.
+	 */
+	while (start < end) {
+		tauros2_inv_pa(start);
+		start += CACHE_LINE_SIZE;
+	}
+
+	dsb();
+}
+
+static void tauros2_clean_range(unsigned long start, unsigned long end)
+{
+	unsigned long pa = start & ~(CACHE_LINE_SIZE - 1);
+
+	/* Clean every cache line that overlaps [start, end). */
+	for (; pa < end; pa += CACHE_LINE_SIZE)
+		tauros2_clean_pa(pa);
+
+	dsb();
+}
+
+static void tauros2_flush_range(unsigned long start, unsigned long end)
+{
+	unsigned long pa = start & ~(CACHE_LINE_SIZE - 1);
+
+	/* Clean and invalidate every cache line that overlaps [start, end). */
+	for (; pa < end; pa += CACHE_LINE_SIZE)
+		tauros2_clean_inv_pa(pa);
+
+	dsb();
+}
+
+static void tauros2_disable(void)
+{
+	__asm__ __volatile__ (
+	"mcr	p15, 1, %0, c7, c11, 0 @L2 Cache Clean All\n\t"
+	"mrc	p15, 0, %0, c1, c0, 0\n\t"
+	"bic	%0, %0, #(1 << 26)\n\t"
+	"mcr	p15, 0, %0, c1, c0, 0  @Disable L2 Cache\n\t"
+	: : "r" (0x0));
+}
+
+static void tauros2_resume(void)
+{
+	__asm__ __volatile__ (
+	"mcr	p15, 1, %0, c7, c7, 0 @L2 Cache Invalidate All\n\t"
+	"mrc	p15, 0, %0, c1, c0, 0\n\t"
+	"orr	%0, %0, #(1 << 26)\n\t"
+	"mcr	p15, 0, %0, c1, c0, 0 @Enable L2 Cache\n\t"
+	: : "r" (0x0));
+}
+#endif
+
+static inline u32 __init read_extra_features(void)
+{
+	u32 u;
+
+	__asm__("mrc p15, 1, %0, c15, c1, 0" : "=r" (u));
+
+	return u;
+}
+
+static inline void __init write_extra_features(u32 u)
+{
+	__asm__("mcr p15, 1, %0, c15, c1, 0" : : "r" (u));
+}
+
+static inline int __init cpuid_scheme(void)
+{
+	return !!((processor_id & 0x000f0000) == 0x000f0000);
+}
+
+static inline u32 __init read_mmfr3(void)
+{
+	u32 mmfr3;
+
+	__asm__("mrc p15, 0, %0, c0, c1, 7\n" : "=r" (mmfr3));
+
+	return mmfr3;
+}
+
+static inline u32 __init read_actlr(void)
+{
+	u32 actlr;
+
+	__asm__("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr));
+
+	return actlr;
+}
+
+static inline void __init write_actlr(u32 actlr)
+{
+	__asm__("mcr p15, 0, %0, c1, c0, 1\n" : : "r" (actlr));
+}
+
+/* Set or clear the L2 prefetch and line fill burst8 bits of the CPU Extra
+ * Features register according to the CACHE_TAUROS2_* flags in 'features'. */
+static void __init enable_extra_feature(unsigned int features)
+{
+	u32 u = read_extra_features();
+
+	if (features & CACHE_TAUROS2_PREFETCH_ON)
+		u &= ~0x01000000;	/* prefetch is on when the bit is clear */
+	else
+		u |= 0x01000000;
+	printk(KERN_INFO "Tauros2: %s L2 prefetch.\n",
+			(features & CACHE_TAUROS2_PREFETCH_ON)
+			? "Enabling" : "Disabling");
+
+	if (features & CACHE_TAUROS2_LINEFILL_BURST8)
+		u |= 0x00100000;
+	else
+		u &= ~0x00100000;
+	printk(KERN_INFO "Tauros2: %s line fill burst8.\n",
+			(features & CACHE_TAUROS2_LINEFILL_BURST8)
+			? "Enabling" : "Disabling");
+
+	write_extra_features(u);
+}
+
+static void __init tauros2_internal_init(unsigned int features)
+{
+	char *mode = NULL;
+
+	enable_extra_feature(features);
+
+#ifdef CONFIG_CPU_32v5
+	if ((processor_id & 0xff0f0000) == 0x56050000) {
+		u32 feat;
+
+		/*
+		 * v5 CPUs with Tauros2 have the L2 cache enable bit
+		 * located in the CPU Extra Features register.
+		 */
+		feat = read_extra_features();
+		if (!(feat & 0x00400000)) {
+			printk(KERN_INFO "Tauros2: Enabling L2 cache.\n");
+			write_extra_features(feat | 0x00400000);
+		}
+
+		mode = "ARMv5";
+		outer_cache.inv_range = tauros2_inv_range;
+		outer_cache.clean_range = tauros2_clean_range;
+		outer_cache.flush_range = tauros2_flush_range;
+		outer_cache.disable = tauros2_disable;
+		outer_cache.resume = tauros2_resume;
+	}
+#endif
+
+#ifdef CONFIG_CPU_32v7
+	/*
+	 * Check whether this CPU has support for the v7 hierarchical
+	 * cache ops.  (PJ4 is in its v7 personality mode if the MMFR3
+	 * register indicates support for the v7 hierarchical cache
+	 * ops.)
+	 *
+	 * (Although strictly speaking there may exist CPUs that
+	 * implement the v7 cache ops but are only ARMv6 CPUs (due to
+	 * not complying with all of the other ARMv7 requirements),
+	 * there are no real-life examples of Tauros2 being used on
+	 * such CPUs as of yet.)
+	 */
+	if (cpuid_scheme() && (read_mmfr3() & 0xf) == 1) {
+		u32 actlr;
+
+		/*
+		 * When Tauros2 is used in an ARMv7 system, the L2
+		 * enable bit is located in the Auxiliary System Control
+		 * Register (which is the only register allowed by the
+		 * ARMv7 spec to contain fine-grained cache control bits).
+		 */
+		actlr = read_actlr();
+		if (!(actlr & 0x00000002)) {
+			printk(KERN_INFO "Tauros2: Enabling L2 cache.\n");
+			write_actlr(actlr | 0x00000002);
+		}
+
+		mode = "ARMv7";
+	}
+#endif
+
+	if (mode == NULL) {
+		printk(KERN_CRIT "Tauros2: Unable to detect CPU mode.\n");
+		return;
+	}
+
+	printk(KERN_INFO "Tauros2: L2 cache support initialised "
+			 "in %s mode.\n", mode);
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id tauros2_ids[] __initconst = {
+	{ .compatible = "marvell,tauros2-cache"},
+	{}
+};
+#endif
+
+/* With CONFIG_OF, take 'features' from the device tree; then set up L2. */
+void __init tauros2_init(unsigned int features)
+{
+#ifdef CONFIG_OF
+	struct device_node *node;
+	unsigned int f;
+
+	node = of_find_matching_node(NULL, tauros2_ids);
+	if (!node) {
+		pr_info("Not found marvell,tauros2-cache, disable it\n");
+		return;
+	}
+
+	if (of_property_read_u32(node, "marvell,tauros2-cache-features", &f)) {
+		pr_info("Not found marvell,tauros2-cache-features property, "
+			"disable extra features\n");
+		features = 0;
+	} else
+		features = f;
+	of_node_put(node);	/* drop the reference taken by the lookup */
+#endif
+	tauros2_internal_init(features);
+}
diff --git a/arch/arm/mm/cache-tauros3.h b/arch/arm/mm/cache-tauros3.h
new file mode 100644
index 00000000000..02c0a97cbc0
--- /dev/null
+++ b/arch/arm/mm/cache-tauros3.h
@@ -0,0 +1,41 @@
+/*
+ * Marvell Tauros3 cache controller includes
+ *
+ * Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
+ *
+ * based on GPL'ed 2.6 kernel sources
+ *  (c) Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __ASM_ARM_HARDWARE_TAUROS3_H
+#define __ASM_ARM_HARDWARE_TAUROS3_H
+
+/*
+ * Marvell Tauros3 L2CC is compatible with PL310 r0p0
+ * but with PREFETCH_CTRL (r2p0) and an additional event counter.
+ * Also, there is AUX2_CTRL for some Marvell specific control.
+ */
+
+#define TAUROS3_EVENT_CNT2_CFG		0x224
+#define TAUROS3_EVENT_CNT2_VAL		0x228
+#define TAUROS3_INV_ALL			0x780
+#define TAUROS3_CLEAN_ALL		0x784
+#define TAUROS3_AUX2_CTRL		0x820
+
+/* Registers shifts and masks */
+#define TAUROS3_AUX2_CTRL_LINEFILL_BURST8_EN	(1 << 2)
+
+#endif
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
deleted file mode 100644
index e1994788cf0..00000000000
--- a/arch/arm/mm/cache-v3.S
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- *  linux/arch/arm/mm/cache-v3.S
- *
- *  Copyright (C) 1997-2002 Russell king
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/hardware.h>
-#include <asm/page.h>
-#include "proc-macros.S"
-
-/*
- *	flush_user_cache_all()
- *
- *	Invalidate all cache entries in a particular address
- *	space.
- *
- *	- mm	- mm_struct describing address space
- */
-ENTRY(v3_flush_user_cache_all)
-	/* FALLTHROUGH */
-/*
- *	flush_kern_cache_all()
- *
- *	Clean and invalidate the entire cache.
- */
-ENTRY(v3_flush_kern_cache_all)
-	/* FALLTHROUGH */
-
-/*
- *	flush_user_cache_range(start, end, flags)
- *
- *	Invalidate a range of cache entries in the specified
- *	address space.
- *
- *	- start - start address (may not be aligned)
- *	- end	- end address (exclusive, may not be aligned)
- *	- flags	- vma_area_struct flags describing address space
- */
-ENTRY(v3_flush_user_cache_range)
-	mov	ip, #0
-	mcreq	p15, 0, ip, c7, c0, 0		@ flush ID cache
-	mov	pc, lr
-
-/*
- *	coherent_kern_range(start, end)
- *
- *	Ensure coherency between the Icache and the Dcache in the
- *	region described by start.  If you have non-snooping
- *	Harvard caches, you need to implement this function.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_coherent_kern_range)
-	/* FALLTHROUGH */
-
-/*
- *	coherent_user_range(start, end)
- *
- *	Ensure coherency between the Icache and the Dcache in the
- *	region described by start.  If you have non-snooping
- *	Harvard caches, you need to implement this function.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_coherent_user_range)
-	mov	pc, lr
-
-/*
- *	flush_kern_dcache_page(void *page)
- *
- *	Ensure no D cache aliasing occurs, either with itself or
- *	the I cache
- *
- *	- addr	- page aligned address
- */
-ENTRY(v3_flush_kern_dcache_page)
-	/* FALLTHROUGH */
-
-/*
- *	dma_inv_range(start, end)
- *
- *	Invalidate (discard) the specified virtual address range.
- *	May not write back any entries.  If 'start' or 'end'
- *	are not cache line aligned, those lines must be written
- *	back.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_dma_inv_range)
-	/* FALLTHROUGH */
-
-/*
- *	dma_flush_range(start, end)
- *
- *	Clean and invalidate the specified virtual address range.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_dma_flush_range)
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c0, 0		@ flush ID cache
-	/* FALLTHROUGH */
-
-/*
- *	dma_clean_range(start, end)
- *
- *	Clean (write back) the specified virtual address range.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_dma_clean_range)
-	mov	pc, lr
-
-	__INITDATA
-
-	.type	v3_cache_fns, #object
-ENTRY(v3_cache_fns)
-	.long	v3_flush_kern_cache_all
-	.long	v3_flush_user_cache_all
-	.long	v3_flush_user_cache_range
-	.long	v3_coherent_kern_range
-	.long	v3_coherent_user_range
-	.long	v3_flush_kern_dcache_page
-	.long	v3_dma_inv_range
-	.long	v3_dma_clean_range
-	.long	v3_dma_flush_range
-	.size	v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index b2908063ed6..a7ba68f59f0 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -9,11 +9,19 @@
  */
 #include <linux/linkage.h>
 #include <linux/init.h>
-#include <asm/hardware.h>
 #include <asm/page.h>
 #include "proc-macros.S"
 
 /*
+ *	flush_icache_all()
+ *
+ *	Unconditionally clean and invalidate the entire icache.
+ */
+ENTRY(v4_flush_icache_all)
+	mov	pc, lr
+ENDPROC(v4_flush_icache_all)
+
+/*
  *	flush_user_cache_all()
  *
  *	Invalidate all cache entries in a particular address
@@ -29,7 +37,7 @@ ENTRY(v4_flush_user_cache_all)
  *	Clean and invalidate the entire cache.
  */
 ENTRY(v4_flush_kern_cache_all)
-#ifdef CPU_CP15
+#ifdef CONFIG_CPU_CP15
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c7, 0		@ flush ID cache
 	mov	pc, lr
@@ -48,9 +56,9 @@ ENTRY(v4_flush_kern_cache_all)
  *	- flags	- vma_area_struct flags describing address space
  */
 ENTRY(v4_flush_user_cache_range)
-#ifdef CPU_CP15
+#ifdef CONFIG_CPU_CP15
 	mov	ip, #0
-	mcreq	p15, 0, ip, c7, c7, 0		@ flush ID cache
+	mcr	p15, 0, ip, c7, c7, 0		@ flush ID cache
 	mov	pc, lr
 #else
 	/* FALLTHROUGH */
@@ -80,31 +88,19 @@ ENTRY(v4_coherent_kern_range)
  *	- end	 - virtual end address
  */
 ENTRY(v4_coherent_user_range)
+	mov	r0, #0
 	mov	pc, lr
 
 /*
- *	flush_kern_dcache_page(void *page)
+ *	flush_kern_dcache_area(void *addr, size_t size)
  *
  *	Ensure no D cache aliasing occurs, either with itself or
  *	the I cache
  *
- *	- addr	- page aligned address
+ *	- addr	- kernel address
+ *	- size	- region size
  */
-ENTRY(v4_flush_kern_dcache_page)
-	/* FALLTHROUGH */
-
-/*
- *	dma_inv_range(start, end)
- *
- *	Invalidate (discard) the specified virtual address range.
- *	May not write back any entries.  If 'start' or 'end'
- *	are not cache line aligned, those lines must be written
- *	back.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v4_dma_inv_range)
+ENTRY(v4_flush_kern_dcache_area)
 	/* FALLTHROUGH */
 
 /*
@@ -116,34 +112,38 @@ ENTRY(v4_dma_inv_range)
  *	- end	 - virtual end address
  */
 ENTRY(v4_dma_flush_range)
-#ifdef CPU_CP15
+#ifdef CONFIG_CPU_CP15
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c7, 0		@ flush ID cache
 #endif
+	mov	pc, lr
+
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v4_dma_unmap_area)
+	teq	r2, #DMA_TO_DEVICE
+	bne	v4_dma_flush_range
 	/* FALLTHROUGH */
 
 /*
- *	dma_clean_range(start, end)
- *
- *	Clean (write back) the specified virtual address range.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
  */
-ENTRY(v4_dma_clean_range)
+ENTRY(v4_dma_map_area)
 	mov	pc, lr
+ENDPROC(v4_dma_unmap_area)
+ENDPROC(v4_dma_map_area)
+
+	.globl	v4_flush_kern_cache_louis
+	.equ	v4_flush_kern_cache_louis, v4_flush_kern_cache_all
 
 	__INITDATA
 
-	.type	v4_cache_fns, #object
-ENTRY(v4_cache_fns)
-	.long	v4_flush_kern_cache_all
-	.long	v4_flush_user_cache_all
-	.long	v4_flush_user_cache_range
-	.long	v4_coherent_kern_range
-	.long	v4_coherent_user_range
-	.long	v4_flush_kern_dcache_page
-	.long	v4_dma_inv_range
-	.long	v4_dma_clean_range
-	.long	v4_dma_flush_range
-	.size	v4_cache_fns, . - v4_cache_fns
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions v4
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index 2ebc1b3bf85..cd494532140 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -32,7 +32,7 @@
 /*
  * This is the size at which it becomes more efficient to
  * clean the whole cache, rather than using the individual
- * cache line maintainence instructions.
+ * cache line maintenance instructions.
  *
  *  Size  Clean (ticks) Dirty (ticks)
  *   4096   21  20  21    53  55  54
@@ -51,6 +51,17 @@ flush_base:
 	.text
 
 /*
+ *	flush_icache_all()
+ *
+ *	Unconditionally clean and invalidate the entire icache.
+ */
+ENTRY(v4wb_flush_icache_all)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
+	mov	pc, lr
+ENDPROC(v4wb_flush_icache_all)
+
+/*
  *	flush_user_cache_all()
  *
  *	Clean and invalidate all cache entries in a particular address
@@ -114,15 +125,16 @@ ENTRY(v4wb_flush_user_cache_range)
 	mov	pc, lr
 
 /*
- *	flush_kern_dcache_page(void *page)
+ *	flush_kern_dcache_area(void *addr, size_t size)
  *
  *	Ensure no D cache aliasing occurs, either with itself or
  *	the I cache
  *
- *	- addr	- page aligned address
+ *	- addr	- kernel address
+ *	- size	- region size
  */
-ENTRY(v4wb_flush_kern_dcache_page)
-	add	r1, r0, #PAGE_SZ
+ENTRY(v4wb_flush_kern_dcache_area)
+	add	r1, r0, r1
 	/* fall through */
 
 /*
@@ -155,9 +167,9 @@ ENTRY(v4wb_coherent_user_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	ip, #0
-	mcr	p15, 0, ip, c7, c5, 0		@ invalidate I cache
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 
@@ -172,7 +184,7 @@ ENTRY(v4wb_coherent_user_range)
  *	- start  - virtual start address
  *	- end	 - virtual end address
  */
-ENTRY(v4wb_dma_inv_range)
+v4wb_dma_inv_range:
 	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
@@ -193,7 +205,7 @@ ENTRY(v4wb_dma_inv_range)
  *	- start  - virtual start address
  *	- end	 - virtual end address
  */
-ENTRY(v4wb_dma_clean_range)
+v4wb_dma_clean_range:
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
 	add	r0, r0, #CACHE_DLINESIZE
@@ -215,17 +227,34 @@ ENTRY(v4wb_dma_clean_range)
 	.globl	v4wb_dma_flush_range
 	.set	v4wb_dma_flush_range, v4wb_coherent_kern_range
 
+/*
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v4wb_dma_map_area)
+	add	r1, r1, r0			@ r1 = end address (start + size)
+	cmp	r2, #DMA_TO_DEVICE		@ dispatch on DMA direction
+	beq	v4wb_dma_clean_range		@ dir == DMA_TO_DEVICE
+	bcs	v4wb_dma_inv_range		@ dir > DMA_TO_DEVICE (unsigned compare)
+	b	v4wb_dma_flush_range		@ dir < DMA_TO_DEVICE
+ENDPROC(v4wb_dma_map_area)
+
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v4wb_dma_unmap_area)
+	mov	pc, lr				@ no cache maintenance on unmap; just return
+ENDPROC(v4wb_dma_unmap_area)
+
+	.globl	v4wb_flush_kern_cache_louis
+	.equ	v4wb_flush_kern_cache_louis, v4wb_flush_kern_cache_all
+
 	__INITDATA
 
-	.type	v4wb_cache_fns, #object
-ENTRY(v4wb_cache_fns)
-	.long	v4wb_flush_kern_cache_all
-	.long	v4wb_flush_user_cache_all
-	.long	v4wb_flush_user_cache_range
-	.long	v4wb_coherent_kern_range
-	.long	v4wb_coherent_user_range
-	.long	v4wb_flush_kern_dcache_page
-	.long	v4wb_dma_inv_range
-	.long	v4wb_dma_clean_range
-	.long	v4wb_dma_flush_range
-	.size	v4wb_cache_fns, . - v4wb_cache_fns
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions v4wb
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 9bcabd86c6f..11e5e5838bc 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -13,7 +13,6 @@
  */
 #include <linux/linkage.h>
 #include <linux/init.h>
-#include <asm/hardware.h>
 #include <asm/page.h>
 #include "proc-macros.S"
 
@@ -35,13 +34,24 @@
 /*
  * This is the size at which it becomes more efficient to
  * clean the whole cache, rather than using the individual
- * cache line maintainence instructions.
+ * cache line maintenance instructions.
  *
  * *** This needs benchmarking
  */
 #define CACHE_DLIMIT	16384
 
 /*
+ *	flush_icache_all()
+ *
+ *	Unconditionally invalidate the entire icache.
+ */
+ENTRY(v4wt_flush_icache_all)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
+	mov	pc, lr
+ENDPROC(v4wt_flush_icache_all)
+
+/*
  *	flush_user_cache_all()
  *
  *	Invalidate all cache entries in a particular address
@@ -115,20 +125,22 @@ ENTRY(v4wt_coherent_user_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
+	mov	r0, #0
 	mov	pc, lr
 
 /*
- *	flush_kern_dcache_page(void *page)
+ *	flush_kern_dcache_area(void *addr, size_t size)
  *
  *	Ensure no D cache aliasing occurs, either with itself or
  *	the I cache
  *
- *	- addr	- page aligned address
+ *	- addr	- kernel address
+ *	- size	- region size
  */
-ENTRY(v4wt_flush_kern_dcache_page)
+ENTRY(v4wt_flush_kern_dcache_area)
 	mov	r2, #0
 	mcr	p15, 0, r2, c7, c5, 0		@ invalidate I cache
-	add	r1, r0, #PAGE_SZ
+	add	r1, r0, r1
 	/* fallthrough */
 
 /*
@@ -142,23 +154,12 @@ ENTRY(v4wt_flush_kern_dcache_page)
  *	- start  - virtual start address
  *	- end	 - virtual end address
  */
-ENTRY(v4wt_dma_inv_range)
+v4wt_dma_inv_range:
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	/* FALLTHROUGH */
-
-/*
- *	dma_clean_range(start, end)
- *
- *	Clean the specified virtual address range.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v4wt_dma_clean_range)
 	mov	pc, lr
 
 /*
@@ -172,17 +173,33 @@ ENTRY(v4wt_dma_clean_range)
 	.globl	v4wt_dma_flush_range
 	.equ	v4wt_dma_flush_range, v4wt_dma_inv_range
 
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v4wt_dma_unmap_area)
+	add	r1, r1, r0			@ r1 = end address (start + size)
+	teq	r2, #DMA_TO_DEVICE
+	bne	v4wt_dma_inv_range		@ any direction but TO_DEVICE: invalidate range
+	/* FALLTHROUGH */
+
+/*
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v4wt_dma_map_area)
+	mov	pc, lr				@ nothing to do here; just return
+ENDPROC(v4wt_dma_unmap_area)
+ENDPROC(v4wt_dma_map_area)
+
+	.globl	v4wt_flush_kern_cache_louis
+	.equ	v4wt_flush_kern_cache_louis, v4wt_flush_kern_cache_all
+
 	__INITDATA
 
-	.type	v4wt_cache_fns, #object
-ENTRY(v4wt_cache_fns)
-	.long	v4wt_flush_kern_cache_all
-	.long	v4wt_flush_user_cache_all
-	.long	v4wt_flush_user_cache_range
-	.long	v4wt_coherent_kern_range
-	.long	v4wt_coherent_user_range
-	.long	v4wt_flush_kern_dcache_page
-	.long	v4wt_dma_inv_range
-	.long	v4wt_dma_clean_range
-	.long	v4wt_dma_flush_range
-	.size	v4wt_cache_fns, . - v4wt_cache_fns
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions v4wt
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 2c6c2a7c05a..d8fd4d4bd3d 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -12,6 +12,8 @@
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/unwind.h>
 
 #include "proc-macros.S"
 
@@ -21,6 +23,38 @@
 #define BTB_FLUSH_SIZE		8
 
 /*
+ *	v6_flush_icache_all()
+ *
+ *	Flush the whole I-cache.
+ *
+ *	ARM1136 erratum 411920 - Invalidate Instruction Cache operation can fail.
+ *	This erratum is present in 1136, 1156 and 1176. It does not affect the
+ *	MPCore.
+ *
+ *	Registers:
+ *	r0 - set to 0
+ *	r1 - corrupted
+ */
+ENTRY(v6_flush_icache_all)
+	mov	r0, #0				@ r0 = 0, operand of the mcr(s) below
+#ifdef CONFIG_ARM_ERRATA_411920
+	mrs	r1, cpsr			@ save CPSR for the restore below
+	cpsid	ifa				@ disable interrupts
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate entire I-cache
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate entire I-cache
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate entire I-cache
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate entire I-cache
+	msr	cpsr_cx, r1			@ restore interrupts
+	.rept	11				@ ARM Ltd recommends at least
+	nop					@ 11 NOPs
+	.endr
+#else
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I-cache
+#endif
+	mov	pc, lr
+ENDPROC(v6_flush_icache_all)
+
+/*
  *	v6_flush_cache_all()
  *
  *	Flush the entire cache.
@@ -31,8 +65,12 @@ ENTRY(v6_flush_kern_cache_all)
 	mov	r0, #0
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c14, 0		@ D cache clean+invalidate
+#ifndef CONFIG_ARM_ERRATA_411920
 	mcr	p15, 0, r0, c7, c5, 0		@ I+BTB cache invalidate
 #else
+	b	v6_flush_icache_all
+#endif
+#else
 	mcr	p15, 0, r0, c7, c15, 0		@ Cache clean+invalidate
 #endif
 	mov	pc, lr
@@ -92,10 +130,11 @@ ENTRY(v6_coherent_kern_range)
  *	- the Icache does not read data from the write buffer
  */
 ENTRY(v6_coherent_user_range)
-
+ UNWIND(.fnstart		)
 #ifdef HARVARD_CACHE
 	bic	r0, r0, #CACHE_LINE_SIZE - 1
-1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D line
+1:
+ USER(	mcr	p15, 0, r0, c7, c10, 1	)	@ clean D line
 	add	r0, r0, #CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
@@ -103,22 +142,39 @@ ENTRY(v6_coherent_user_range)
 	mov	r0, #0
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+#ifndef CONFIG_ARM_ERRATA_411920
 	mcr	p15, 0, r0, c7, c5, 0		@ I+BTB cache invalidate
 #else
+	b	v6_flush_icache_all
+#endif
+#else
 	mcr	p15, 0, r0, c7, c5, 6		@ invalidate BTB
 #endif
 	mov	pc, lr
 
 /*
- *	v6_flush_kern_dcache_page(kaddr)
+ * Fault handling for the cache operation above. If the virtual address in r0
+ * isn't mapped, fail with -EFAULT.
+ */
+9001:
+	mov	r0, #-EFAULT
+	mov	pc, lr
+ UNWIND(.fnend		)
+ENDPROC(v6_coherent_user_range)
+ENDPROC(v6_coherent_kern_range)
+
+/*
+ *	v6_flush_kern_dcache_area(void *addr, size_t size)
  *
  *	Ensure that the data held in the page kaddr is written back
  *	to the page in question.
  *
- *	- kaddr   - kernel address (guaranteed to be page aligned)
+ *	- addr	- kernel address
+ *	- size	- region size
  */
-ENTRY(v6_flush_kern_dcache_page)
-	add	r1, r0, #PAGE_SZ
+ENTRY(v6_flush_kern_dcache_area)
+	add	r1, r0, r1
+	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 1:
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line
@@ -145,7 +201,11 @@ ENTRY(v6_flush_kern_dcache_page)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-ENTRY(v6_dma_inv_range)
+v6_dma_inv_range:
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrb	r2, [r0]			@ read for ownership
+	strb	r2, [r0]			@ write for ownership
+#endif
 	tst	r0, #D_CACHE_LINE_SIZE - 1
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 #ifdef HARVARD_CACHE
@@ -154,6 +214,10 @@ ENTRY(v6_dma_inv_range)
 	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
 #endif
 	tst	r1, #D_CACHE_LINE_SIZE - 1
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrneb	r2, [r1, #-1]			@ read for ownership
+	strneb	r2, [r1, #-1]			@ write for ownership
+#endif
 	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
 #ifdef HARVARD_CACHE
 	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
@@ -168,6 +232,10 @@ ENTRY(v6_dma_inv_range)
 #endif
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrlo	r2, [r0]			@ read for ownership
+	strlo	r2, [r0]			@ write for ownership
+#endif
 	blo	1b
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
@@ -178,9 +246,12 @@ ENTRY(v6_dma_inv_range)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-ENTRY(v6_dma_clean_range)
+v6_dma_clean_range:
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 1:
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldr	r2, [r0]			@ read for ownership
+#endif
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c10, 1		@ clean D line
 #else
@@ -199,6 +270,10 @@ ENTRY(v6_dma_clean_range)
  *	- end     - virtual end address of region
  */
 ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrb	r2, [r0]		@ read for ownership
+	strb	r2, [r0]		@ write for ownership
+#endif
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 1:
 #ifdef HARVARD_CACHE
@@ -208,22 +283,53 @@ ENTRY(v6_dma_flush_range)
 #endif
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrlob	r2, [r0]			@ read for ownership
+	strlob	r2, [r0]			@ write for ownership
+#endif
 	blo	1b
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
+/*
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v6_dma_map_area)
+	add	r1, r1, r0			@ r1 = end address (start + size)
+	teq	r2, #DMA_FROM_DEVICE
+	beq	v6_dma_inv_range		@ FROM_DEVICE: invalidate range
+#ifndef CONFIG_DMA_CACHE_RWFO
+	b	v6_dma_clean_range		@ every other direction: clean range
+#else
+	teq	r2, #DMA_TO_DEVICE
+	beq	v6_dma_clean_range		@ TO_DEVICE: clean range
+	b	v6_dma_flush_range		@ remaining directions: flush range
+#endif
+ENDPROC(v6_dma_map_area)
+
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v6_dma_unmap_area)
+#ifndef CONFIG_DMA_CACHE_RWFO
+	add	r1, r1, r0			@ r1 = end address (start + size)
+	teq	r2, #DMA_TO_DEVICE
+	bne	v6_dma_inv_range		@ any direction but TO_DEVICE: invalidate range
+#endif
+	mov	pc, lr				@ TO_DEVICE, or an RWFO build: nothing to do
+ENDPROC(v6_dma_unmap_area)
+
+	.globl	v6_flush_kern_cache_louis
+	.equ	v6_flush_kern_cache_louis, v6_flush_kern_cache_all
+
 	__INITDATA
 
-	.type	v6_cache_fns, #object
-ENTRY(v6_cache_fns)
-	.long	v6_flush_kern_cache_all
-	.long	v6_flush_user_cache_all
-	.long	v6_flush_user_cache_range
-	.long	v6_coherent_kern_range
-	.long	v6_coherent_user_range
-	.long	v6_flush_kern_dcache_page
-	.long	v6_dma_inv_range
-	.long	v6_dma_clean_range
-	.long	v6_dma_flush_range
-	.size	v6_cache_fns, . - v6_cache_fns
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions v6
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 35ffc4d9599..615c99e38ba 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -13,33 +13,131 @@
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/unwind.h>
 
 #include "proc-macros.S"
 
 /*
+ * The secondary kernel init calls v7_flush_dcache_all before it enables
+ * the L1; however, the L1 comes out of reset in an undefined state, so
+ * the clean + invalidate performed by v7_flush_dcache_all causes a bunch
+ * of cache lines with uninitialized data and uninitialized tags to get
+ * written out to memory, which does really unpleasant things to the main
+ * processor.  We fix this by performing an invalidate, rather than a
+ * clean + invalidate, before jumping into the kernel.
+ *
+ * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs
+ * to be called for both secondary cores startup and primary core resume
+ * procedures.
+ */
+ENTRY(v7_invalidate_l1)
+       mov     r0, #0
+       mcr     p15, 2, r0, c0, c0, 0   @ select cache level 0 in cssr
+       mrc     p15, 1, r0, c0, c0, 0   @ read the csidr for that level
+
+       ldr     r1, =0x7fff
+       and     r2, r1, r0, lsr #13     @ NumSets - 1
+
+       ldr     r1, =0x3ff
+
+       and     r3, r1, r0, lsr #3      @ NumWays - 1
+       add     r2, r2, #1              @ NumSets
+
+       and     r0, r0, #0x7            @ csidr line-length field
+       add     r0, r0, #4      @ SetShift
+
+       clz     r1, r3          @ WayShift
+       add     r4, r3, #1      @ NumWays
+1:     sub     r2, r2, #1      @ NumSets--
+       mov     r3, r4          @ Temp = NumWays
+2:     subs    r3, r3, #1      @ Temp--
+       mov     r5, r3, lsl r1  @ Temp << WayShift
+       mov     r6, r2, lsl r0  @ NumSets << SetShift
+       orr     r5, r5, r6      @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
+       mcr     p15, 0, r5, c7, c6, 2   @ invalidate (no clean) D line by set/way
+       bgt     2b              @ next way while Temp > 0 (flags from subs above)
+       cmp     r2, #0
+       bgt     1b              @ next set while NumSets > 0
+       dsb     st
+       isb
+       mov     pc, lr
+ENDPROC(v7_invalidate_l1)
+
+/*
+ *	v7_flush_icache_all()
+ *
+ *	Flush the whole I-cache.
+ *
+ *	Registers:
+ *	r0 - set to 0
+ */
+ENTRY(v7_flush_icache_all)
+	mov	r0, #0
+	ALT_SMP(mcr	p15, 0, r0, c7, c1, 0)		@ invalidate I-cache inner shareable
+	ALT_UP(mcr	p15, 0, r0, c7, c5, 0)		@ I+BTB cache invalidate
+	mov	pc, lr
+ENDPROC(v7_flush_icache_all)
+
+ /*
+ *     v7_flush_dcache_louis()
+ *
+ *     Flush the D-cache up to the Level of Unification Inner Shareable
+ *
+ *     Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
+ */
+
+ENTRY(v7_flush_dcache_louis)
+	dmb					@ ensure ordering with previous memory accesses
+	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
+	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
+	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
+#ifdef CONFIG_ARM_ERRATA_643719
+	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
+	ALT_UP(moveq	pc, lr)			@ LoUU is zero, so nothing to do
+	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
+	biceq	r2, r2, #0x0000000f             @ clear minor revision number
+	teqeq	r2, r1                          @ test for errata affected core and if so...
+	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
+#endif
+	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
+	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
+	moveq	pc, lr				@ return if level == 0
+	mov	r10, #0				@ r10 (starting level) = 0
+	b	flush_levels			@ start flushing cache levels
+ENDPROC(v7_flush_dcache_louis)
+
+/*
  *	v7_flush_dcache_all()
  *
  *	Flush the whole D-cache.
  *
- *	Corrupted registers: r0-r5, r7, r9-r11
+ *	Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
  *
  *	- mm    - mm_struct describing address space
  */
 ENTRY(v7_flush_dcache_all)
+	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
 	ands	r3, r0, #0x7000000		@ extract loc from clidr
 	mov	r3, r3, lsr #23			@ left align loc bit field
 	beq	finished			@ if loc is 0, then no need to clean
 	mov	r10, #0				@ start clean at cache level 0
-loop1:
+flush_levels:
 	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
 	mov	r1, r0, lsr r2			@ extract cache type bits from clidr
 	and	r1, r1, #7			@ mask of the bits for current cache only
 	cmp	r1, #2				@ see what cache we have at this level
 	blt	skip				@ skip if no cache, or just i-cache
+#ifdef CONFIG_PREEMPT
+	save_and_disable_irqs_notrace r9	@ make cssr&csidr read atomic
+#endif
 	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
 	isb					@ isb to sych the new cssr&csidr
 	mrc	p15, 1, r1, c0, c0, 0		@ read the new csidr
+#ifdef CONFIG_PREEMPT
+	restore_irqs_notrace r9
+#endif
 	and	r2, r1, #7			@ extract the length of the cache lines
 	add	r2, r2, #4			@ add 4 (line length offset)
 	ldr	r4, =0x3ff
@@ -47,25 +145,31 @@ loop1:
 	clz	r5, r4				@ find bit position of way size increment
 	ldr	r7, =0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
+loop1:
+	mov	r9, r7				@ create working copy of max index
 loop2:
-	mov	r9, r4				@ create working copy of max way size
-loop3:
-	orr	r11, r10, r9, lsl r5		@ factor way and cache number into r11
-	orr	r11, r11, r7, lsl r2		@ factor index number into r11
+ ARM(	orr	r11, r10, r4, lsl r5	)	@ factor way and cache number into r11
+ THUMB(	lsl	r6, r4, r5		)
+ THUMB(	orr	r11, r10, r6		)	@ factor way and cache number into r11
+ ARM(	orr	r11, r11, r9, lsl r2	)	@ factor index number into r11
+ THUMB(	lsl	r6, r9, r2		)
+ THUMB(	orr	r11, r11, r6		)	@ factor index number into r11
 	mcr	p15, 0, r11, c7, c14, 2		@ clean & invalidate by set/way
-	subs	r9, r9, #1			@ decrement the way
-	bge	loop3
-	subs	r7, r7, #1			@ decrement the index
+	subs	r9, r9, #1			@ decrement the index
 	bge	loop2
+	subs	r4, r4, #1			@ decrement the way
+	bge	loop1
 skip:
 	add	r10, r10, #2			@ increment cache number
 	cmp	r3, r10
-	bgt	loop1
+	bgt	flush_levels
 finished:
 	mov	r10, #0				@ swith back to cache level 0
 	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
+	dsb	st
 	isb
 	mov	pc, lr
+ENDPROC(v7_flush_dcache_all)
 
 /*
  *	v7_flush_cache_all()
@@ -73,18 +177,40 @@ finished:
  *	Flush the entire cache system.
  *  The data cache flush is now achieved using atomic clean / invalidates
  *  working outwards from L1 cache. This is done using Set/Way based cache
- *  maintainance instructions.
+ *  maintenance instructions.
  *  The instruction cache can still be invalidated back to the point of
  *  unification in a single instruction.
  *
  */
 ENTRY(v7_flush_kern_cache_all)
-	stmfd	sp!, {r4-r5, r7, r9-r11, lr}
+ ARM(	stmfd	sp!, {r4-r5, r7, r9-r11, lr}	)
+ THUMB(	stmfd	sp!, {r4-r7, r9-r11, lr}	)
 	bl	v7_flush_dcache_all
 	mov	r0, #0
-	mcr	p15, 0, r0, c7, c5, 0		@ I+BTB cache invalidate
-	ldmfd	sp!, {r4-r5, r7, r9-r11, lr}
+	ALT_SMP(mcr	p15, 0, r0, c7, c1, 0)	@ invalidate I-cache inner shareable
+	ALT_UP(mcr	p15, 0, r0, c7, c5, 0)	@ I+BTB cache invalidate
+ ARM(	ldmfd	sp!, {r4-r5, r7, r9-r11, lr}	)
+ THUMB(	ldmfd	sp!, {r4-r7, r9-r11, lr}	)
 	mov	pc, lr
+ENDPROC(v7_flush_kern_cache_all)
+
+ /*
+ *     v7_flush_kern_cache_louis(void)
+ *
+ *     Flush the data cache up to Level of Unification Inner Shareable.
+ *     Invalidate the I-cache to the point of unification.
+ */
+ENTRY(v7_flush_kern_cache_louis)
+ ARM(	stmfd	sp!, {r4-r5, r7, r9-r11, lr}	)	@ save regs corrupted by the D-cache flush
+ THUMB(	stmfd	sp!, {r4-r7, r9-r11, lr}	)	@ Thumb build additionally saves r6
+	bl	v7_flush_dcache_louis		@ flush D-cache up to LoUIS
+	mov	r0, #0				@ r0 = 0, operand of the mcr below
+	ALT_SMP(mcr	p15, 0, r0, c7, c1, 0)	@ invalidate I-cache inner shareable
+	ALT_UP(mcr	p15, 0, r0, c7, c5, 0)	@ I+BTB cache invalidate
+ ARM(	ldmfd	sp!, {r4-r5, r7, r9-r11, lr}	)	@ restore saved regs and lr
+ THUMB(	ldmfd	sp!, {r4-r7, r9-r11, lr}	)	@ restore saved regs and lr
+	mov	pc, lr
+ENDPROC(v7_flush_kern_cache_louis)
 
 /*
  *	v7_flush_cache_all()
@@ -110,6 +236,8 @@ ENTRY(v7_flush_user_cache_all)
  */
 ENTRY(v7_flush_user_cache_range)
 	mov	pc, lr
+ENDPROC(v7_flush_user_cache_all)
+ENDPROC(v7_flush_user_cache_range)
 
 /*
  *	v7_coherent_kern_range(start,end)
@@ -141,39 +269,75 @@ ENTRY(v7_coherent_kern_range)
  *	- the Icache does not read data from the write buffer
  */
 ENTRY(v7_coherent_user_range)
+ UNWIND(.fnstart		)
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
-	bic	r0, r0, r3
-1:	mcr	p15, 0, r0, c7, c11, 1		@ clean D line to the point of unification
-	dsb
-	mcr	p15, 0, r0, c7, c5, 1		@ invalidate I line
-	add	r0, r0, r2
-	cmp	r0, r1
+	bic	r12, r0, r3
+#ifdef CONFIG_ARM_ERRATA_764369
+	ALT_SMP(W(dsb))
+	ALT_UP(W(nop))
+#endif
+1:
+ USER(	mcr	p15, 0, r12, c7, c11, 1	)	@ clean D line to the point of unification
+	add	r12, r12, r2
+	cmp	r12, r1
 	blo	1b
+	dsb	ishst
+	icache_line_size r2, r3
+	sub	r3, r2, #1
+	bic	r12, r0, r3
+2:
+ USER(	mcr	p15, 0, r12, c7, c5, 1	)	@ invalidate I line
+	add	r12, r12, r2
+	cmp	r12, r1
+	blo	2b
 	mov	r0, #0
-	mcr	p15, 0, r0, c7, c5, 6		@ invalidate BTB
-	dsb
+	ALT_SMP(mcr	p15, 0, r0, c7, c1, 6)	@ invalidate BTB Inner Shareable
+	ALT_UP(mcr	p15, 0, r0, c7, c5, 6)	@ invalidate BTB
+	dsb	ishst
 	isb
 	mov	pc, lr
 
 /*
- *	v7_flush_kern_dcache_page(kaddr)
+ * Fault handling for the cache operation above. If the virtual address in r12
+ * isn't mapped, fail with -EFAULT.
+ */
+9001:
+#ifdef CONFIG_ARM_ERRATA_775420
+	dsb
+#endif
+	mov	r0, #-EFAULT
+	mov	pc, lr
+ UNWIND(.fnend		)
+ENDPROC(v7_coherent_kern_range)
+ENDPROC(v7_coherent_user_range)
+
+/*
+ *	v7_flush_kern_dcache_area(void *addr, size_t size)
  *
  *	Ensure that the data held in the page kaddr is written back
  *	to the page in question.
  *
- *	- kaddr   - kernel address (guaranteed to be page aligned)
+ *	- addr	- kernel address
+ *	- size	- region size
  */
-ENTRY(v7_flush_kern_dcache_page)
+ENTRY(v7_flush_kern_dcache_area)
 	dcache_line_size r2, r3
-	add	r1, r0, #PAGE_SZ
+	add	r1, r0, r1
+	sub	r3, r2, #1
+	bic	r0, r0, r3
+#ifdef CONFIG_ARM_ERRATA_764369
+	ALT_SMP(W(dsb))
+	ALT_UP(W(nop))
+#endif
 1:
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line / unified line
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
+ENDPROC(v7_flush_kern_dcache_area)
 
 /*
  *	v7_dma_inv_range(start,end)
@@ -185,11 +349,15 @@ ENTRY(v7_flush_kern_dcache_page)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-ENTRY(v7_dma_inv_range)
+v7_dma_inv_range:
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
 	tst	r0, r3
 	bic	r0, r0, r3
+#ifdef CONFIG_ARM_ERRATA_764369
+	ALT_SMP(W(dsb))
+	ALT_UP(W(nop))
+#endif
 	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
 
 	tst	r1, r3
@@ -200,25 +368,31 @@ ENTRY(v7_dma_inv_range)
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
+ENDPROC(v7_dma_inv_range)
 
 /*
  *	v7_dma_clean_range(start,end)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-ENTRY(v7_dma_clean_range)
+v7_dma_clean_range:
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
 	bic	r0, r0, r3
+#ifdef CONFIG_ARM_ERRATA_764369
+	ALT_SMP(W(dsb))
+	ALT_UP(W(nop))
+#endif
 1:
 	mcr	p15, 0, r0, c7, c10, 1		@ clean D / U line
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
+ENDPROC(v7_dma_clean_range)
 
 /*
  *	v7_dma_flush_range(start,end)
@@ -229,25 +403,46 @@ ENTRY(v7_dma_flush_range)
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
 	bic	r0, r0, r3
+#ifdef CONFIG_ARM_ERRATA_764369
+	ALT_SMP(W(dsb))
+	ALT_UP(W(nop))
+#endif
 1:
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
+	mov	pc, lr
+ENDPROC(v7_dma_flush_range)
+
+/*
+ *	dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v7_dma_map_area)
+	add	r1, r1, r0			@ r1 = end address (start + size)
+	teq	r2, #DMA_FROM_DEVICE
+	beq	v7_dma_inv_range		@ FROM_DEVICE: invalidate range
+	b	v7_dma_clean_range		@ every other direction: clean range
+ENDPROC(v7_dma_map_area)
+
+/*
+ *	dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v7_dma_unmap_area)
+	add	r1, r1, r0			@ r1 = end address (start + size)
+	teq	r2, #DMA_TO_DEVICE
+	bne	v7_dma_inv_range		@ any direction but TO_DEVICE: invalidate range
 	mov	pc, lr
+ENDPROC(v7_dma_unmap_area)
 
 	__INITDATA
 
-	.type	v7_cache_fns, #object
-ENTRY(v7_cache_fns)
-	.long	v7_flush_kern_cache_all
-	.long	v7_flush_user_cache_all
-	.long	v7_flush_user_cache_range
-	.long	v7_coherent_kern_range
-	.long	v7_coherent_user_range
-	.long	v7_flush_kern_dcache_page
-	.long	v7_dma_inv_range
-	.long	v7_dma_clean_range
-	.long	v7_dma_flush_range
-	.size	v7_cache_fns, . - v7_cache_fns
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions v7
diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c
new file mode 100644
index 00000000000..6c3edeb66e7
--- /dev/null
+++ b/arch/arm/mm/cache-xsc3l2.c
@@ -0,0 +1,220 @@
+/*
+ * arch/arm/mm/cache-xsc3l2.c - XScale3 L2 cache controller support
+ *
+ * Copyright (C) 2007 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <asm/cp15.h>
+#include <asm/cputype.h>
+#include <asm/cacheflush.h>
+
+#define CR_L2	(1 << 26)
+
+#define CACHE_LINE_SIZE		32
+#define CACHE_LINE_SHIFT	5
+#define CACHE_WAY_PER_SET	8
+
+#define CACHE_WAY_SIZE(l2ctype)	(8192 << (((l2ctype) >> 8) & 0xf))
+#define CACHE_SET_SIZE(l2ctype)	(CACHE_WAY_SIZE(l2ctype) >> CACHE_LINE_SHIFT)
+
+static inline int xsc3_l2_present(void)
+{
+	unsigned long l2ctype;
+
+	__asm__("mrc p15, 1, %0, c0, c0, 1" : "=r" (l2ctype));	/* presumably the L2 cache type register - confirm in the XScale3 manual */
+
+	return !!(l2ctype & 0xf8);	/* 1 if any of bits [7:3] is set, else 0 */
+}
+
+static inline void xsc3_l2_clean_mva(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c7, c11, 1" : : "r" (addr));	/* per-address L2 op; callers use it to clean the line holding addr */
+}
+
+static inline void xsc3_l2_inv_mva(unsigned long addr)
+{
+	__asm__("mcr p15, 1, %0, c7, c7, 1" : : "r" (addr));	/* per-address L2 op; callers use it to invalidate the line holding addr */
+}
+
+static inline void xsc3_l2_inv_all(void)
+{
+	unsigned long l2ctype, set_way;
+	unsigned long set, way;	/* unsigned: way << 29 would overflow a signed int for way >= 4 */
+
+	__asm__("mrc p15, 1, %0, c0, c0, 1" : "=r" (l2ctype));	/* value sizes the set loop via CACHE_SET_SIZE() */
+
+	for (set = 0; set < CACHE_SET_SIZE(l2ctype); set++) {
+		for (way = 0; way < CACHE_WAY_PER_SET; way++) {
+			set_way = (way << 29) | (set << CACHE_LINE_SHIFT);	/* way in bits [31:29], set index above the line offset */
+			__asm__("mcr p15, 1, %0, c7, c11, 2" : : "r"(set_way));
+		}
+	}
+
+	dsb();	/* barrier after the whole walk */
+}
+
+static inline void l2_unmap_va(unsigned long va)
+{
+#ifdef CONFIG_HIGHMEM
+	if (va != -1)	/* -1 is the callers' "nothing mapped yet" sentinel */
+		kunmap_atomic((void *)va);
+#endif
+}
+
+static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va)
+{
+#ifdef CONFIG_HIGHMEM
+	unsigned long va = prev_va & PAGE_MASK;	/* page base of the previous mapping */
+	unsigned long pa_offset = pa << (32 - PAGE_SHIFT);	/* in-page offset of pa, moved to the top bits (assumes 32-bit long) */
+	if (unlikely(pa_offset < (prev_va << (32 - PAGE_SHIFT)))) {	/* offset went backwards relative to prev_va */
+		/*
+		 * Switching to a new page.  Because cache ops are
+		 * using virtual addresses only, we must put a mapping
+		 * in place for it.
+		 */
+		l2_unmap_va(prev_va);
+		va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT);
+	}
+	return va + (pa_offset >> (32 - PAGE_SHIFT));	/* mapped page base + in-page offset */
+#else
+	return __phys_to_virt(pa);	/* no highmem: translate directly, prev_va is unused */
+#endif
+}
+
+static void xsc3_l2_inv_range(unsigned long start, unsigned long end)
+{
+	unsigned long vaddr;
+
+	if (start == 0 && end == -1ul) {	/* (0, -1ul) is handled as "the whole cache" */
+		xsc3_l2_inv_all();
+		return;
+	}
+
+	vaddr = -1;  /* to force the first mapping */
+
+	/*
+	 * Clean and invalidate partial first cache line.
+	 */
+	if (start & (CACHE_LINE_SIZE - 1)) {
+		vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr);
+		xsc3_l2_clean_mva(vaddr);
+		xsc3_l2_inv_mva(vaddr);
+		start = (start | (CACHE_LINE_SIZE - 1)) + 1;	/* advance to the next line boundary */
+	}
+
+	/*
+	 * Invalidate all full cache lines between 'start' and 'end'.
+	 */
+	while (start < (end & ~(CACHE_LINE_SIZE - 1))) {
+		vaddr = l2_map_va(start, vaddr);
+		xsc3_l2_inv_mva(vaddr);
+		start += CACHE_LINE_SIZE;
+	}
+
+	/*
+	 * Clean and invalidate partial last cache line.
+	 */
+	if (start < end) {	/* bytes remain past the last full line */
+		vaddr = l2_map_va(start, vaddr);
+		xsc3_l2_clean_mva(vaddr);
+		xsc3_l2_inv_mva(vaddr);
+	}
+
+	l2_unmap_va(vaddr);	/* drop the last mapping (empty without CONFIG_HIGHMEM) */
+
+	dsb();
+}
+
+static void xsc3_l2_clean_range(unsigned long start, unsigned long end)
+{
+	unsigned long vaddr;
+
+	vaddr = -1;  /* to force the first mapping */
+
+	start &= ~(CACHE_LINE_SIZE - 1);	/* round start down to a line boundary */
+	while (start < end) {	/* one clean op per cache line up to end */
+		vaddr = l2_map_va(start, vaddr);
+		xsc3_l2_clean_mva(vaddr);
+		start += CACHE_LINE_SIZE;
+	}
+
+	l2_unmap_va(vaddr);	/* drop the last mapping (empty without CONFIG_HIGHMEM) */
+
+	dsb();
+}
+
+/*
+ * Flush the entire L2 by walking every set/way instead of every address.
+ */
+static inline void xsc3_l2_flush_all(void)
+{
+	unsigned long l2ctype, set_way;
+	unsigned long set, way;	/* unsigned: way << 29 would overflow a signed int for way >= 4 */
+
+	__asm__("mrc p15, 1, %0, c0, c0, 1" : "=r" (l2ctype));	/* value sizes the set loop via CACHE_SET_SIZE() */
+
+	for (set = 0; set < CACHE_SET_SIZE(l2ctype); set++) {
+		for (way = 0; way < CACHE_WAY_PER_SET; way++) {
+			set_way = (way << 29) | (set << CACHE_LINE_SHIFT);	/* way in bits [31:29], set index above the line offset */
+			__asm__("mcr p15, 1, %0, c7, c15, 2" : : "r"(set_way));
+		}
+	}
+
+	dsb();	/* barrier after the whole walk */
+}
+
+static void xsc3_l2_flush_range(unsigned long start, unsigned long end)
+{
+	unsigned long vaddr;
+
+	if (start == 0 && end == -1ul) {	/* (0, -1ul) is handled as "the whole cache" */
+		xsc3_l2_flush_all();
+		return;
+	}
+
+	vaddr = -1;  /* to force the first mapping */
+
+	start &= ~(CACHE_LINE_SIZE - 1);	/* round start down to a line boundary */
+	while (start < end) {	/* clean then invalidate each line up to end */
+		vaddr = l2_map_va(start, vaddr);
+		xsc3_l2_clean_mva(vaddr);
+		xsc3_l2_inv_mva(vaddr);
+		start += CACHE_LINE_SIZE;
+	}
+
+	l2_unmap_va(vaddr);	/* drop the last mapping (empty without CONFIG_HIGHMEM) */
+
+	dsb();
+}
+
+static int __init xsc3_l2_init(void)
+{
+	if (!cpu_is_xsc3() || !xsc3_l2_present())
+		return 0;	/* either check failed: install nothing, still report success */
+
+	if (get_cr() & CR_L2) {	/* CR_L2 (bit 26) set in the value returned by get_cr() */
+		pr_info("XScale3 L2 cache enabled.\n");
+		xsc3_l2_inv_all();	/* walk the whole L2 by set/way before installing the handlers */
+
+		outer_cache.inv_range = xsc3_l2_inv_range;	/* install the outer-cache range handlers */
+		outer_cache.clean_range = xsc3_l2_clean_range;
+		outer_cache.flush_range = xsc3_l2_flush_range;
+	}
+
+	return 0;
+}
+core_initcall(xsc3_l2_init);
diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c
deleted file mode 100644
index cefdf2f9f26..00000000000
--- a/arch/arm/mm/consistent.c
+++ /dev/null
@@ -1,507 +0,0 @@
-/*
- *  linux/arch/arm/mm/consistent.c
- *
- *  Copyright (C) 2000-2004 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  DMA uncached mapping support.
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/errno.h>
-#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/memory.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/sizes.h>
-
-/* Sanity check size */
-#if (CONSISTENT_DMA_SIZE % SZ_2M)
-#error "CONSISTENT_DMA_SIZE must be multiple of 2MiB"
-#endif
-
-#define CONSISTENT_END	(0xffe00000)
-#define CONSISTENT_BASE	(CONSISTENT_END - CONSISTENT_DMA_SIZE)
-
-#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-#define CONSISTENT_PTE_INDEX(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PGDIR_SHIFT)
-#define NUM_CONSISTENT_PTES (CONSISTENT_DMA_SIZE >> PGDIR_SHIFT)
-
-
-/*
- * These are the page tables (2MB each) covering uncached, DMA consistent allocations
- */
-static pte_t *consistent_pte[NUM_CONSISTENT_PTES];
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vm_region	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- *  struct vm_region vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct vm_region {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-	struct page		*vm_pages;
-	int			vm_active;
-};
-
-static struct vm_region consistent_head = {
-	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
-	.vm_start	= CONSISTENT_BASE,
-	.vm_end		= CONSISTENT_END,
-};
-
-static struct vm_region *
-vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp)
-{
-	unsigned long addr = head->vm_start, end = head->vm_end - size;
-	unsigned long flags;
-	struct vm_region *c, *new;
-
-	new = kmalloc(sizeof(struct vm_region), gfp);
-	if (!new)
-		goto out;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if ((addr + size) < addr)
-			goto nospc;
-		if ((addr + size) <= c->vm_start)
-			goto found;
-		addr = c->vm_end;
-		if (addr > end)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry _before_ the one we found.
-	 */
-	list_add_tail(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-	new->vm_active = 1;
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct vm_region *vm_region_find(struct vm_region *head, unsigned long addr)
-{
-	struct vm_region *c;
-	
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_active && c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-#error ARM Coherent DMA allocator does not (yet) support huge TLB
-#endif
-
-static void *
-__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
-	    pgprot_t prot)
-{
-	struct page *page;
-	struct vm_region *c;
-	unsigned long order;
-	u64 mask = ISA_DMA_THRESHOLD, limit;
-
-	if (!consistent_pte[0]) {
-		printk(KERN_ERR "%s: not initialised\n", __func__);
-		dump_stack();
-		return NULL;
-	}
-
-	if (dev) {
-		mask = dev->coherent_dma_mask;
-
-		/*
-		 * Sanity check the DMA mask - it must be non-zero, and
-		 * must be able to be satisfied by a DMA allocation.
-		 */
-		if (mask == 0) {
-			dev_warn(dev, "coherent DMA mask is unset\n");
-			goto no_page;
-		}
-
-		if ((~mask) & ISA_DMA_THRESHOLD) {
-			dev_warn(dev, "coherent DMA mask %#llx is smaller "
-				 "than system GFP_DMA mask %#llx\n",
-				 mask, (unsigned long long)ISA_DMA_THRESHOLD);
-			goto no_page;
-		}
-	}
-
-	/*
-	 * Sanity check the allocation size.
-	 */
-	size = PAGE_ALIGN(size);
-	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) ||
-	    size >= (CONSISTENT_END - CONSISTENT_BASE)) {
-		printk(KERN_WARNING "coherent allocation too big "
-		       "(requested %#x mask %#llx)\n", size, mask);
-		goto no_page;
-	}
-
-	order = get_order(size);
-
-	if (mask != 0xffffffff)
-		gfp |= GFP_DMA;
-
-	page = alloc_pages(gfp, order);
-	if (!page)
-		goto no_page;
-
-	/*
-	 * Invalidate any data that might be lurking in the
-	 * kernel direct-mapped region for device DMA.
-	 */
-	{
-		void *ptr = page_address(page);
-		memset(ptr, 0, size);
-		dmac_flush_range(ptr, ptr + size);
-		outer_flush_range(__pa(ptr), __pa(ptr) + size);
-	}
-
-	/*
-	 * Allocate a virtual address in the consistent mapping region.
-	 */
-	c = vm_region_alloc(&consistent_head, size,
-			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
-	if (c) {
-		pte_t *pte;
-		struct page *end = page + (1 << order);
-		int idx = CONSISTENT_PTE_INDEX(c->vm_start);
-		u32 off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1);
-
-		pte = consistent_pte[idx] + off;
-		c->vm_pages = page;
-
-		split_page(page, order);
-
-		/*
-		 * Set the "dma handle"
-		 */
-		*handle = page_to_dma(dev, page);
-
-		do {
-			BUG_ON(!pte_none(*pte));
-
-			/*
-			 * x86 does not mark the pages reserved...
-			 */
-			SetPageReserved(page);
-			set_pte_ext(pte, mk_pte(page, prot), 0);
-			page++;
-			pte++;
-			off++;
-			if (off >= PTRS_PER_PTE) {
-				off = 0;
-				pte = consistent_pte[++idx];
-			}
-		} while (size -= PAGE_SIZE);
-
-		/*
-		 * Free the otherwise unused pages.
-		 */
-		while (page < end) {
-			__free_page(page);
-			page++;
-		}
-
-		return (void *)c->vm_start;
-	}
-
-	if (page)
-		__free_pages(page, order);
- no_page:
-	*handle = ~0;
-	return NULL;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
-{
-	if (arch_is_coherent()) {
-		void *virt;
-
-		virt = kmalloc(size, gfp);
-		if (!virt)
-			return NULL;
-		*handle =  virt_to_dma(dev, virt);
-
-		return virt;
-	}
-
-	return __dma_alloc(dev, size, handle, gfp,
-			   pgprot_noncached(pgprot_kernel));
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-/*
- * Allocate a writecombining region, in much the same way as
- * dma_alloc_coherent above.
- */
-void *
-dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
-{
-	return __dma_alloc(dev, size, handle, gfp,
-			   pgprot_writecombine(pgprot_kernel));
-}
-EXPORT_SYMBOL(dma_alloc_writecombine);
-
-static int dma_mmap(struct device *dev, struct vm_area_struct *vma,
-		    void *cpu_addr, dma_addr_t dma_addr, size_t size)
-{
-	unsigned long flags, user_size, kern_size;
-	struct vm_region *c;
-	int ret = -ENXIO;
-
-	user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-	c = vm_region_find(&consistent_head, (unsigned long)cpu_addr);
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	if (c) {
-		unsigned long off = vma->vm_pgoff;
-
-		kern_size = (c->vm_end - c->vm_start) >> PAGE_SHIFT;
-
-		if (off < kern_size &&
-		    user_size <= (kern_size - off)) {
-			vma->vm_flags |= VM_RESERVED;
-			ret = remap_pfn_range(vma, vma->vm_start,
-					      page_to_pfn(c->vm_pages) + off,
-					      user_size << PAGE_SHIFT,
-					      vma->vm_page_prot);
-		}
-	}
-
-	return ret;
-}
-
-int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
-		      void *cpu_addr, dma_addr_t dma_addr, size_t size)
-{
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	return dma_mmap(dev, vma, cpu_addr, dma_addr, size);
-}
-EXPORT_SYMBOL(dma_mmap_coherent);
-
-int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma,
-			  void *cpu_addr, dma_addr_t dma_addr, size_t size)
-{
-	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-	return dma_mmap(dev, vma, cpu_addr, dma_addr, size);
-}
-EXPORT_SYMBOL(dma_mmap_writecombine);
-
-/*
- * free a page as defined by the above mapping.
- * Must not be called with IRQs disabled.
- */
-void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
-{
-	struct vm_region *c;
-	unsigned long flags, addr;
-	pte_t *ptep;
-	int idx;
-	u32 off;
-
-	WARN_ON(irqs_disabled());
-
-	if (arch_is_coherent()) {
-		kfree(cpu_addr);
-		return;
-	}
-
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irqsave(&consistent_lock, flags);
-	c = vm_region_find(&consistent_head, (unsigned long)cpu_addr);
-	if (!c)
-		goto no_area;
-
-	c->vm_active = 0;
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	if ((c->vm_end - c->vm_start) != size) {
-		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
-		       __func__, c->vm_end - c->vm_start, size);
-		dump_stack();
-		size = c->vm_end - c->vm_start;
-	}
-
-	idx = CONSISTENT_PTE_INDEX(c->vm_start);
-	off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1);
-	ptep = consistent_pte[idx] + off;
-	addr = c->vm_start;
-	do {
-		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
-		unsigned long pfn;
-
-		ptep++;
-		addr += PAGE_SIZE;
-		off++;
-		if (off >= PTRS_PER_PTE) {
-			off = 0;
-			ptep = consistent_pte[++idx];
-		}
-
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-
-				/*
-				 * x86 does not mark the pages reserved...
-				 */
-				ClearPageReserved(page);
-
-				__free_page(page);
-				continue;
-			}
-		}
-
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
-	} while (size -= PAGE_SIZE);
-
-	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
-	spin_lock_irqsave(&consistent_lock, flags);
-	list_del(&c->vm_list);
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	kfree(c);
-	return;
-
- no_area:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
-	       __func__, cpu_addr);
-	dump_stack();
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
-/*
- * Initialise the consistent memory allocation.
- */
-static int __init consistent_init(void)
-{
-	pgd_t *pgd;
-	pmd_t *pmd;
-	pte_t *pte;
-	int ret = 0, i = 0;
-	u32 base = CONSISTENT_BASE;
-
-	do {
-		pgd = pgd_offset(&init_mm, base);
-		pmd = pmd_alloc(&init_mm, pgd, base);
-		if (!pmd) {
-			printk(KERN_ERR "%s: no pmd tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-		WARN_ON(!pmd_none(*pmd));
-
-		pte = pte_alloc_kernel(pmd, base);
-		if (!pte) {
-			printk(KERN_ERR "%s: no pte tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		consistent_pte[i++] = pte;
-		base += (1 << PGDIR_SHIFT);
-	} while (base < CONSISTENT_END);
-
-	return ret;
-}
-
-core_initcall(consistent_init);
-
-/*
- * Make an area consistent for devices.
- * Note: Drivers should NOT use this function directly, as it will break
- * platforms with CONFIG_DMABOUNCE.
- * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
- */
-void dma_cache_maint(const void *start, size_t size, int direction)
-{
-	const void *end = start + size;
-
-	BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(end - 1));
-
-	switch (direction) {
-	case DMA_FROM_DEVICE:		/* invalidate only */
-		dmac_inv_range(start, end);
-		outer_inv_range(__pa(start), __pa(end));
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
-		dmac_clean_range(start, end);
-		outer_clean_range(__pa(start), __pa(end));
-		break;
-	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
-		dmac_flush_range(start, end);
-		outer_flush_range(__pa(start), __pa(end));
-		break;
-	default:
-		BUG();
-	}
-}
-EXPORT_SYMBOL(dma_cache_maint);
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index fc84fcc7438..6eb97b3a748 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -2,6 +2,9 @@
  *  linux/arch/arm/mm/context.c
  *
  *  Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
+ *  Copyright (C) 2012 ARM Limited
+ *
+ *  Author: Will Deacon <will.deacon@arm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -10,55 +13,248 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
 
 #include <asm/mmu_context.h>
+#include <asm/smp_plat.h>
+#include <asm/thread_notify.h>
 #include <asm/tlbflush.h>
+#include <asm/proc-fns.h>
+
+/*
+ * On ARMv6, we have the following structure in the Context ID:
+ *
+ * 31                         7          0
+ * +-------------------------+-----------+
+ * |      process ID         |   ASID    |
+ * +-------------------------+-----------+
+ * |              context ID             |
+ * +-------------------------------------+
+ *
+ * The ASID is used to tag entries in the CPU caches and TLBs.
+ * The context ID is used by debuggers and trace logic, and
+ * should be unique within all running processes.
+ *
+ * In big endian operation, the two 32 bit words are swapped if accessed
+ * by non-64-bit operations.
+ */
+#define ASID_FIRST_VERSION	(1ULL << ASID_BITS)
+#define NUM_USER_ASIDS		ASID_FIRST_VERSION
 
-static DEFINE_SPINLOCK(cpu_asid_lock);
-unsigned int cpu_last_asid = ASID_FIRST_VERSION;
+static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
+static atomic64_t asid_generation = ATOMIC64_INIT(ASID_FIRST_VERSION);
+static DECLARE_BITMAP(asid_map, NUM_USER_ASIDS);
+
+static DEFINE_PER_CPU(atomic64_t, active_asids);
+static DEFINE_PER_CPU(u64, reserved_asids);
+static cpumask_t tlb_flush_pending;
+
+#ifdef CONFIG_ARM_ERRATA_798181
+void a15_erratum_get_cpumask(int this_cpu, struct mm_struct *mm,
+			     cpumask_t *mask)
+{
+	int cpu;
+	unsigned long flags;
+	u64 context_id, asid;
+
+	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+	context_id = mm->context.id.counter;
+	for_each_online_cpu(cpu) {
+		if (cpu == this_cpu)
+			continue;
+		/*
+		 * We only need to send an IPI if the other CPUs are
+		 * running the same ASID as the one being invalidated.
+		 */
+		asid = per_cpu(active_asids, cpu).counter;
+		if (asid == 0)
+			asid = per_cpu(reserved_asids, cpu);
+		if (context_id == asid)
+			cpumask_set_cpu(cpu, mask);
+	}
+	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+}
+#endif
 
+#ifdef CONFIG_ARM_LPAE
 /*
- * We fork()ed a process, and we need a new context for the child
- * to run in.  We reserve version 0 for initial tasks so we will
- * always allocate an ASID. The ASID 0 is reserved for the TTBR
- * register changing sequence.
+ * With LPAE, the ASID and page tables are updated atomically, so there is
+ * no need for a reserved set of tables (the active ASID tracking prevents
+ * any issues across a rollover).
  */
-void __init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+#define cpu_set_reserved_ttbr0()
+#else
+static void cpu_set_reserved_ttbr0(void)
 {
-	mm->context.id = 0;
+	u32 ttb;
+	/*
+	 * Copy TTBR1 into TTBR0.
+	 * This points at swapper_pg_dir, which contains only global
+	 * entries so any speculative walks are perfectly safe.
+	 */
+	asm volatile(
+	"	mrc	p15, 0, %0, c2, c0, 1		@ read TTBR1\n"
+	"	mcr	p15, 0, %0, c2, c0, 0		@ set TTBR0\n"
+	: "=r" (ttb));
+	isb();
+}
+#endif
+
+#ifdef CONFIG_PID_IN_CONTEXTIDR
+static int contextidr_notifier(struct notifier_block *unused, unsigned long cmd,
+			       void *t)
+{
+	u32 contextidr;
+	pid_t pid;
+	struct thread_info *thread = t;
+
+	if (cmd != THREAD_NOTIFY_SWITCH)
+		return NOTIFY_DONE;
+
+	pid = task_pid_nr(thread->task) << ASID_BITS;
+	asm volatile(
+	"	mrc	p15, 0, %0, c13, c0, 1\n"
+	"	and	%0, %0, %2\n"
+	"	orr	%0, %0, %1\n"
+	"	mcr	p15, 0, %0, c13, c0, 1\n"
+	: "=r" (contextidr), "+r" (pid)
+	: "I" (~ASID_MASK));
+	isb();
+
+	return NOTIFY_OK;
 }
 
-void __new_context(struct mm_struct *mm)
+static struct notifier_block contextidr_notifier_block = {
+	.notifier_call = contextidr_notifier,
+};
+
+static int __init contextidr_notifier_init(void)
 {
-	unsigned int asid;
+	return thread_register_notifier(&contextidr_notifier_block);
+}
+arch_initcall(contextidr_notifier_init);
+#endif
 
-	spin_lock(&cpu_asid_lock);
-	asid = ++cpu_last_asid;
-	if (asid == 0)
-		asid = cpu_last_asid = ASID_FIRST_VERSION;
+static void flush_context(unsigned int cpu)
+{
+	int i;
+	u64 asid;
+
+	/* Update the list of reserved ASIDs and the ASID bitmap. */
+	bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+	for_each_possible_cpu(i) {
+		if (i == cpu) {
+			asid = 0;
+		} else {
+			asid = atomic64_xchg(&per_cpu(active_asids, i), 0);
+			/*
+			 * If this CPU has already been through a
+			 * rollover, but hasn't run another task in
+			 * the meantime, we must preserve its reserved
+			 * ASID, as this is the only trace we have of
+			 * the process it is still running.
+			 */
+			if (asid == 0)
+				asid = per_cpu(reserved_asids, i);
+			__set_bit(asid & ~ASID_MASK, asid_map);
+		}
+		per_cpu(reserved_asids, i) = asid;
+	}
+
+	/* Queue a TLB invalidate and flush the I-cache if necessary. */
+	cpumask_setall(&tlb_flush_pending);
+
+	if (icache_is_vivt_asid_tagged())
+		__flush_icache_all();
+}
+
+static int is_reserved_asid(u64 asid)
+{
+	int cpu;
+	for_each_possible_cpu(cpu)
+		if (per_cpu(reserved_asids, cpu) == asid)
+			return 1;
+	return 0;
+}
+
+static u64 new_context(struct mm_struct *mm, unsigned int cpu)
+{
+	static u32 cur_idx = 1;
+	u64 asid = atomic64_read(&mm->context.id);
+	u64 generation = atomic64_read(&asid_generation);
+
+	if (asid != 0 && is_reserved_asid(asid)) {
+		/*
+		 * Our current ASID was active during a rollover, we can
+		 * continue to use it and this was just a false alarm.
+		 */
+		asid = generation | (asid & ~ASID_MASK);
+	} else {
+		/*
+		 * Allocate a free ASID. If we can't find one, take a
+		 * note of the currently active ASIDs and mark the TLBs
+		 * as requiring flushes. We always count from ASID #1,
+		 * as we reserve ASID #0 to switch via TTBR0 and to
+		 * avoid speculative page table walks from hitting in
+		 * any partial walk caches, which could be populated
+		 * from overlapping level-1 descriptors used to map both
+		 * the module area and the userspace stack.
+		 */
+		asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx);
+		if (asid == NUM_USER_ASIDS) {
+			generation = atomic64_add_return(ASID_FIRST_VERSION,
+							 &asid_generation);
+			flush_context(cpu);
+			asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
+		}
+		__set_bit(asid, asid_map);
+		cur_idx = asid;
+		asid |= generation;
+		cpumask_clear(mm_cpumask(mm));
+	}
+
+	return asid;
+}
+
+void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
+{
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
+	u64 asid;
+
+	if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq))
+		__check_vmalloc_seq(mm);
 
 	/*
-	 * If we've used up all our ASIDs, we need
-	 * to start a new version and flush the TLB.
+	 * We cannot update the pgd and the ASID atomically with classic
+	 * MMU, so switch exclusively to global mappings to avoid
+	 * speculative page table walking with the wrong TTBR.
 	 */
-	if (unlikely((asid & ~ASID_MASK) == 0)) {
-		asid = ++cpu_last_asid;
-		/* set the reserved ASID before flushing the TLB */
-		asm("mcr	p15, 0, %0, c13, c0, 1	@ set reserved context ID\n"
-		    :
-		    : "r" (0));
-		isb();
-		flush_tlb_all();
-		if (icache_is_vivt_asid_tagged()) {
-			asm("mcr	p15, 0, %0, c7, c5, 0	@ invalidate I-cache\n"
-			    "mcr	p15, 0, %0, c7, c5, 6	@ flush BTAC/BTB\n"
-			    :
-			    : "r" (0));
-			dsb();
-		}
+	cpu_set_reserved_ttbr0();
+
+	asid = atomic64_read(&mm->context.id);
+	if (!((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS)
+	    && atomic64_xchg(&per_cpu(active_asids, cpu), asid))
+		goto switch_mm_fastpath;
+
+	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+	/* Check that our ASID belongs to the current generation. */
+	asid = atomic64_read(&mm->context.id);
+	if ((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS) {
+		asid = new_context(mm, cpu);
+		atomic64_set(&mm->context.id, asid);
 	}
-	spin_unlock(&cpu_asid_lock);
 
-	mm->cpu_vm_mask = cpumask_of_cpu(smp_processor_id());
-	mm->context.id = asid;
+	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) {
+		local_flush_bp_all();
+		local_flush_tlb_all();
+	}
+
+	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	cpumask_set_cpu(cpu, mm_cpumask(mm));
+	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+
+switch_mm_fastpath:
+	cpu_switch_mm(mm->pgd, mm);
 }
diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
new file mode 100644
index 00000000000..d130a5ece5d
--- /dev/null
+++ b/arch/arm/mm/copypage-fa.c
@@ -0,0 +1,86 @@
+/*
+ *  linux/arch/arm/mm/copypage-fa.c
+ *
+ *  Copyright (C) 2005 Faraday Corp.
+ *  Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on copypage-v4wb.S:
+ *  Copyright (C) 1995-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+/*
+ * Faraday optimised copy_user_page
+ */
+static void __naked
+fa_copy_user_page(void *kto, const void *kfrom)
+{
+	asm("\
+	stmfd	sp!, {r4, lr}			@ 2\n\
+	mov	r2, %0				@ 1\n\
+1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
+	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	r0, r0, #16			@ 1\n\
+	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
+	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	r0, r0, #16			@ 1\n\
+	subs	r2, r2, #1			@ 1\n\
+	bne	1b				@ 1\n\
+	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
+	ldmfd	sp!, {r4, pc}			@ 3"
+	:
+	: "I" (PAGE_SIZE / 32));
+}
+
+void fa_copy_user_highpage(struct page *to, struct page *from,
+	unsigned long vaddr, struct vm_area_struct *vma)
+{
+	void *kto, *kfrom;
+
+	kto = kmap_atomic(to);
+	kfrom = kmap_atomic(from);
+	fa_copy_user_page(kto, kfrom);
+	kunmap_atomic(kfrom);
+	kunmap_atomic(kto);
+}
+
+/*
+ * Faraday optimised clear_user_highpage
+ *
+ * Each 16-byte store is followed by a clean+invalidate of that D cache line.
+ */
+void fa_clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+	void *ptr, *kaddr = kmap_atomic(page);
+	asm volatile("\
+	mov	r1, %2				@ 1\n\
+	mov	r2, #0				@ 1\n\
+	mov	r3, #0				@ 1\n\
+	mov	ip, #0				@ 1\n\
+	mov	lr, #0				@ 1\n\
+1:	stmia	%0, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, %0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	%0, %0, #16			@ 1\n\
+	stmia	%0, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, %0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	%0, %0, #16			@ 1\n\
+	subs	r1, r1, #1			@ 1\n\
+	bne	1b				@ 1\n\
+	mcr	p15, 0, r1, c7, c10, 4		@ 1   drain WB"
+	: "=r" (ptr)
+	: "0" (kaddr), "I" (PAGE_SIZE / 32)
+	: "r1", "r2", "r3", "ip", "lr");
+	kunmap_atomic(kaddr);
+}
+
+struct cpu_user_fns fa_user_fns __initdata = {
+	.cpu_clear_user_highpage = fa_clear_user_highpage,
+	.cpu_copy_user_highpage	= fa_copy_user_highpage,
+};
diff --git a/arch/arm/mm/copypage-feroceon.c b/arch/arm/mm/copypage-feroceon.c
new file mode 100644
index 00000000000..49ee0c1a720
--- /dev/null
+++ b/arch/arm/mm/copypage-feroceon.c
@@ -0,0 +1,112 @@
+/*
+ *  linux/arch/arm/mm/copypage-feroceon.c
+ *
+ *  Copyright (C) 2008 Marvell Semiconductors
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This handles copy_user_highpage and clear_user_highpage on Feroceon
+ * more optimally than the generic implementations.
+ */
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+static void __naked
+feroceon_copy_user_page(void *kto, const void *kfrom)
+{
+	asm("\
+	stmfd	sp!, {r4-r9, lr}		\n\
+	mov	ip, %2				\n\
+1:	mov	lr, r1				\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	pld	[lr, #32]			\n\
+	pld	[lr, #64]			\n\
+	pld	[lr, #96]			\n\
+	pld	[lr, #128]			\n\
+	pld	[lr, #160]			\n\
+	pld	[lr, #192]			\n\
+	pld	[lr, #224]			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	ldmia	r1!, {r2 - r9}			\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	stmia	r0, {r2 - r9}			\n\
+	subs	ip, ip, #(32 * 8)		\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	r0, r0, #32			\n\
+	bne	1b				\n\
+	mcr	p15, 0, ip, c7, c10, 4		@ drain WB\n\
+	ldmfd	sp!, {r4-r9, pc}"
+	:
+	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE));
+}
+
+void feroceon_copy_user_highpage(struct page *to, struct page *from,
+	unsigned long vaddr, struct vm_area_struct *vma)
+{
+	void *kto, *kfrom;
+
+	kto = kmap_atomic(to);
+	kfrom = kmap_atomic(from);
+	flush_cache_page(vma, vaddr, page_to_pfn(from));
+	feroceon_copy_user_page(kto, kfrom);
+	kunmap_atomic(kfrom);
+	kunmap_atomic(kto);
+}
+
+void feroceon_clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+	void *ptr, *kaddr = kmap_atomic(page);
+	asm volatile ("\
+	mov	r1, %2				\n\
+	mov	r2, #0				\n\
+	mov	r3, #0				\n\
+	mov	r4, #0				\n\
+	mov	r5, #0				\n\
+	mov	r6, #0				\n\
+	mov	r7, #0				\n\
+	mov	ip, #0				\n\
+	mov	lr, #0				\n\
+1:	stmia	%0, {r2-r7, ip, lr}		\n\
+	subs	r1, r1, #1			\n\
+	mcr	p15, 0, %0, c7, c14, 1		@ clean and invalidate D line\n\
+	add	%0, %0, #32			\n\
+	bne	1b				\n\
+	mcr	p15, 0, r1, c7, c10, 4		@ drain WB"
+	: "=r" (ptr)
+	: "0" (kaddr), "I" (PAGE_SIZE / 32)
+	: "r1", "r2", "r3", "r4", "r5", "r6", "r7", "ip", "lr");
+	kunmap_atomic(kaddr);
+}
+
+struct cpu_user_fns feroceon_user_fns __initdata = {
+	.cpu_clear_user_highpage = feroceon_clear_user_highpage,
+	.cpu_copy_user_highpage	= feroceon_copy_user_highpage,
+};
+
diff --git a/arch/arm/mm/copypage-v3.S b/arch/arm/mm/copypage-v3.S
deleted file mode 100644
index 2ee394b11bc..00000000000
--- a/arch/arm/mm/copypage-v3.S
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- *  linux/arch/arm/lib/copypage.S
- *
- *  Copyright (C) 1995-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  ASM optimised string functions
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/assembler.h>
-#include <asm/asm-offsets.h>
-
-		.text
-		.align	5
-/*
- * ARMv3 optimised copy_user_page
- *
- * FIXME: do we need to handle cache stuff...
- */
-ENTRY(v3_copy_user_page)
-	stmfd	sp!, {r4, lr}			@	2
-	mov	r2, #PAGE_SZ/64			@	1
-	ldmia	r1!, {r3, r4, ip, lr}		@	4+1
-1:	stmia	r0!, {r3, r4, ip, lr}		@	4
-	ldmia	r1!, {r3, r4, ip, lr}		@	4+1
-	stmia	r0!, {r3, r4, ip, lr}		@	4
-	ldmia	r1!, {r3, r4, ip, lr}		@	4+1
-	stmia	r0!, {r3, r4, ip, lr}		@	4
-	ldmia	r1!, {r3, r4, ip, lr}		@	4
-	subs	r2, r2, #1			@	1
-	stmia	r0!, {r3, r4, ip, lr}		@	4
-	ldmneia	r1!, {r3, r4, ip, lr}		@	4
-	bne	1b				@	1
-	ldmfd	sp!, {r4, pc}			@	3
-
-	.align	5
-/*
- * ARMv3 optimised clear_user_page
- *
- * FIXME: do we need to handle cache stuff...
- */
-ENTRY(v3_clear_user_page)
-	str	lr, [sp, #-4]!
-	mov	r1, #PAGE_SZ/64			@ 1
-	mov	r2, #0				@ 1
-	mov	r3, #0				@ 1
-	mov	ip, #0				@ 1
-	mov	lr, #0				@ 1
-1:	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	subs	r1, r1, #1			@ 1
-	bne	1b				@ 1
-	ldr	pc, [sp], #4
-
-	__INITDATA
-
-	.type	v3_user_fns, #object
-ENTRY(v3_user_fns)
-	.long	v3_clear_user_page
-	.long	v3_copy_user_page
-	.size	v3_user_fns, . - v3_user_fns
diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c
index ded0e96d069..1267e64133b 100644
--- a/arch/arm/mm/copypage-v4mc.c
+++ b/arch/arm/mm/copypage-v4mc.c
@@ -15,25 +15,21 @@
  */
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 
-#include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
 #include "mm.h"
 
-/*
- * 0xffff8000 to 0xffffffff is reserved for any ARM architecture
- * specific hacks for copying pages efficiently.
- */
 #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \
-				  L_PTE_CACHEABLE)
+				  L_PTE_MT_MINICACHE)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
- * ARMv4 mini-dcache optimised copy_user_page
+ * ARMv4 mini-dcache optimised copy_user_highpage
  *
  * We flush the destination cache lines just before we write the data into the
  * corresponding address.  Since the Dcache is read-allocate, this removes the
@@ -42,9 +38,9 @@ static DEFINE_SPINLOCK(minicache_lock);
  *
  * Note: We rely on all ARMv4 processors implementing the "invalidate D line"
  * instruction.  If your processor does not supply this, you have to write your
- * own copy_user_page that does the right thing.
+ * own copy_user_highpage that does the right thing.
  */
-static void __attribute__((naked))
+static void __naked
 mc_copy_user_page(void *from, void *to)
 {
 	asm volatile(
@@ -68,50 +64,52 @@ mc_copy_user_page(void *from, void *to)
 	: "r" (from), "r" (to), "I" (PAGE_SIZE / 64));
 }
 
-void v4_mc_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr)
+void v4_mc_copy_user_highpage(struct page *to, struct page *from,
+	unsigned long vaddr, struct vm_area_struct *vma)
 {
-	struct page *page = virt_to_page(kfrom);
+	void *kto = kmap_atomic(to);
+
+	if (!test_and_set_bit(PG_dcache_clean, &from->flags))
+		__flush_dcache_page(page_mapping(from), from);
 
-	if (test_and_clear_bit(PG_dcache_dirty, &page->flags))
-		__flush_dcache_page(page_mapping(page), page);
+	raw_spin_lock(&minicache_lock);
 
-	spin_lock(&minicache_lock);
+	set_top_pte(COPYPAGE_MINICACHE, mk_pte(from, minicache_pgprot));
 
-	set_pte_ext(TOP_PTE(0xffff8000), pfn_pte(__pa(kfrom) >> PAGE_SHIFT, minicache_pgprot), 0);
-	flush_tlb_kernel_page(0xffff8000);
+	mc_copy_user_page((void *)COPYPAGE_MINICACHE, kto);
 
-	mc_copy_user_page((void *)0xffff8000, kto);
+	raw_spin_unlock(&minicache_lock);
 
-	spin_unlock(&minicache_lock);
+	kunmap_atomic(kto);
 }
 
 /*
  * ARMv4 optimised clear_user_page
  */
-void __attribute__((naked))
-v4_mc_clear_user_page(void *kaddr, unsigned long vaddr)
+void v4_mc_clear_user_highpage(struct page *page, unsigned long vaddr)
 {
-	asm volatile(
-	"str	lr, [sp, #-4]!\n\
-	mov	r1, %0				@ 1\n\
+	void *ptr, *kaddr = kmap_atomic(page);
+	asm volatile("\
+	mov	r1, %2				@ 1\n\
 	mov	r2, #0				@ 1\n\
 	mov	r3, #0				@ 1\n\
 	mov	ip, #0				@ 1\n\
 	mov	lr, #0				@ 1\n\
-1:	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line\n\
-	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
-	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
-	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line\n\
-	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
-	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
+1:	mcr	p15, 0, %0, c7, c6, 1		@ 1   invalidate D line\n\
+	stmia	%0!, {r2, r3, ip, lr}		@ 4\n\
+	stmia	%0!, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, %0, c7, c6, 1		@ 1   invalidate D line\n\
+	stmia	%0!, {r2, r3, ip, lr}		@ 4\n\
+	stmia	%0!, {r2, r3, ip, lr}		@ 4\n\
 	subs	r1, r1, #1			@ 1\n\
-	bne	1b				@ 1\n\
-	ldr	pc, [sp], #4"
-	:
-	: "I" (PAGE_SIZE / 64));
+	bne	1b				@ 1"
+	: "=r" (ptr)
+	: "0" (kaddr), "I" (PAGE_SIZE / 64)
+	: "r1", "r2", "r3", "ip", "lr");
+	kunmap_atomic(kaddr);
 }
 
 struct cpu_user_fns v4_mc_user_fns __initdata = {
-	.cpu_clear_user_page	= v4_mc_clear_user_page, 
-	.cpu_copy_user_page	= v4_mc_copy_user_page,
+	.cpu_clear_user_highpage = v4_mc_clear_user_highpage,
+	.cpu_copy_user_highpage	= v4_mc_copy_user_highpage,
 };
diff --git a/arch/arm/mm/copypage-v4wb.S b/arch/arm/mm/copypage-v4wb.S
deleted file mode 100644
index 83117354b1c..00000000000
--- a/arch/arm/mm/copypage-v4wb.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- *  linux/arch/arm/lib/copypage.S
- *
- *  Copyright (C) 1995-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  ASM optimised string functions
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/asm-offsets.h>
-
-	.text
-	.align	5
-/*
- * ARMv4 optimised copy_user_page
- *
- * We flush the destination cache lines just before we write the data into the
- * corresponding address.  Since the Dcache is read-allocate, this removes the
- * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
- * and merged as appropriate.
- *
- * Note: We rely on all ARMv4 processors implementing the "invalidate D line"
- * instruction.  If your processor does not supply this, you have to write your
- * own copy_user_page that does the right thing.
- */
-ENTRY(v4wb_copy_user_page)
-	stmfd	sp!, {r4, lr}			@ 2
-	mov	r2, #PAGE_SZ/64			@ 1
-	ldmia	r1!, {r3, r4, ip, lr}		@ 4
-1:	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line
-	stmia	r0!, {r3, r4, ip, lr}		@ 4
-	ldmia	r1!, {r3, r4, ip, lr}		@ 4+1
-	stmia	r0!, {r3, r4, ip, lr}		@ 4
-	ldmia	r1!, {r3, r4, ip, lr}		@ 4
-	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line
-	stmia	r0!, {r3, r4, ip, lr}		@ 4
-	ldmia	r1!, {r3, r4, ip, lr}		@ 4
-	subs	r2, r2, #1			@ 1
-	stmia	r0!, {r3, r4, ip, lr}		@ 4
-	ldmneia	r1!, {r3, r4, ip, lr}		@ 4
-	bne	1b				@ 1
-	mcr	p15, 0, r1, c7, c10, 4		@ 1   drain WB
-	ldmfd	 sp!, {r4, pc}			@ 3
-
-	.align	5
-/*
- * ARMv4 optimised clear_user_page
- *
- * Same story as above.
- */
-ENTRY(v4wb_clear_user_page)
-	str	lr, [sp, #-4]!
-	mov	r1, #PAGE_SZ/64			@ 1
-	mov	r2, #0				@ 1
-	mov	r3, #0				@ 1
-	mov	ip, #0				@ 1
-	mov	lr, #0				@ 1
-1:	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	subs	r1, r1, #1			@ 1
-	bne	1b				@ 1
-	mcr	p15, 0, r1, c7, c10, 4		@ 1   drain WB
-	ldr	pc, [sp], #4
-
-	__INITDATA
-
-	.type	v4wb_user_fns, #object
-ENTRY(v4wb_user_fns)
-	.long	v4wb_clear_user_page
-	.long	v4wb_copy_user_page
-	.size	v4wb_user_fns, . - v4wb_user_fns
diff --git a/arch/arm/mm/copypage-v4wb.c b/arch/arm/mm/copypage-v4wb.c
new file mode 100644