64 files changed, 3994 insertions, 1694 deletions
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 3fd629d5a51..c348eaee7ee 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -43,7 +43,7 @@ config CPU_ARM740T
 	depends on !MMU
 	select CPU_32v4T
 	select CPU_ABRT_LV4T
-	select CPU_CACHE_V3	# although the core is v4t
+	select CPU_CACHE_V4
 	select CPU_CP15_MPU
 	select CPU_PABRT_LEGACY
 	help
@@ -264,7 +264,7 @@ config CPU_ARM1026
 
 # SA110
 config CPU_SA110
-	bool "Support StrongARM(R) SA-110 processor" if ARCH_RPC
+	bool
 	select CPU_32v3 if ARCH_RPC
 	select CPU_32v4 if !ARCH_RPC
 	select CPU_ABRT_EV4
@@ -392,40 +392,60 @@ config CPU_V7
 	select CPU_CACHE_V7
 	select CPU_CACHE_VIPT
 	select CPU_COPY_V6 if MMU
-	select CPU_CP15_MMU
+	select CPU_CP15_MMU if MMU
+	select CPU_CP15_MPU if !MMU
 	select CPU_HAS_ASID if MMU
 	select CPU_PABRT_V7
 	select CPU_TLB_V7 if MMU
 
+# ARMv7M
+config CPU_V7M
+	bool
+	select CPU_32v7M
+	select CPU_ABRT_NOMMU
+	select CPU_CACHE_NOP
+	select CPU_PABRT_LEGACY
+	select CPU_THUMBONLY
+
+config CPU_THUMBONLY
+	bool
+	# There are no CPUs available with MMU that don't implement an ARM ISA:
+	depends on !MMU
+	help
+	  Select this if your CPU doesn't support the 32 bit ARM instructions.
+
 # Figure out what processor architecture version we should be using.
 # This defines the compiler instruction set which depends on the machine type.
 config CPU_32v3
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v5
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEEDS_SYSCALL_FOR_CMPXCHG if SMP
+	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 
 config CPU_32v6
 	bool
-	select CPU_USE_DOMAINS if CPU_V6 && MMU
 	select TLS_REG_EMUL if !CPU_32v6K && !MMU
 
 config CPU_32v6K
@@ -434,6 +454,9 @@ config CPU_32v6K
 config CPU_32v7
 	bool
 
+config CPU_32v7M
+	bool
+
 # The abort model
 config CPU_ABRT_NOMMU
 	bool
@@ -469,9 +492,6 @@ config CPU_PABRT_V7
 	bool
 
 # The cache model
-config CPU_CACHE_V3
-	bool
-
 config CPU_CACHE_V4
 	bool
 
@@ -487,6 +507,9 @@ config CPU_CACHE_V6
 config CPU_CACHE_V7
 	bool
 
+config CPU_CACHE_NOP
+	bool
+
 config CPU_CACHE_VIVT
 	bool
 
@@ -608,8 +631,12 @@ config ARCH_DMA_ADDR_T_64BIT
 	bool
 
 config ARM_THUMB
-	bool "Support Thumb user binaries"
-	depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || CPU_V7 || CPU_FEROCEON
+	bool "Support Thumb user binaries" if !CPU_THUMBONLY
+	depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || \
+		CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || \
+		CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || \
+		CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || \
+		CPU_V7 || CPU_FEROCEON || CPU_V7M
 	default y
 	help
 	  Say Y if you want to include kernel support for running user space
@@ -629,8 +656,9 @@ config ARM_THUMBEE
 	  make use of it. Say N for code that can run on CPUs without ThumbEE.
 
 config ARM_VIRT_EXT
-	bool "Native support for the ARM Virtualization Extensions"
-	depends on MMU && CPU_V7
+	bool
+	depends on MMU
+	default y if CPU_V7
 	help
 	  Enable the kernel to make use of the ARM Virtualization
 	  Extensions to install hypervisors without run-time firmware
@@ -640,14 +668,9 @@ config ARM_VIRT_EXT
 	  use of this feature.  Refer to Documentation/arm/Booting for
 	  details.
 
-	  It is safe to enable this option even if the kernel may not be
-	  booted in HYP mode, may not have support for the
-	  virtualization extensions, or may be booted with a
-	  non-compliant bootloader.
-
 config SWP_EMULATE
 	bool "Emulate SWP/SWPB instructions"
-	depends on !CPU_USE_DOMAINS && CPU_V7
+	depends on CPU_V7
 	default y if SMP
 	select HAVE_PROC_CPU if PROC_FS
 	help
@@ -756,6 +779,7 @@ config CPU_BPREDICT_DISABLE
 
 config TLS_REG_EMUL
 	bool
+	select NEED_KUSER_HELPERS
 	help
 	  An SMP system using a pre-ARMv6 processor (there are apparently
 	  a few prototypes like that in existence) and therefore access to
@@ -763,11 +787,43 @@ config TLS_REG_EMUL
 
 config NEEDS_SYSCALL_FOR_CMPXCHG
 	bool
+	select NEED_KUSER_HELPERS
 	help
 	  SMP on a pre-ARMv6 processor?  Well OK then.
 	  Forget about fast user space cmpxchg support.
 	  It is just not possible.
 
+config NEED_KUSER_HELPERS
+	bool
+
+config KUSER_HELPERS
+	bool "Enable kuser helpers in vector page" if !NEED_KUSER_HELPERS
+	default y
+	help
+	  Warning: disabling this option may break user programs.
+
+	  Provide kuser helpers in the vector page.  The kernel provides
+	  helper code to userspace in read only form at a fixed location
+	  in the high vector page to allow userspace to be independent of
+	  the CPU type fitted to the system.  This permits binaries to be
+	  run on ARMv4 through to ARMv7 without modification.
+
+	  See Documentation/arm/kernel_user_helpers.txt for details.
+
+	  However, the fixed address nature of these helpers can be used
+	  by ROP (return orientated programming) authors when creating
+	  exploits.
+
+	  If all of the binaries and libraries which run on your platform
+	  are built specifically for your platform, and make no use of
+	  these helpers, then you can turn this option off to hinder
+	  such exploits. However, in that case, if a binary or library
+	  relying on those helpers is run, it will receive a SIGILL signal,
+	  which will terminate the program.
+
+	  Say N here only if you are absolutely certain that you do not
+	  need these helpers; otherwise, the safe option is to say Y.
+
 config DMA_CACHE_RWFO
 	bool "Enable read/write for ownership DMA cache maintenance"
 	depends on CPU_V6K && SMP
@@ -798,7 +854,7 @@ config OUTER_CACHE_SYNC
 
 config CACHE_FEROCEON_L2
 	bool "Enable the Feroceon L2 cache controller"
-	depends on ARCH_KIRKWOOD || ARCH_MV78XX0
+	depends on ARCH_KIRKWOOD || ARCH_MV78XX0 || ARCH_MVEBU
 	default y
 	select OUTER_CACHE
 	help
@@ -833,14 +889,64 @@ config CACHE_L2X0
 	help
 	  This option enables the L2x0 PrimeCell.
 
+if CACHE_L2X0
+
 config CACHE_PL310
 	bool
-	depends on CACHE_L2X0
 	default y if CPU_V7 && !(CPU_V6 || CPU_V6K)
 	help
 	  This option enables optimisations for the PL310 cache
 	  controller.
 
+config PL310_ERRATA_588369
+	bool "PL310 errata: Clean & Invalidate maintenance operations do not invalidate clean lines"
+	help
+	   The PL310 L2 cache controller implements three types of Clean &
+	   Invalidate maintenance operations: by Physical Address
+	   (offset 0x7F0), by Index/Way (0x7F8) and by Way (0x7FC).
+	   They are architecturally defined to behave as the execution of a
+	   clean operation followed immediately by an invalidate operation,
+	   both performing to the same memory location. This functionality
+	   is not correctly implemented in PL310 as clean lines are not
+	   invalidated as a result of these operations.
+
+config PL310_ERRATA_727915
+	bool "PL310 errata: Background Clean & Invalidate by Way operation can cause data corruption"
+	help
+	  PL310 implements the Clean & Invalidate by Way L2 cache maintenance
+	  operation (offset 0x7FC). This operation runs in background so that
+	  PL310 can handle normal accesses while it is in progress. Under very
+	  rare circumstances, due to this erratum, write data can be lost when
+	  PL310 treats a cacheable write transaction during a Clean &
+	  Invalidate by Way operation.
+
+config PL310_ERRATA_753970
+	bool "PL310 errata: cache sync operation may be faulty"
+	help
+	  This option enables the workaround for the 753970 PL310 (r3p0) erratum.
+
+	  Under some condition the effect of cache sync operation on
+	  the store buffer still remains when the operation completes.
+	  This means that the store buffer is always asked to drain and
+	  this prevents it from merging any further writes. The workaround
+	  is to replace the normal offset of cache sync operation (0x730)
+	  by another offset targeting an unmapped PL310 register 0x740.
+	  This has the same effect as the cache sync operation: store buffer
+	  drain and waiting for all buffers empty.
+
+config PL310_ERRATA_769419
+	bool "PL310 errata: no automatic Store Buffer drain"
+	help
+	  On revisions of the PL310 prior to r3p2, the Store Buffer does
+	  not automatically drain. This can cause normal, non-cacheable
+	  writes to be retained when the memory system is idle, leading
+	  to suboptimal I/O performance for drivers using coherent DMA.
+	  This option adds a write barrier to the cpu_idle loop so that,
+	  on systems with an outer cache, the store buffer is drained
+	  explicitly.
+
+endif
+
 config CACHE_TAUROS2
 	bool "Enable the Tauros2 L2 cache controller"
 	depends on (ARCH_DOVE || ARCH_MMP || CPU_PJ4)
@@ -895,3 +1001,9 @@ config ARCH_HAS_BARRIERS
 	help
 	  This option allows the use of custom mandatory barriers
 	  included via the mach/barriers.h file.
+
+config ARCH_SUPPORTS_BIG_ENDIAN
+	bool
+	help
+	  This option specifies the architecture can support big endian
+	  operation.
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 8a9c4cb50a9..91da64de440 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -6,16 +6,18 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   iomap.o
 
 obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
-				   mmap.o pgd.o mmu.o vmregion.o
+				   mmap.o pgd.o mmu.o
 
 ifneq ($(CONFIG_MMU),y)
 obj-y				+= nommu.o
 endif
 
+obj-$(CONFIG_ARM_PTDUMP)	+= dump.o
 obj-$(CONFIG_MODULES)		+= proc-syms.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)	+= alignment.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 
 obj-$(CONFIG_CPU_ABRT_NOMMU)	+= abort-nommu.o
 obj-$(CONFIG_CPU_ABRT_EV4)	+= abort-ev4.o
@@ -33,13 +35,13 @@ obj-$(CONFIG_CPU_PABRT_LEGACY)	+= pabort-legacy.o
 obj-$(CONFIG_CPU_PABRT_V6)	+= pabort-v6.o
 obj-$(CONFIG_CPU_PABRT_V7)	+= pabort-v7.o
 
-obj-$(CONFIG_CPU_CACHE_V3)	+= cache-v3.o
 obj-$(CONFIG_CPU_CACHE_V4)	+= cache-v4.o
 obj-$(CONFIG_CPU_CACHE_V4WT)	+= cache-v4wt.o
 obj-$(CONFIG_CPU_CACHE_V4WB)	+= cache-v4wb.o
 obj-$(CONFIG_CPU_CACHE_V6)	+= cache-v6.o
 obj-$(CONFIG_CPU_CACHE_V7)	+= cache-v7.o
 obj-$(CONFIG_CPU_CACHE_FA)	+= cache-fa.o
+obj-$(CONFIG_CPU_CACHE_NOP)	+= cache-nop.o
 
 AFLAGS_cache-v6.o	:=-Wa,-march=armv6
 AFLAGS_cache-v7.o	:=-Wa,-march=armv7-a
@@ -88,11 +90,13 @@ obj-$(CONFIG_CPU_FEROCEON)	+= proc-feroceon.o
 obj-$(CONFIG_CPU_V6)		+= proc-v6.o
 obj-$(CONFIG_CPU_V6K)		+= proc-v6.o
 obj-$(CONFIG_CPU_V7)		+= proc-v7.o
+obj-$(CONFIG_CPU_V7M)		+= proc-v7m.o
 
 AFLAGS_proc-v6.o	:=-Wa,-march=armv6
 AFLAGS_proc-v7.o	:=-Wa,-march=armv7-a
 
+obj-$(CONFIG_OUTER_CACHE)	+= l2c-common.o
 obj-$(CONFIG_CACHE_FEROCEON_L2)	+= cache-feroceon-l2.o
-obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o
+obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o l2c-l2x0-resume.o
 obj-$(CONFIG_CACHE_XSC3L2)	+= cache-xsc3l2.o
 obj-$(CONFIG_CACHE_TAUROS2)	+= cache-tauros2.o
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 80741992a9f..3815a8262af 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
 	bne	do_DataAbort
 	bic	r1, r1, #1 << 11		@ clear bit 11 of FSR
 	ldr	r3, [r4]			@ read aborted ARM instruction
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	rev	r3, r3
-#endif
+ ARM_BE8(rev	r3, r3)
+
 	do_ldrd_abort tmp=ip, insn=r3
 	tst	r3, #1 << 20			@ L = 0 -> write
 	orreq	r1, r1, #1 << 11		@ yes.
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index b820edaf318..b8cb1a2688a 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -25,8 +25,10 @@
 #include <asm/cp15.h>
 #include <asm/system_info.h>
 #include <asm/unaligned.h>
+#include <asm/opcodes.h>
 
 #include "fault.h"
+#include "mm.h"
 
 /*
  * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998
@@ -80,6 +82,7 @@ static unsigned long ai_word;
 static unsigned long ai_dword;
 static unsigned long ai_multi;
 static int ai_usermode;
+static unsigned long cr_no_alignment;
 
 core_param(alignment, ai_usermode, int, 0600);
 
@@ -90,7 +93,7 @@ core_param(alignment, ai_usermode, int, 0600);
 /* Return true if and only if the ARMv6 unaligned access model is in use. */
 static bool cpu_is_v6_unaligned(void)
 {
-	return cpu_architecture() >= CPU_ARCH_ARMv6 && (cr_alignment & CR_U);
+	return cpu_architecture() >= CPU_ARCH_ARMv6 && get_cr() & CR_U;
 }
 
 static int safe_usermode(int new_usermode, bool warn)
@@ -749,7 +752,6 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	unsigned long instr = 0, instrptr;
 	int (*handler)(unsigned long addr, unsigned long instr, struct pt_regs *regs);
 	unsigned int type;
-	mm_segment_t fs;
 	unsigned int fault;
 	u16 tinstr = 0;
 	int isize = 4;
@@ -760,26 +762,28 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 
 	instrptr = instruction_pointer(regs);
 
-	fs = get_fs();
-	set_fs(KERNEL_DS);
 	if (thumb_mode(regs)) {
-		fault = __get_user(tinstr, (u16 *)(instrptr & ~1));
+		u16 *ptr = (u16 *)(instrptr & ~1);
+		fault = probe_kernel_address(ptr, tinstr);
+		tinstr = __mem_to_opcode_thumb16(tinstr);
 		if (!fault) {
 			if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
 			    IS_T32(tinstr)) {
 				/* Thumb-2 32-bit */
 				u16 tinst2 = 0;
-				fault = __get_user(tinst2, (u16 *)(instrptr+2));
-				instr = (tinstr << 16) | tinst2;
+				fault = probe_kernel_address(ptr + 1, tinst2);
+				tinst2 = __mem_to_opcode_thumb16(tinst2);
+				instr = __opcode_thumb32_compose(tinstr, tinst2);
 				thumb2_32b = 1;
 			} else {
 				isize = 2;
 				instr = thumb2arm(tinstr);
 			}
 		}
-	} else
-		fault = __get_user(instr, (u32 *)instrptr);
-	set_fs(fs);
+	} else {
+		fault = probe_kernel_address(instrptr, instr);
+		instr = __mem_to_opcode_arm(instr);
+	}
 
 	if (fault) {
 		type = TYPE_FAULT;
@@ -947,6 +951,13 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	return 0;
 }
 
+static int __init noalign_setup(char *__unused)
+{
+	set_cr(__clear_cr(CR_A));
+	return 1;
+}
+__setup("noalign", noalign_setup);
+
 /*
  * This needs to be done after sysctl_init, otherwise sys/ will be
  * overwritten.  Actually, this shouldn't be in sys/ at all since
@@ -965,12 +976,12 @@ static int __init alignment_init(void)
 #endif
 
 	if (cpu_is_v6_unaligned()) {
-		cr_alignment &= ~CR_A;
-		cr_no_alignment &= ~CR_A;
-		set_cr(cr_alignment);
+		set_cr(__clear_cr(CR_A));
 		ai_usermode = safe_usermode(ai_usermode, false);
 	}
 
+	cr_no_alignment = get_cr() & ~CR_A;
+
 	hook_fault_code(FAULT_CODE_ALIGNMENT, do_alignment, SIGBUS, BUS_ADRALN,
 			"alignment exception");
 
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index dd3d59122cc..e028a7f2ebc 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -13,10 +13,15 @@
  */
 
 #include <linux/init.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/highmem.h>
+#include <linux/io.h>
 #include <asm/cacheflush.h>
 #include <asm/cp15.h>
-#include <plat/cache-feroceon-l2.h>
+#include <asm/hardware/cache-feroceon-l2.h>
+
+#define L2_WRITETHROUGH_KIRKWOOD	BIT(4)
 
 /*
  * Low-level cache maintenance operations.
@@ -331,7 +336,9 @@ static void __init enable_l2(void)
 			enable_icache();
 		if (d)
 			enable_dcache();
-	}
+	} else
+		pr_err(FW_BUG
+		       "Feroceon L2: bootloader left the L2 cache on!\n");
 }
 
 void __init feroceon_l2_init(int __l2_wt_override)
@@ -349,3 +356,41 @@ void __init feroceon_l2_init(int __l2_wt_override)
 	printk(KERN_INFO "Feroceon L2: Cache support initialised%s.\n",
 			 l2_wt_override ? ", in WT override mode" : "");
 }
+#ifdef CONFIG_OF
+static const struct of_device_id feroceon_ids[] __initconst = {
+	{ .compatible = "marvell,kirkwood-cache"},
+	{ .compatible = "marvell,feroceon-cache"},
+	{}
+};
+
+int __init feroceon_of_init(void)
+{
+	struct device_node *node;
+	void __iomem *base;
+	bool l2_wt_override = false;
+	struct resource res;
+
+#if defined(CONFIG_CACHE_FEROCEON_L2_WRITETHROUGH)
+	l2_wt_override = true;
+#endif
+
+	node = of_find_matching_node(NULL, feroceon_ids);
+	if (node && of_device_is_compatible(node, "marvell,kirkwood-cache")) {
+		if (of_address_to_resource(node, 0, &res))
+			return -ENODEV;
+
+		base = ioremap(res.start, resource_size(&res));
+		if (!base)
+			return -ENOMEM;
+
+		if (l2_wt_override)
+			writel(readl(base) | L2_WRITETHROUGH_KIRKWOOD, base);
+		else
+			writel(readl(base) & ~L2_WRITETHROUGH_KIRKWOOD, base);
+	}
+
+	feroceon_l2_init(l2_wt_override);
+
+	return 0;
+}
+#endif
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index c2f37390308..7c3fb41a462 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -16,17 +16,33 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
+#include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/init.h>
+#include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
+#include <asm/cputype.h>
 #include <asm/hardware/cache-l2x0.h>
+#include "cache-tauros3.h"
 #include "cache-aurora-l2.h"
 
+struct l2c_init_data {
+	const char *type;
+	unsigned way_size_0;
+	unsigned num_lock;
+	void (*of_parse)(const struct device_node *, u32 *, u32 *);
+	void (*enable)(void __iomem *, u32, unsigned);
+	void (*fixup)(void __iomem *, u32, struct outer_cache_fns *);
+	void (*save)(void __iomem *);
+	struct outer_cache_fns outer_cache;
+};
+
 #define CACHE_LINE_SIZE		32
 
 static void __iomem *l2x0_base;
@@ -35,96 +51,116 @@ static u32 l2x0_way_mask;	/* Bitmask of active ways */
 static u32 l2x0_size;
 static unsigned long sync_reg_offset = L2X0_CACHE_SYNC;
 
-/* Aurora don't have the cache ID register available, so we have to
- * pass it though the device tree */
-static u32  cache_id_part_number_from_dt;
-
 struct l2x0_regs l2x0_saved_regs;
 
-struct l2x0_of_data {
-	void (*setup)(const struct device_node *, u32 *, u32 *);
-	void (*save)(void);
-	struct outer_cache_fns outer_cache;
-};
-
-static bool of_init = false;
-
-static inline void cache_wait_way(void __iomem *reg, unsigned long mask)
+/*
+ * Common code for all cache controllers.
+ */
+static inline void l2c_wait_mask(void __iomem *reg, unsigned long mask)
 {
 	/* wait for cache operation by line or way to complete */
 	while (readl_relaxed(reg) & mask)
 		cpu_relax();
 }
 
-#ifdef CONFIG_CACHE_PL310
-static inline void cache_wait(void __iomem *reg, unsigned long mask)
+/*
+ * By default, we write directly to secure registers.  Platforms must
+ * override this if they are running non-secure.
+ */
+static void l2c_write_sec(unsigned long val, void __iomem *base, unsigned reg)
 {
-	/* cache operations by line are atomic on PL310 */
+	if (val == readl_relaxed(base + reg))
+		return;
+	if (outer_cache.write_sec)
+		outer_cache.write_sec(val, reg);
+	else
+		writel_relaxed(val, base + reg);
 }
-#else
-#define cache_wait	cache_wait_way
-#endif
 
-static inline void cache_sync(void)
+/*
+ * This should only be called when we have a requirement that the
+ * register be written due to a work-around, as platforms running
+ * in non-secure mode may not be able to access this register.
+ */
+static inline void l2c_set_debug(void __iomem *base, unsigned long val)
 {
-	void __iomem *base = l2x0_base;
-
-	writel_relaxed(0, base + sync_reg_offset);
-	cache_wait(base + L2X0_CACHE_SYNC, 1);
+	l2c_write_sec(val, base, L2X0_DEBUG_CTRL);
 }
 
-static inline void l2x0_clean_line(unsigned long addr)
+static void __l2c_op_way(void __iomem *reg)
 {
-	void __iomem *base = l2x0_base;
-	cache_wait(base + L2X0_CLEAN_LINE_PA, 1);
-	writel_relaxed(addr, base + L2X0_CLEAN_LINE_PA);
+	writel_relaxed(l2x0_way_mask, reg);
+	l2c_wait_mask(reg, l2x0_way_mask);
 }
 
-static inline void l2x0_inv_line(unsigned long addr)
+static inline void l2c_unlock(void __iomem *base, unsigned num)
 {
-	void __iomem *base = l2x0_base;
-	cache_wait(base + L2X0_INV_LINE_PA, 1);
-	writel_relaxed(addr, base + L2X0_INV_LINE_PA);
+	unsigned i;
+
+	for (i = 0; i < num; i++) {
+		writel_relaxed(0, base + L2X0_LOCKDOWN_WAY_D_BASE +
+			       i * L2X0_LOCKDOWN_STRIDE);
+		writel_relaxed(0, base + L2X0_LOCKDOWN_WAY_I_BASE +
+			       i * L2X0_LOCKDOWN_STRIDE);
+	}
 }
 
-#if defined(CONFIG_PL310_ERRATA_588369) || defined(CONFIG_PL310_ERRATA_727915)
-static inline void debug_writel(unsigned long val)
+/*
+ * Enable the L2 cache controller.  This function must only be
+ * called when the cache controller is known to be disabled.
+ */
+static void l2c_enable(void __iomem *base, u32 aux, unsigned num_lock)
 {
-	if (outer_cache.set_debug)
-		outer_cache.set_debug(val);
+	unsigned long flags;
+
+	l2c_write_sec(aux, base, L2X0_AUX_CTRL);
+
+	l2c_unlock(base, num_lock);
+
+	local_irq_save(flags);
+	__l2c_op_way(base + L2X0_INV_WAY);
+	writel_relaxed(0, base + sync_reg_offset);
+	l2c_wait_mask(base + sync_reg_offset, 1);
+	local_irq_restore(flags);
+
+	l2c_write_sec(L2X0_CTRL_EN, base, L2X0_CTRL);
 }
 
-static void pl310_set_debug(unsigned long val)
+static void l2c_disable(void)
 {
-	writel_relaxed(val, l2x0_base + L2X0_DEBUG_CTRL);
+	void __iomem *base = l2x0_base;
+
+	outer_cache.flush_all();
+	l2c_write_sec(0, base, L2X0_CTRL);
+	dsb(st);
 }
-#else
-/* Optimised out for non-errata case */
-static inline void debug_writel(unsigned long val)
+
+#ifdef CONFIG_CACHE_PL310
+static inline void cache_wait(void __iomem *reg, unsigned long mask)
 {
+	/* cache operations by line are atomic on PL310 */
 }
-
-#define pl310_set_debug	NULL
+#else
+#define cache_wait	l2c_wait_mask
 #endif
 
-#ifdef CONFIG_PL310_ERRATA_588369
-static inline void l2x0_flush_line(unsigned long addr)
+static inline void cache_sync(void)
 {
 	void __iomem *base = l2x0_base;
 
-	/* Clean by PA followed by Invalidate by PA */
-	cache_wait(base + L2X0_CLEAN_LINE_PA, 1);
-	writel_relaxed(addr, base + L2X0_CLEAN_LINE_PA);
-	cache_wait(base + L2X0_INV_LINE_PA, 1);
-	writel_relaxed(addr, base + L2X0_INV_LINE_PA);
+	writel_relaxed(0, base + sync_reg_offset);
+	cache_wait(base + L2X0_CACHE_SYNC, 1);
 }
-#else
 
-static inline void l2x0_flush_line(unsigned long addr)
+#if defined(CONFIG_PL310_ERRATA_588369) || defined(CONFIG_PL310_ERRATA_727915)
+static inline void debug_writel(unsigned long val)
+{
+	l2c_set_debug(l2x0_base, val);
+}
+#else
+/* Optimised out for non-errata case */
+static inline void debug_writel(unsigned long val)
 {
-	void __iomem *base = l2x0_base;
-	cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1);
-	writel_relaxed(addr, base + L2X0_CLEAN_INV_LINE_PA);
 }
 #endif
 
@@ -140,8 +176,7 @@ static void l2x0_cache_sync(void)
 static void __l2x0_flush_all(void)
 {
 	debug_writel(0x03);
-	writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_CLEAN_INV_WAY);
-	cache_wait_way(l2x0_base + L2X0_CLEAN_INV_WAY, l2x0_way_mask);
+	__l2c_op_way(l2x0_base + L2X0_CLEAN_INV_WAY);
 	cache_sync();
 	debug_writel(0x00);
 }
@@ -156,276 +191,910 @@ static void l2x0_flush_all(void)
 	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2x0_clean_all(void)
+static void l2x0_disable(void)
 {
 	unsigned long flags;
 
-	/* clean all ways */
 	raw_spin_lock_irqsave(&l2x0_lock, flags);
-	writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_CLEAN_WAY);
-	cache_wait_way(l2x0_base + L2X0_CLEAN_WAY, l2x0_way_mask);
-	cache_sync();
+	__l2x0_flush_all();
+	l2c_write_sec(0, l2x0_base, L2X0_CTRL);
+	dsb(st);
 	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2x0_inv_all(void)
+static void l2c_save(void __iomem *base)
 {
-	unsigned long flags;
+	l2x0_saved_regs.aux_ctrl = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+}
 
-	/* invalidate all ways */
-	raw_spin_lock_irqsave(&l2x0_lock, flags);
-	/* Invalidating when L2 is enabled is a nono */
-	BUG_ON(readl(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN);
-	writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_INV_WAY);
-	cache_wait_way(l2x0_base + L2X0_INV_WAY, l2x0_way_mask);
-	cache_sync();
-	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+/*
+ * L2C-210 specific code.
+ *
+ * The L2C-2x0 PA, set/way and sync operations are atomic, but we must
+ * ensure that no background operation is running.  The way operations
+ * are all background tasks.
+ *
+ * While a background operation is in progress, any new operation is
+ * ignored (unspecified whether this causes an error.)  Thankfully, not
+ * used on SMP.
+ *
+ * Never has a different sync register other than L2X0_CACHE_SYNC, but
+ * we use sync_reg_offset here so we can share some of this with L2C-310.
+ */
+static void __l2c210_cache_sync(void __iomem *base)
+{
+	writel_relaxed(0, base + sync_reg_offset);
 }
 
-static void l2x0_inv_range(unsigned long start, unsigned long end)
+static void __l2c210_op_pa_range(void __iomem *reg, unsigned long start,
+	unsigned long end)
+{
+	while (start < end) {
+		writel_relaxed(start, reg);
+		start += CACHE_LINE_SIZE;
+	}
+}
+
+static void l2c210_inv_range(unsigned long start, unsigned long end)
 {
 	void __iomem *base = l2x0_base;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&l2x0_lock, flags);
 	if (start & (CACHE_LINE_SIZE - 1)) {
 		start &= ~(CACHE_LINE_SIZE - 1);
-		debug_writel(0x03);
-		l2x0_flush_line(start);
-		debug_writel(0x00);
+		writel_relaxed(start, base + L2X0_CLEAN_INV_LINE_PA);
 		start += CACHE_LINE_SIZE;
 	}
 
 	if (end & (CACHE_LINE_SIZE - 1)) {
 		end &= ~(CACHE_LINE_SIZE - 1);
-		debug_writel(0x03);
-		l2x0_flush_line(end);
-		debug_writel(0x00);
+		writel_relaxed(end, base + L2X0_CLEAN_INV_LINE_PA);
 	}
 
+	__l2c210_op_pa_range(base + L2X0_INV_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
+}
+
+static void l2c210_clean_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+
+	start &= ~(CACHE_LINE_SIZE - 1);
+	__l2c210_op_pa_range(base + L2X0_CLEAN_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
+}
+
+static void l2c210_flush_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+
+	start &= ~(CACHE_LINE_SIZE - 1);
+	__l2c210_op_pa_range(base + L2X0_CLEAN_INV_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
+}
+
+static void l2c210_flush_all(void)
+{
+	void __iomem *base = l2x0_base;
+
+	BUG_ON(!irqs_disabled());
+
+	__l2c_op_way(base + L2X0_CLEAN_INV_WAY);
+	__l2c210_cache_sync(base);
+}
+
+static void l2c210_sync(void)
+{
+	__l2c210_cache_sync(l2x0_base);
+}
+
+static void l2c210_resume(void)
+{
+	void __iomem *base = l2x0_base;
+
+	if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN))
+		l2c_enable(base, l2x0_saved_regs.aux_ctrl, 1);
+}
+
+static const struct l2c_init_data l2c210_data __initconst = {
+	.type = "L2C-210",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.enable = l2c_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all = l2c210_flush_all,
+		.disable = l2c_disable,
+		.sync = l2c210_sync,
+		.resume = l2c210_resume,
+	},
+};
+
+/*
+ * L2C-220 specific code.
+ *
+ * All operations are background operations: they have to be waited for.
+ * Conflicting requests generate a slave error (which will cause an
+ * imprecise abort.)  Never uses sync_reg_offset, so we hard-code the
+ * sync register here.
+ *
+ * However, we can re-use the l2c210_resume call.
+ */
+static inline void __l2c220_cache_sync(void __iomem *base)
+{
+	writel_relaxed(0, base + L2X0_CACHE_SYNC);
+	l2c_wait_mask(base + L2X0_CACHE_SYNC, 1);
+}
+
+static void l2c220_op_way(void __iomem *base, unsigned reg)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	__l2c_op_way(base + reg);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+static unsigned long l2c220_op_pa_range(void __iomem *reg, unsigned long start,
+	unsigned long end, unsigned long flags)
+{
+	raw_spinlock_t *lock = &l2x0_lock;
+
 	while (start < end) {
 		unsigned long blk_end = start + min(end - start, 4096UL);
 
 		while (start < blk_end) {
-			l2x0_inv_line(start);
+			l2c_wait_mask(reg, 1);
+			writel_relaxed(start, reg);
 			start += CACHE_LINE_SIZE;
 		}
 
 		if (blk_end < end) {
-			raw_spin_unlock_irqrestore(&l2x0_lock, flags);
-			raw_spin_lock_irqsave(&l2x0_lock, flags);
+			raw_spin_unlock_irqrestore(lock, flags);
+			raw_spin_lock_irqsave(lock, flags);
 		}
 	}
-	cache_wait(base + L2X0_INV_LINE_PA, 1);
-	cache_sync();
-	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+
+	return flags;
 }
 
-static void l2x0_clean_range(unsigned long start, unsigned long end)
+static void l2c220_inv_range(unsigned long start, unsigned long end)
 {
 	void __iomem *base = l2x0_base;
 	unsigned long flags;
 
-	if ((end - start) >= l2x0_size) {
-		l2x0_clean_all();
-		return;
-	}
-
 	raw_spin_lock_irqsave(&l2x0_lock, flags);
-	start &= ~(CACHE_LINE_SIZE - 1);
-	while (start < end) {
-		unsigned long blk_end = start + min(end - start, 4096UL);
-
-		while (start < blk_end) {
-			l2x0_clean_line(start);
+	if ((start | end) & (CACHE_LINE_SIZE - 1)) {
+		if (start & (CACHE_LINE_SIZE - 1)) {
+			start &= ~(CACHE_LINE_SIZE - 1);
+			writel_relaxed(start, base + L2X0_CLEAN_INV_LINE_PA);
 			start += CACHE_LINE_SIZE;
 		}
 
-		if (blk_end < end) {
-			raw_spin_unlock_irqrestore(&l2x0_lock, flags);
-			raw_spin_lock_irqsave(&l2x0_lock, flags);
+		if (end & (CACHE_LINE_SIZE - 1)) {
+			end &= ~(CACHE_LINE_SIZE - 1);
+			l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1);
+			writel_relaxed(end, base + L2X0_CLEAN_INV_LINE_PA);
 		}
 	}
-	cache_wait(base + L2X0_CLEAN_LINE_PA, 1);
-	cache_sync();
+
+	flags = l2c220_op_pa_range(base + L2X0_INV_LINE_PA,
+				   start, end, flags);
+	l2c_wait_mask(base + L2X0_INV_LINE_PA, 1);
+	__l2c220_cache_sync(base);
 	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2x0_flush_range(unsigned long start, unsigned long end)
+static void l2c220_clean_range(unsigned long start, unsigned long end)
 {
 	void __iomem *base = l2x0_base;
 	unsigned long flags;
 
+	start &= ~(CACHE_LINE_SIZE - 1);
 	if ((end - start) >= l2x0_size) {
-		l2x0_flush_all();
+		l2c220_op_way(base, L2X0_CLEAN_WAY);
 		return;
 	}
 
 	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	flags = l2c220_op_pa_range(base + L2X0_CLEAN_LINE_PA,
+				   start, end, flags);
+	l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+static void l2c220_flush_range(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+	unsigned long flags;
+
 	start &= ~(CACHE_LINE_SIZE - 1);
+	if ((end - start) >= l2x0_size) {
+		l2c220_op_way(base, L2X0_CLEAN_INV_WAY);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	flags = l2c220_op_pa_range(base + L2X0_CLEAN_INV_LINE_PA,
+				   start, end, flags);
+	l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1);
+	__l2c220_cache_sync(base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+static void l2c220_flush_all(void)
+{
+	l2c220_op_way(l2x0_base, L2X0_CLEAN_INV_WAY);
+}
+
+static void l2c220_sync(void)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	__l2c220_cache_sync(l2x0_base);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+}
+
+static void l2c220_enable(void __iomem *base, u32 aux, unsigned num_lock)
+{
+	/*
+	 * Always enable non-secure access to the lockdown registers -
+	 * we write to them as part of the L2C enable sequence so they
+	 * need to be accessible.
+	 */
+	aux |= L220_AUX_CTRL_NS_LOCKDOWN;
+
+	l2c_enable(base, aux, num_lock);
+}
+
+static const struct l2c_init_data l2c220_data = {
+	.type = "L2C-220",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.enable = l2c220_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range = l2c220_inv_range,
+		.clean_range = l2c220_clean_range,
+		.flush_range = l2c220_flush_range,
+		.flush_all = l2c220_flush_all,
+		.disable = l2c_disable,
+		.sync = l2c220_sync,
+		.resume = l2c210_resume,
+	},
+};
+
+/*
+ * L2C-310 specific code.
+ *
+ * Very similar to L2C-210, the PA, set/way and sync operations are atomic,
+ * and the way operations are all background tasks.  However, issuing an
+ * operation while a background operation is in progress results in a
+ * SLVERR response.  We can reuse:
+ *
+ *  __l2c210_cache_sync (using sync_reg_offset)
+ *  l2c210_sync
+ *  l2c210_inv_range (if 588369 is not applicable)
+ *  l2c210_clean_range
+ *  l2c210_flush_range (if 588369 is not applicable)
+ *  l2c210_flush_all (if 727915 is not applicable)
+ *
+ * Errata:
+ * 588369: PL310 R0P0->R1P0, fixed R2P0.
+ *	Affects: all clean+invalidate operations
+ *	clean and invalidate skips the invalidate step, so we need to issue
+ *	separate operations.  We also require the above debug workaround
+ *	enclosing this code fragment on affected parts.  On unaffected parts,
+ *	we must not use this workaround without the debug register writes
+ *	to avoid exposing a problem similar to 727915.
+ *
+ * 727915: PL310 R2P0->R3P0, fixed R3P1.
+ *	Affects: clean+invalidate by way
+ *	clean and invalidate by way runs in the background, and a store can
+ *	hit the line between the clean operation and invalidate operation,
+ *	resulting in the store being lost.
+ *
+ * 752271: PL310 R3P0->R3P1-50REL0, fixed R3P2.
+ *	Affects: 8x64-bit (double fill) line fetches
+ *	double fill line fetches can fail to cause dirty data to be evicted
+ *	from the cache before the new data overwrites the second line.
+ *
+ * 753970: PL310 R3P0, fixed R3P1.
+ *	Affects: sync
+ *	prevents merging writes after the sync operation, until another L2C
+ *	operation is performed (or a number of other conditions.)
+ *
+ * 769419: PL310 R0P0->R3P1, fixed R3P2.
+ *	Affects: store buffer
+ *	store buffer is not automatically drained.
+ */
+static void l2c310_inv_range_erratum(unsigned long start, unsigned long end)
+{
+	void __iomem *base = l2x0_base;
+
+	if ((start | end) & (CACHE_LINE_SIZE - 1)) {
+		unsigned long flags;
+
+		/* Erratum 588369 for both clean+invalidate operations */
+		raw_spin_lock_irqsave(&l2x0_lock, flags);
+		l2c_set_debug(base, 0x03);
+
+		if (start & (CACHE_LINE_SIZE - 1)) {
+			start &= ~(CACHE_LINE_SIZE - 1);
+			writel_relaxed(start, base + L2X0_CLEAN_LINE_PA);
+			writel_relaxed(start, base + L2X0_INV_LINE_PA);
+			start += CACHE_LINE_SIZE;
+		}
+
+		if (end & (CACHE_LINE_SIZE - 1)) {
+			end &= ~(CACHE_LINE_SIZE - 1);
+			writel_relaxed(end, base + L2X0_CLEAN_LINE_PA);
+			writel_relaxed(end, base + L2X0_INV_LINE_PA);
+		}
+
+		l2c_set_debug(base, 0x00);
+		raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+	}
+
+	__l2c210_op_pa_range(base + L2X0_INV_LINE_PA, start, end);
+	__l2c210_cache_sync(base);
+}
+
+static void l2c310_flush_range_erratum(unsigned long start, unsigned long end)
+{
+	raw_spinlock_t *lock = &l2x0_lock;
+	unsigned long flags;
+	void __iomem *base = l2x0_base;
+
+	raw_spin_lock_irqsave(lock, flags);
 	while (start < end) {
 		unsigned long blk_end = start + min(end - start, 4096UL);
 
-		debug_writel(0x03);
+		l2c_set_debug(base, 0x03);
 		while (start < blk_end) {
-			l2x0_flush_line(start);
+			writel_relaxed(start, base + L2X0_CLEAN_LINE_PA);
+			writel_relaxed(start, base + L2X0_INV_LINE_PA);
 			start += CACHE_LINE_SIZE;
 		}
-		debug_writel(0x00);
+		l2c_set_debug(base, 0x00);
 
 		if (blk_end < end) {
-			raw_spin_unlock_irqrestore(&l2x0_lock, flags);
-			raw_spin_lock_irqsave(&l2x0_lock, flags);
+			raw_spin_unlock_irqrestore(lock, flags);
+			raw_spin_lock_irqsave(lock, flags);
 		}
 	}
-	cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1);
-	cache_sync();
-	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+	raw_spin_unlock_irqrestore(lock, flags);
+	__l2c210_cache_sync(base);
 }
 
-static void l2x0_disable(void)
+static void l2c310_flush_all_erratum(void)
 {
+	void __iomem *base = l2x0_base;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&l2x0_lock, flags);
-	__l2x0_flush_all();
-	writel_relaxed(0, l2x0_base + L2X0_CTRL);
-	dsb();
+	l2c_set_debug(base, 0x03);
+	__l2c_op_way(base + L2X0_CLEAN_INV_WAY);
+	l2c_set_debug(base, 0x00);
+	__l2c210_cache_sync(base);
 	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2x0_unlock(u32 cache_id)
+static void __init l2c310_save(void __iomem *base)
 {
-	int lockregs;
-	int i;
+	unsigned revision;
 
-	switch (cache_id) {
-	case L2X0_CACHE_ID_PART_L310:
-		lockregs = 8;
-		break;
-	case AURORA_CACHE_ID:
-		lockregs = 4;
+	l2c_save(base);
+
+	l2x0_saved_regs.tag_latency = readl_relaxed(base +
+		L310_TAG_LATENCY_CTRL);
+	l2x0_saved_regs.data_latency = readl_relaxed(base +
+		L310_DATA_LATENCY_CTRL);
+	l2x0_saved_regs.filter_end = readl_relaxed(base +
+		L310_ADDR_FILTER_END);
+	l2x0_saved_regs.filter_start = readl_relaxed(base +
+		L310_ADDR_FILTER_START);
+
+	revision = readl_relaxed(base + L2X0_CACHE_ID) &
+			L2X0_CACHE_ID_RTL_MASK;
+
+	/* From r2p0, there is Prefetch offset/control register */
+	if (revision >= L310_CACHE_ID_RTL_R2P0)
+		l2x0_saved_regs.prefetch_ctrl = readl_relaxed(base +
+							L310_PREFETCH_CTRL);
+
+	/* From r3p0, there is Power control register */
+	if (revision >= L310_CACHE_ID_RTL_R3P0)
+		l2x0_saved_regs.pwr_ctrl = readl_relaxed(base +
+							L310_POWER_CTRL);
+}
+
+static void l2c310_resume(void)
+{
+	void __iomem *base = l2x0_base;
+
+	if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		unsigned revision;
+
+		/* restore pl310 setup */
+		writel_relaxed(l2x0_saved_regs.tag_latency,
+			       base + L310_TAG_LATENCY_CTRL);
+		writel_relaxed(l2x0_saved_regs.data_latency,
+			       base + L310_DATA_LATENCY_CTRL);
+		writel_relaxed(l2x0_saved_regs.filter_end,
+			       base + L310_ADDR_FILTER_END);
+		writel_relaxed(l2x0_saved_regs.filter_start,
+			       base + L310_ADDR_FILTER_START);
+
+		revision = readl_relaxed(base + L2X0_CACHE_ID) &
+				L2X0_CACHE_ID_RTL_MASK;
+
+		if (revision >= L310_CACHE_ID_RTL_R2P0)
+			l2c_write_sec(l2x0_saved_regs.prefetch_ctrl, base,
+				      L310_PREFETCH_CTRL);
+		if (revision >= L310_CACHE_ID_RTL_R3P0)
+			l2c_write_sec(l2x0_saved_regs.pwr_ctrl, base,
+				      L310_POWER_CTRL);
+
+		l2c_enable(base, l2x0_saved_regs.aux_ctrl, 8);
+
+		/* Re-enable full-line-of-zeros for Cortex-A9 */
+		if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_FULL_LINE_ZERO)
+			set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1));
+	}
+}
+
+static int l2c310_cpu_enable_flz(struct notifier_block *nb, unsigned long act, void *data)
+{
+	switch (act & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1));
 		break;
-	default:
-		/* L210 and unknown types */
-		lockregs = 1;
+	case CPU_DYING:
+		set_auxcr(get_auxcr() & ~(BIT(3) | BIT(2) | BIT(1)));
 		break;
 	}
+	return NOTIFY_OK;
+}
+
+static void __init l2c310_enable(void __iomem *base, u32 aux, unsigned num_lock)
+{
+	unsigned rev = readl_relaxed(base + L2X0_CACHE_ID) & L2X0_CACHE_ID_RTL_MASK;
+	bool cortex_a9 = read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9;
+
+	if (rev >= L310_CACHE_ID_RTL_R2P0) {
+		if (cortex_a9) {
+			aux |= L310_AUX_CTRL_EARLY_BRESP;
+			pr_info("L2C-310 enabling early BRESP for Cortex-A9\n");
+		} else if (aux & L310_AUX_CTRL_EARLY_BRESP) {
+			pr_warn("L2C-310 early BRESP only supported with Cortex-A9\n");
+			aux &= ~L310_AUX_CTRL_EARLY_BRESP;
+		}
+	}
 
-	for (i = 0; i < lockregs; i++) {
-		writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_D_BASE +
-			       i * L2X0_LOCKDOWN_STRIDE);
-		writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_I_BASE +
-			       i * L2X0_LOCKDOWN_STRIDE);
+	if (cortex_a9) {
+		u32 aux_cur = readl_relaxed(base + L2X0_AUX_CTRL);
+		u32 acr = get_auxcr();
+
+		pr_debug("Cortex-A9 ACR=0x%08x\n", acr);
+
+		if (acr & BIT(3) && !(aux_cur & L310_AUX_CTRL_FULL_LINE_ZERO))
+			pr_err("L2C-310: full line of zeros enabled in Cortex-A9 but not L2C-310 - invalid\n");
+
+		if (aux & L310_AUX_CTRL_FULL_LINE_ZERO && !(acr & BIT(3)))
+			pr_err("L2C-310: enabling full line of zeros but not enabled in Cortex-A9\n");
+
+		if (!(aux & L310_AUX_CTRL_FULL_LINE_ZERO) && !outer_cache.write_sec) {
+			aux |= L310_AUX_CTRL_FULL_LINE_ZERO;
+			pr_info("L2C-310 full line of zeros enabled for Cortex-A9\n");
+		}
+	} else if (aux & (L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP)) {
+		pr_err("L2C-310: disabling Cortex-A9 specific feature bits\n");
+		aux &= ~(L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP);
+	}
+
+	if (aux & (L310_AUX_CTRL_DATA_PREFETCH | L310_AUX_CTRL_INSTR_PREFETCH)) {
+		u32 prefetch = readl_relaxed(base + L310_PREFETCH_CTRL);
+
+		pr_info("L2C-310 %s%s prefetch enabled, offset %u lines\n",
+			aux & L310_AUX_CTRL_INSTR_PREFETCH ? "I" : "",
+			aux & L310_AUX_CTRL_DATA_PREFETCH ? "D" : "",
+			1 + (prefetch & L310_PREFETCH_CTRL_OFFSET_MASK));
+	}
+
+	/* r3p0 or later has power control register */
+	if (rev >= L310_CACHE_ID_RTL_R3P0) {
+		u32 power_ctrl;
+
+		l2c_write_sec(L310_DYNAMIC_CLK_GATING_EN | L310_STNDBY_MODE_EN,
+			      base, L310_POWER_CTRL);
+		power_ctrl = readl_relaxed(base + L310_POWER_CTRL);
+		pr_info("L2C-310 dynamic clock gating %sabled, standby mode %sabled\n",
+			power_ctrl & L310_DYNAMIC_CLK_GATING_EN ? "en" : "dis",
+			power_ctrl & L310_STNDBY_MODE_EN ? "en" : "dis");
+	}
+
+	/*
+	 * Always enable non-secure access to the lockdown registers -
+	 * we write to them as part of the L2C enable sequence so they
+	 * need to be accessible.
+	 */
+	aux |= L310_AUX_CTRL_NS_LOCKDOWN;
+
+	l2c_enable(base, aux, num_lock);
+
+	if (aux & L310_AUX_CTRL_FULL_LINE_ZERO) {
+		set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1));
+		cpu_notifier(l2c310_cpu_enable_flz, 0);
 	}
 }
 
-void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
+static void __init l2c310_fixup(void __iomem *base, u32 cache_id,
+	struct outer_cache_fns *fns)
 {
-	u32 aux;
-	u32 cache_id;
-	u32 way_size = 0;
-	int ways;
-	int way_size_shift = L2X0_WAY_SIZE_SHIFT;
-	const char *type;
+	unsigned revision = cache_id & L2X0_CACHE_ID_RTL_MASK;
+	const char *errata[8];
+	unsigned n = 0;
+
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_588369) &&
+	    revision < L310_CACHE_ID_RTL_R2P0 &&
+	    /* For bcm compatibility */
+	    fns->inv_range == l2c210_inv_range) {
+		fns->inv_range = l2c310_inv_range_erratum;
+		fns->flush_range = l2c310_flush_range_erratum;
+		errata[n++] = "588369";
+	}
 
-	l2x0_base = base;
-	if (cache_id_part_number_from_dt)
-		cache_id = cache_id_part_number_from_dt;
-	else
-		cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID)
-			& L2X0_CACHE_ID_PART_MASK;
-	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_727915) &&
+	    revision >= L310_CACHE_ID_RTL_R2P0 &&
+	    revision < L310_CACHE_ID_RTL_R3P1) {
+		fns->flush_all = l2c310_flush_all_erratum;
+		errata[n++] = "727915";
+	}
+
+	if (revision >= L310_CACHE_ID_RTL_R3P0 &&
+	    revision < L310_CACHE_ID_RTL_R3P2) {
+		u32 val = readl_relaxed(base + L310_PREFETCH_CTRL);
+		/* I don't think bit23 is required here... but iMX6 does so */
+		if (val & (BIT(30) | BIT(23))) {
+			val &= ~(BIT(30) | BIT(23));
+			l2c_write_sec(val, base, L310_PREFETCH_CTRL);
+			errata[n++] = "752271";
+		}
+	}
+
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_753970) &&
+	    revision == L310_CACHE_ID_RTL_R3P0) {
+		sync_reg_offset = L2X0_DUMMY_REG;
+		errata[n++] = "753970";
+	}
+
+	if (IS_ENABLED(CONFIG_PL310_ERRATA_769419))
+		errata[n++] = "769419";
 
+	if (n) {
+		unsigned i;
+
+		pr_info("L2C-310 errat%s", n > 1 ? "a" : "um");
+		for (i = 0; i < n; i++)
+			pr_cont(" %s", errata[i]);
+		pr_cont(" enabled\n");
+	}
+}
+
+static void l2c310_disable(void)
+{
+	/*
+	 * If full-line-of-zeros is enabled, we must first disable it in the
+	 * Cortex-A9 auxiliary control register before disabling the L2 cache.
+	 */
+	if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_FULL_LINE_ZERO)
+		set_auxcr(get_auxcr() & ~(BIT(3) | BIT(2) | BIT(1)));
+
+	l2c_disable();
+}
+
+static const struct l2c_init_data l2c310_init_fns __initconst = {
+	.type = "L2C-310",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.enable = l2c310_enable,
+	.fixup = l2c310_fixup,
+	.save = l2c310_save,
+	.outer_cache = {
+		.inv_range = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all = l2c210_flush_all,
+		.disable = l2c310_disable,
+		.sync = l2c210_sync,
+		.resume = l2c310_resume,
+	},
+};
+
+static void __init __l2c_init(const struct l2c_init_data *data,
+	u32 aux_val, u32 aux_mask, u32 cache_id)
+{
+	struct outer_cache_fns fns;
+	unsigned way_size_bits, ways;
+	u32 aux, old_aux;
+
+	/*
+	 * Sanity check the aux values.  aux_mask is the bits we preserve
+	 * from reading the hardware register, and aux_val is the bits we
+	 * set.
+	 */
+	if (aux_val & aux_mask)
+		pr_alert("L2C: platform provided aux values permit register corruption.\n");
+
+	old_aux = aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
 	aux &= aux_mask;
 	aux |= aux_val;
 
+	if (old_aux != aux)
+		pr_warn("L2C: DT/platform modifies aux control register: 0x%08x -> 0x%08x\n",
+		        old_aux, aux);
+
 	/* Determine the number of ways */
-	switch (cache_id) {
+	switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
 	case L2X0_CACHE_ID_PART_L310:
+		if ((aux_val | ~aux_mask) & (L2C_AUX_CTRL_WAY_SIZE_MASK | L310_AUX_CTRL_ASSOCIATIVITY_16))
+			pr_warn("L2C: DT/platform tries to modify or specify cache size\n");
 		if (aux & (1 << 16))
 			ways = 16;
 		else
 			ways = 8;
-		type = "L310";
-#ifdef CONFIG_PL310_ERRATA_753970
-		/* Unmapped register. */
-		sync_reg_offset = L2X0_DUMMY_REG;
-#endif
-		if ((cache_id & L2X0_CACHE_ID_RTL_MASK) <= L2X0_CACHE_ID_RTL_R3P0)
-			outer_cache.set_debug = pl310_set_debug;
 		break;
+
 	case L2X0_CACHE_ID_PART_L210:
+	case L2X0_CACHE_ID_PART_L220:
 		ways = (aux >> 13) & 0xf;
-		type = "L210";
 		break;
 
 	case AURORA_CACHE_ID:
-		sync_reg_offset = AURORA_SYNC_REG;
 		ways = (aux >> 13) & 0xf;
 		ways = 2 << ((ways + 1) >> 2);
-		way_size_shift = AURORA_WAY_SIZE_SHIFT;
-		type = "Aurora";
 		break;
+
 	default:
 		/* Assume unknown chips have 8 ways */
 		ways = 8;
-		type = "L2x0 series";
 		break;
 	}
 
 	l2x0_way_mask = (1 << ways) - 1;
 
 	/*
-	 * L2 cache Size =  Way size * Number of ways
+	 * way_size_0 is the size that a way_size value of zero would be
+	 * given the calculation: way_size = way_size_0 << way_size_bits.
+	 * So, if way_size_bits=0 is reserved, but way_size_bits=1 is 16k,
+	 * then way_size_0 would be 8k.
+	 *
+	 * L2 cache size = number of ways * way size.
 	 */
-	way_size = (aux & L2X0_AUX_CTRL_WAY_SIZE_MASK) >> 17;
-	way_size = 1 << (way_size + way_size_shift);
+	way_size_bits = (aux & L2C_AUX_CTRL_WAY_SIZE_MASK) >>
+			L2C_AUX_CTRL_WAY_SIZE_SHIFT;
+	l2x0_size = ways * (data->way_size_0 << way_size_bits);
 
-	l2x0_size = ways * way_size * SZ_1K;
+	fns = data->outer_cache;
+	fns.write_sec = outer_cache.write_sec;
+	if (data->fixup)
+		data->fixup(l2x0_base, cache_id, &fns);
 
 	/*
-	 * Check if l2x0 controller is already enabled.
-	 * If you are booting from non-secure mode
-	 * accessing the below registers will fault.
+	 * Check if l2x0 controller is already enabled.  If we are booting
+	 * in non-secure mode accessing the below registers will fault.
 	 */
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
-		/* Make sure that I&D is not locked down when starting */
-		l2x0_unlock(cache_id);
-
-		/* l2x0 controller is disabled */
-		writel_relaxed(aux, l2x0_base + L2X0_AUX_CTRL);
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN))
+		data->enable(l2x0_base, aux, data->num_lock);
 
-		l2x0_inv_all();
+	outer_cache = fns;
 
-		/* enable L2X0 */
-		writel_relaxed(L2X0_CTRL_EN, l2x0_base + L2X0_CTRL);
-	}
+	/*
+	 * It is strange to save the register state before initialisation,
+	 * but hey, this is what the DT implementations decided to do.
+	 */
+	if (data->save)
+		data->save(l2x0_base);
 
 	/* Re-read it in case some bits are reserved. */
 	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
 
-	/* Save the value for resuming. */
-	l2x0_saved_regs.aux_ctrl = aux;
+	pr_info("%s cache controller enabled, %d ways, %d kB\n",
+		data->type, ways, l2x0_size >> 10);
+	pr_info("%s: CACHE_ID 0x%08x, AUX_CTRL 0x%08x\n",
+		data->type, cache_id, aux);
+}
+
+void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
+{
+	const struct l2c_init_data *data;
+	u32 cache_id;
 
-	if (!of_init) {
-		outer_cache.inv_range = l2x0_inv_range;
-		outer_cache.clean_range = l2x0_clean_range;
-		outer_cache.flush_range = l2x0_flush_range;
-		outer_cache.sync = l2x0_cache_sync;
-		outer_cache.flush_all = l2x0_flush_all;
-		outer_cache.inv_all = l2x0_inv_all;
-		outer_cache.disable = l2x0_disable;
+	l2x0_base = base;
+
+	cache_id = readl_relaxed(base + L2X0_CACHE_ID);
+
+	switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
+	default:
+	case L2X0_CACHE_ID_PART_L210:
+		data = &l2c210_data;
+		break;
+
+	case L2X0_CACHE_ID_PART_L220:
+		data = &l2c220_data;
+		break;
+
+	case L2X0_CACHE_ID_PART_L310:
+		data = &l2c310_init_fns;
+		break;
 	}
 
-	printk(KERN_INFO "%s cache controller enabled\n", type);
-	printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
-			ways, cache_id, aux, l2x0_size);
+	__l2c_init(data, aux_val, aux_mask, cache_id);
 }
 
 #ifdef CONFIG_OF
 static int l2_wt_override;
 
+/* Aurora don't have the cache ID register available, so we have to
+ * pass it though the device tree */
+static u32 cache_id_part_number_from_dt;
+
+static void __init l2x0_of_parse(const struct device_node *np,
+				 u32 *aux_val, u32 *aux_mask)
+{
+	u32 data[2] = { 0, 0 };
+	u32 tag = 0;
+	u32 dirty = 0;
+	u32 val = 0, mask = 0;
+
+	of_property_read_u32(np, "arm,tag-latency", &tag);
+	if (tag) {
+		mask |= L2X0_AUX_CTRL_TAG_LATENCY_MASK;
+		val |= (tag - 1) << L2X0_AUX_CTRL_TAG_LATENCY_SHIFT;
+	}
+
+	of_property_read_u32_array(np, "arm,data-latency",
+				   data, ARRAY_SIZE(data));
+	if (data[0] && data[1]) {
+		mask |= L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK |
+			L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK;
+		val |= ((data[0] - 1) << L2X0_AUX_CTRL_DATA_RD_LATENCY_SHIFT) |
+		       ((data[1] - 1) << L2X0_AUX_CTRL_DATA_WR_LATENCY_SHIFT);
+	}
+
+	of_property_read_u32(np, "arm,dirty-latency", &dirty);
+	if (dirty) {
+		mask |= L2X0_AUX_CTRL_DIRTY_LATENCY_MASK;
+		val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT;
+	}
+
+	*aux_val &= ~mask;
+	*aux_val |= val;
+	*aux_mask &= ~mask;
+}
+
+static const struct l2c_init_data of_l2c210_data __initconst = {
+	.type = "L2C-210",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.of_parse = l2x0_of_parse,
+	.enable = l2c_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range   = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c_disable,
+		.sync        = l2c210_sync,
+		.resume      = l2c210_resume,
+	},
+};
+
+static const struct l2c_init_data of_l2c220_data __initconst = {
+	.type = "L2C-220",
+	.way_size_0 = SZ_8K,
+	.num_lock = 1,
+	.of_parse = l2x0_of_parse,
+	.enable = l2c220_enable,
+	.save = l2c_save,
+	.outer_cache = {
+		.inv_range   = l2c220_inv_range,
+		.clean_range = l2c220_clean_range,
+		.flush_range = l2c220_flush_range,
+		.flush_all   = l2c220_flush_all,
+		.disable     = l2c_disable,
+		.sync        = l2c220_sync,
+		.resume      = l2c210_resume,
+	},
+};
+
+static void __init l2c310_of_parse(const struct device_node *np,
+	u32 *aux_val, u32 *aux_mask)
+{
+	u32 data[3] = { 0, 0, 0 };
+	u32 tag[3] = { 0, 0, 0 };
+	u32 filter[2] = { 0, 0 };
+
+	of_property_read_u32_array(np, "arm,tag-latency", tag, ARRAY_SIZE(tag));
+	if (tag[0] && tag[1] && tag[2])
+		writel_relaxed(
+			L310_LATENCY_CTRL_RD(tag[0] - 1) |
+			L310_LATENCY_CTRL_WR(tag[1] - 1) |
+			L310_LATENCY_CTRL_SETUP(tag[2] - 1),
+			l2x0_base + L310_TAG_LATENCY_CTRL);
+
+	of_property_read_u32_array(np, "arm,data-latency",
+				   data, ARRAY_SIZE(data));
+	if (data[0] && data[1] && data[2])
+		writel_relaxed(
+			L310_LATENCY_CTRL_RD(data[0] - 1) |
+			L310_LATENCY_CTRL_WR(data[1] - 1) |
+			L310_LATENCY_CTRL_SETUP(data[2] - 1),
+			l2x0_base + L310_DATA_LATENCY_CTRL);
+
+	of_property_read_u32_array(np, "arm,filter-ranges",
+				   filter, ARRAY_SIZE(filter));
+	if (filter[1]) {
+		writel_relaxed(ALIGN(filter[0] + filter[1], SZ_1M),
+			       l2x0_base + L310_ADDR_FILTER_END);
+		writel_relaxed((filter[0] & ~(SZ_1M - 1)) | L310_ADDR_FILTER_EN,
+			       l2x0_base + L310_ADDR_FILTER_START);
+	}
+}
+
+static const struct l2c_init_data of_l2c310_data __initconst = {
+	.type = "L2C-310",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.of_parse = l2c310_of_parse,
+	.enable = l2c310_enable,
+	.fixup = l2c310_fixup,
+	.save  = l2c310_save,
+	.outer_cache = {
+		.inv_range   = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c310_disable,
+		.sync        = l2c210_sync,
+		.resume      = l2c310_resume,
+	},
+};
+
+/*
+ * This is a variant of the of_l2c310_data with .sync set to
+ * NULL. Outer sync operations are not needed when the system is I/O
+ * coherent, and potentially harmful in certain situations (PCIe/PL310
+ * deadlock on Armada 375/38x due to hardware I/O coherency). The
+ * other operations are kept because they are infrequent (therefore do
+ * not cause the deadlock in practice) and needed for secondary CPU
+ * boot and other power management activities.
+ */
+static const struct l2c_init_data of_l2c310_coherent_data __initconst = {
+	.type = "L2C-310 Coherent",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.of_parse = l2c310_of_parse,
+	.enable = l2c310_enable,
+	.fixup = l2c310_fixup,
+	.save  = l2c310_save,
+	.outer_cache = {
+		.inv_range   = l2c210_inv_range,
+		.clean_range = l2c210_clean_range,
+		.flush_range = l2c210_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c310_disable,
+		.resume      = l2c310_resume,
+	},
+};
+
 /*
  * Note that the end addresses passed to Linux primitives are
  * noninclusive, while the hardware cache range operations use
@@ -524,265 +1193,311 @@ static void aurora_flush_range(unsigned long start, unsigned long end)
 	}
 }
 
-static void __init l2x0_of_setup(const struct device_node *np,
-				 u32 *aux_val, u32 *aux_mask)
+static void aurora_save(void __iomem *base)
 {
-	u32 data[2] = { 0, 0 };
-	u32 tag = 0;
-	u32 dirty = 0;
-	u32 val = 0, mask = 0;
+	l2x0_saved_regs.ctrl = readl_relaxed(base + L2X0_CTRL);
+	l2x0_saved_regs.aux_ctrl = readl_relaxed(base + L2X0_AUX_CTRL);
+}
 
-	of_property_read_u32(np, "arm,tag-latency", &tag);
-	if (tag) {
-		mask |= L2X0_AUX_CTRL_TAG_LATENCY_MASK;
-		val |= (tag - 1) << L2X0_AUX_CTRL_TAG_LATENCY_SHIFT;
-	}
+static void aurora_resume(void)
+{
+	void __iomem *base = l2x0_base;
 
-	of_property_read_u32_array(np, "arm,data-latency",
-				   data, ARRAY_SIZE(data));
-	if (data[0] && data[1]) {
-		mask |= L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK |
-			L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK;
-		val |= ((data[0] - 1) << L2X0_AUX_CTRL_DATA_RD_LATENCY_SHIFT) |
-		       ((data[1] - 1) << L2X0_AUX_CTRL_DATA_WR_LATENCY_SHIFT);
+	if (!(readl(base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		writel_relaxed(l2x0_saved_regs.aux_ctrl, base + L2X0_AUX_CTRL);
+		writel_relaxed(l2x0_saved_regs.ctrl, base + L2X0_CTRL);
 	}
+}
 
-	of_property_read_u32(np, "arm,dirty-latency", &dirty);
-	if (dirty) {
-		mask |= L2X0_AUX_CTRL_DIRTY_LATENCY_MASK;
-		val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT;
-	}
+/*
+ * For Aurora cache in no outer mode, enable via the CP15 coprocessor
+ * broadcasting of cache commands to L2.
+ */
+static void __init aurora_enable_no_outer(void __iomem *base, u32 aux,
+	unsigned num_lock)
+{
+	u32 u;
 
-	*aux_val &= ~mask;
-	*aux_val |= val;
-	*aux_mask &= ~mask;
+	asm volatile("mrc p15, 1, %0, c15, c2, 0" : "=r" (u));
+	u |= AURORA_CTRL_FW;		/* Set the FW bit */
+	asm volatile("mcr p15, 1, %0, c15, c2, 0" : : "r" (u));
+
+	isb();
+
+	l2c_enable(base, aux, num_lock);
 }
 
-static void __init pl310_of_setup(const struct device_node *np,
-				  u32 *aux_val, u32 *aux_mask)
+static void __init aurora_fixup(void __iomem *base, u32 cache_id,
+	struct outer_cache_fns *fns)
 {
-	u32 data[3] = { 0, 0, 0 };
-	u32 tag[3] = { 0, 0, 0 };
-	u32 filter[2] = { 0, 0 };
+	sync_reg_offset = AURORA_SYNC_REG;
+}
 
-	of_property_read_u32_array(np, "arm,tag-latency", tag, ARRAY_SIZE(tag));
-	if (tag[0] && tag[1] && tag[2])
-		writel_relaxed(
-			((tag[0] - 1) << L2X0_LATENCY_CTRL_RD_SHIFT) |
-			((tag[1] - 1) << L2X0_LATENCY_CTRL_WR_SHIFT) |
-			((tag[2] - 1) << L2X0_LATENCY_CTRL_SETUP_SHIFT),
-			l2x0_base + L2X0_TAG_LATENCY_CTRL);
+static void __init aurora_of_parse(const struct device_node *np,
+				u32 *aux_val, u32 *aux_mask)
+{
+	u32 val = AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU;
+	u32 mask =  AURORA_ACR_REPLACEMENT_MASK;
 
-	of_property_read_u32_array(np, "arm,data-latency",
-				   data, ARRAY_SIZE(data));
-	if (data[0] && data[1] && data[2])
-		writel_relaxed(
-			((data[0] - 1) << L2X0_LATENCY_CTRL_RD_SHIFT) |
-			((data[1] - 1) << L2X0_LATENCY_CTRL_WR_SHIFT) |
-			((data[2] - 1) << L2X0_LATENCY_CTRL_SETUP_SHIFT),
-			l2x0_base + L2X0_DATA_LATENCY_CTRL);
+	of_property_read_u32(np, "cache-id-part",
+			&cache_id_part_number_from_dt);
 
-	of_property_read_u32_array(np, "arm,filter-ranges",
-				   filter, ARRAY_SIZE(filter));
-	if (filter[1]) {
-		writel_relaxed(ALIGN(filter[0] + filter[1], SZ_1M),
-			       l2x0_base + L2X0_ADDR_FILTER_END);
-		writel_relaxed((filter[0] & ~(SZ_1M - 1)) | L2X0_ADDR_FILTER_EN,
-			       l2x0_base + L2X0_ADDR_FILTER_START);
+	/* Determine and save the write policy */
+	l2_wt_override = of_property_read_bool(np, "wt-override");
+
+	if (l2_wt_override) {
+		val |= AURORA_ACR_FORCE_WRITE_THRO_POLICY;
+		mask |= AURORA_ACR_FORCE_WRITE_POLICY_MASK;
 	}
+
+	*aux_val &= ~mask;
+	*aux_val |= val;
+	*aux_mask &= ~mask;
 }
 
-static void __init pl310_save(void)
-{
-	u32 l2x0_revision = readl_relaxed(l2x0_base + L2X0_CACHE_ID) &
-		L2X0_CACHE_ID_RTL_MASK;
+static const struct l2c_init_data of_aurora_with_outer_data __initconst = {
+	.type = "Aurora",
+	.way_size_0 = SZ_4K,
+	.num_lock = 4,
+	.of_parse = aurora_of_parse,
+	.enable = l2c_enable,
+	.fixup = aurora_fixup,
+	.save  = aurora_save,
+	.outer_cache = {
+		.inv_range   = aurora_inv_range,
+		.clean_range = aurora_clean_range,
+		.flush_range = aurora_flush_range,
+		.flush_all   = l2x0_flush_all,
+		.disable     = l2x0_disable,
+		.sync        = l2x0_cache_sync,
+		.resume      = aurora_resume,
+	},
+};
 
-	l2x0_saved_regs.tag_latency = readl_relaxed(l2x0_base +
-		L2X0_TAG_LATENCY_CTRL);
-	l2x0_saved_regs.data_latency = readl_relaxed(l2x0_base +
-		L2X0_DATA_LATENCY_CTRL);
-	l2x0_saved_regs.filter_end = readl_relaxed(l2x0_base +
-		L2X0_ADDR_FILTER_END);
-	l2x0_saved_regs.filter_start = readl_relaxed(l2x0_base +
-		L2X0_ADDR_FILTER_START);
+static const struct l2c_init_data of_aurora_no_outer_data __initconst = {
+	.type = "Aurora",
+	.way_size_0 = SZ_4K,
+	.num_lock = 4,
+	.of_parse = aurora_of_parse,
+	.enable = aurora_enable_no_outer,
+	.fixup = aurora_fixup,
+	.save  = aurora_save,
+	.outer_cache = {
+		.resume      = aurora_resume,
+	},
+};
 
-	if (l2x0_revision >= L2X0_CACHE_ID_RTL_R2P0) {
-		/*
-		 * From r2p0, there is Prefetch offset/control register
-		 */
-		l2x0_saved_regs.prefetch_ctrl = readl_relaxed(l2x0_base +
-			L2X0_PREFETCH_CTRL);
-		/*
-		 * From r3p0, there is Power control register
-		 */
-		if (l2x0_revision >= L2X0_CACHE_ID_RTL_R3P0)
-			l2x0_saved_regs.pwr_ctrl = readl_relaxed(l2x0_base +
-				L2X0_POWER_CTRL);
-	}
+/*
+ * For certain Broadcom SoCs, depending on the address range, different offsets
+ * need to be added to the address before passing it to L2 for
+ * invalidation/clean/flush
+ *
+ * Section Address Range              Offset        EMI
+ *   1     0x00000000 - 0x3FFFFFFF    0x80000000    VC
+ *   2     0x40000000 - 0xBFFFFFFF    0x40000000    SYS
+ *   3     0xC0000000 - 0xFFFFFFFF    0x80000000    VC
+ *
+ * When the start and end addresses have crossed two different sections, we
+ * need to break the L2 operation into two, each within its own section.
+ * For example, if we need to invalidate addresses starts at 0xBFFF0000 and
+ * ends at 0xC0001000, we need do invalidate 1) 0xBFFF0000 - 0xBFFFFFFF and 2)
+ * 0xC0000000 - 0xC0001000
+ *
+ * Note 1:
+ * By breaking a single L2 operation into two, we may potentially suffer some
+ * performance hit, but keep in mind the cross section case is very rare
+ *
+ * Note 2:
+ * We do not need to handle the case when the start address is in
+ * Section 1 and the end address is in Section 3, since it is not a valid use
+ * case
+ *
+ * Note 3:
+ * Section 1 in practical terms can no longer be used on rev A2. Because of
+ * that the code does not need to handle section 1 at all.
+ *
+ */
+#define BCM_SYS_EMI_START_ADDR        0x40000000UL
+#define BCM_VC_EMI_SEC3_START_ADDR    0xC0000000UL
+
+#define BCM_SYS_EMI_OFFSET            0x40000000UL
+#define BCM_VC_EMI_OFFSET             0x80000000UL
+
+static inline int bcm_addr_is_sys_emi(unsigned long addr)
+{
+	return (addr >= BCM_SYS_EMI_START_ADDR) &&
+		(addr < BCM_VC_EMI_SEC3_START_ADDR);
 }
 
-static void aurora_save(void)
+static inline unsigned long bcm_l2_phys_addr(unsigned long addr)
 {
-	l2x0_saved_regs.ctrl = readl_relaxed(l2x0_base + L2X0_CTRL);
-	l2x0_saved_regs.aux_ctrl = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+	if (bcm_addr_is_sys_emi(addr))
+		return addr + BCM_SYS_EMI_OFFSET;
+	else
+		return addr + BCM_VC_EMI_OFFSET;
 }
 
-static void l2x0_resume(void)
+static void bcm_inv_range(unsigned long start, unsigned long end)
 {
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
-		/* restore aux ctrl and enable l2 */
-		l2x0_unlock(readl_relaxed(l2x0_base + L2X0_CACHE_ID));
+	unsigned long new_start, new_end;
 
-		writel_relaxed(l2x0_saved_regs.aux_ctrl, l2x0_base +
-			L2X0_AUX_CTRL);
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
 
-		l2x0_inv_all();
+	if (unlikely(end <= start))
+		return;
 
-		writel_relaxed(L2X0_CTRL_EN, l2x0_base + L2X0_CTRL);
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2c210_inv_range(new_start, new_end);
+		return;
 	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2c210_inv_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2c210_inv_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
 }
 
-static void pl310_resume(void)
+static void bcm_clean_range(unsigned long start, unsigned long end)
 {
-	u32 l2x0_revision;
+	unsigned long new_start, new_end;
 
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
-		/* restore pl310 setup */
-		writel_relaxed(l2x0_saved_regs.tag_latency,
-			l2x0_base + L2X0_TAG_LATENCY_CTRL);
-		writel_relaxed(l2x0_saved_regs.data_latency,
-			l2x0_base + L2X0_DATA_LATENCY_CTRL);
-		writel_relaxed(l2x0_saved_regs.filter_end,
-			l2x0_base + L2X0_ADDR_FILTER_END);
-		writel_relaxed(l2x0_saved_regs.filter_start,
-			l2x0_base + L2X0_ADDR_FILTER_START);
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
 
-		l2x0_revision = readl_relaxed(l2x0_base + L2X0_CACHE_ID) &
-			L2X0_CACHE_ID_RTL_MASK;
+	if (unlikely(end <= start))
+		return;
 
-		if (l2x0_revision >= L2X0_CACHE_ID_RTL_R2P0) {
-			writel_relaxed(l2x0_saved_regs.prefetch_ctrl,
-				l2x0_base + L2X0_PREFETCH_CTRL);
-			if (l2x0_revision >= L2X0_CACHE_ID_RTL_R3P0)
-				writel_relaxed(l2x0_saved_regs.pwr_ctrl,
-					l2x0_base + L2X0_POWER_CTRL);
-		}
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2c210_clean_range(new_start, new_end);
+		return;
 	}
 
-	l2x0_resume();
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2c210_clean_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2c210_clean_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
 }
 
-static void aurora_resume(void)
+static void bcm_flush_range(unsigned long start, unsigned long end)
 {
-	if (!(readl(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
-		writel_relaxed(l2x0_saved_regs.aux_ctrl,
-				l2x0_base + L2X0_AUX_CTRL);
-		writel_relaxed(l2x0_saved_regs.ctrl, l2x0_base + L2X0_CTRL);
-	}
-}
+	unsigned long new_start, new_end;
 
-static void __init aurora_broadcast_l2_commands(void)
-{
-	__u32 u;
-	/* Enable Broadcasting of cache commands to L2*/
-	__asm__ __volatile__("mrc p15, 1, %0, c15, c2, 0" : "=r"(u));
-	u |= AURORA_CTRL_FW;		/* Set the FW bit */
-	__asm__ __volatile__("mcr p15, 1, %0, c15, c2, 0\n" : : "r"(u));
-	isb();
-}
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
 
-static void __init aurora_of_setup(const struct device_node *np,
-				u32 *aux_val, u32 *aux_mask)
-{
-	u32 val = AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU;
-	u32 mask =  AURORA_ACR_REPLACEMENT_MASK;
+	if (unlikely(end <= start))
+		return;
 
-	of_property_read_u32(np, "cache-id-part",
-			&cache_id_part_number_from_dt);
+	if ((end - start) >= l2x0_size) {
+		outer_cache.flush_all();
+		return;
+	}
 
-	/* Determine and save the write policy */
-	l2_wt_override = of_property_read_bool(np, "wt-override");
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
 
-	if (l2_wt_override) {
-		val |= AURORA_ACR_FORCE_WRITE_THRO_POLICY;
-		mask |= AURORA_ACR_FORCE_WRITE_POLICY_MASK;
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2c210_flush_range(new_start, new_end);
+		return;
 	}
 
-	*aux_val &= ~mask;
-	*aux_val |= val;
-	*aux_mask &= ~mask;
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2c210_flush_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2c210_flush_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
 }
 
-static const struct l2x0_of_data pl310_data = {
-	.setup = pl310_of_setup,
-	.save  = pl310_save,
+/* Broadcom L2C-310 start from ARMs R3P2 or later, and require no fixups */
+static const struct l2c_init_data of_bcm_l2x0_data __initconst = {
+	.type = "BCM-L2C-310",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.of_parse = l2c310_of_parse,
+	.enable = l2c310_enable,
+	.save  = l2c310_save,
 	.outer_cache = {
-		.resume      = pl310_resume,
-		.inv_range   = l2x0_inv_range,
-		.clean_range = l2x0_clean_range,
-		.flush_range = l2x0_flush_range,
-		.sync        = l2x0_cache_sync,
-		.flush_all   = l2x0_flush_all,
-		.inv_all     = l2x0_inv_all,
-		.disable     = l2x0_disable,
-		.set_debug   = pl310_set_debug,
+		.inv_range   = bcm_inv_range,
+		.clean_range = bcm_clean_range,
+		.flush_range = bcm_flush_range,
+		.flush_all   = l2c210_flush_all,
+		.disable     = l2c310_disable,
+		.sync        = l2c210_sync,
+		.resume      = l2c310_resume,
 	},
 };
 
-static const struct l2x0_of_data l2x0_data = {
-	.setup = l2x0_of_setup,
-	.save  = NULL,
-	.outer_cache = {
-		.resume      = l2x0_resume,
-		.inv_range   = l2x0_inv_range,
-		.clean_range = l2x0_clean_range,
-		.flush_range = l2x0_flush_range,
-		.sync        = l2x0_cache_sync,
-		.flush_all   = l2x0_flush_all,
-		.inv_all     = l2x0_inv_all,
-		.disable     = l2x0_disable,
-	},
-};
+static void __init tauros3_save(void __iomem *base)
+{
+	l2c_save(base);
 
-static const struct l2x0_of_data aurora_with_outer_data = {
-	.setup = aurora_of_setup,
-	.save  = aurora_save,
-	.outer_cache = {
-		.resume      = aurora_resume,
-		.inv_range   = aurora_inv_range,
-		.clean_range = aurora_clean_range,
-		.flush_range = aurora_flush_range,
-		.sync        = l2x0_cache_sync,
-		.flush_all   = l2x0_flush_all,
-		.inv_all     = l2x0_inv_all,
-		.disable     = l2x0_disable,
-	},
-};
+	l2x0_saved_regs.aux2_ctrl =
+		readl_relaxed(base + TAUROS3_AUX2_CTRL);
+	l2x0_saved_regs.prefetch_ctrl =
+		readl_relaxed(base + L310_PREFETCH_CTRL);
+}
 
-static const struct l2x0_of_data aurora_no_outer_data = {
-	.setup = aurora_of_setup,
-	.save  = aurora_save,
+static void tauros3_resume(void)
+{
+	void __iomem *base = l2x0_base;
+
+	if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		writel_relaxed(l2x0_saved_regs.aux2_ctrl,
+			       base + TAUROS3_AUX2_CTRL);
+		writel_relaxed(l2x0_saved_regs.prefetch_ctrl,
+			       base + L310_PREFETCH_CTRL);
+
+		l2c_enable(base, l2x0_saved_regs.aux_ctrl, 8);
+	}
+}
+
+static const struct l2c_init_data of_tauros3_data __initconst = {
+	.type = "Tauros3",
+	.way_size_0 = SZ_8K,
+	.num_lock = 8,
+	.enable = l2c_enable,
+	.save  = tauros3_save,
+	/* Tauros3 broadcasts L1 cache operations to L2 */
 	.outer_cache = {
-		.resume      = aurora_resume,
+		.resume      = tauros3_resume,
 	},
 };
 
+#define L2C_ID(name, fns) { .compatible = name, .data = (void *)&fns }
 static const struct of_device_id l2x0_ids[] __initconst = {
-	{ .compatible = "arm,pl310-cache", .data = (void *)&pl310_data },
-	{ .compatible = "arm,l220-cache", .data = (void *)&l2x0_data },
-	{ .compatible = "arm,l210-cache", .data = (void *)&l2x0_data },
-	{ .compatible = "marvell,aurora-system-cache",
-	  .data = (void *)&aurora_no_outer_data},
-	{ .compatible = "marvell,aurora-outer-cache",
-	  .data = (void *)&aurora_with_outer_data},
+	L2C_ID("arm,l210-cache", of_l2c210_data),
+	L2C_ID("arm,l220-cache", of_l2c220_data),
+	L2C_ID("arm,pl310-cache", of_l2c310_data),
+	L2C_ID("brcm,bcm11351-a2-pl310-cache", of_bcm_l2x0_data),
+	L2C_ID("marvell,aurora-outer-cache", of_aurora_with_outer_data),
+	L2C_ID("marvell,aurora-system-cache", of_aurora_no_outer_data),
+	L2C_ID("marvell,tauros3-cache", of_tauros3_data),
+	/* Deprecated IDs */
+	L2C_ID("bcm,bcm11351-a2-pl310-cache", of_bcm_l2x0_data),
 	{}
 };
 
 int __init l2x0_of_init(u32 aux_val, u32 aux_mask)
 {
+	const struct l2c_init_data *data;
 	struct device_node *np;
-	const struct l2x0_of_data *data;
 	struct resource res;
+	u32 cache_id, old_aux;
 
 	np = of_find_matching_node(NULL, l2x0_ids);
 	if (!np)
@@ -799,24 +1514,33 @@ int __init l2x0_of_init(u32 aux_val, u32 aux_mask)
 
 	data = of_match_node(l2x0_ids, np)->data;
 
-	/* L2 configuration can only be changed if the cache is disabled */
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
-		if (data->setup)
-			data->setup(np, &aux_val, &aux_mask);
+	if (of_device_is_compatible(np, "arm,pl310-cache") &&
+	    of_property_read_bool(np, "arm,io-coherent"))
+		data = &of_l2c310_coherent_data;
 
-		/* For aurora cache in no outer mode select the
-		 * correct mode using the coprocessor*/
-		if (data == &aurora_no_outer_data)
-			aurora_broadcast_l2_commands();
+	old_aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+	if (old_aux != ((old_aux & aux_mask) | aux_val)) {
+		pr_warn("L2C: platform modifies aux control register: 0x%08x -> 0x%08x\n",
+		        old_aux, (old_aux & aux_mask) | aux_val);
+	} else if (aux_mask != ~0U && aux_val != 0) {
+		pr_alert("L2C: platform provided aux values match the hardware, so have no effect.  Please remove them.\n");
 	}
 
-	if (data->save)
-		data->save();
+	/* All L2 caches are unified, so this property should be specified */
+	if (!of_property_read_bool(np, "cache-unified"))
+		pr_err("L2C: device tree omits to specify unified cache\n");
 
-	of_init = true;
-	l2x0_init(l2x0_base, aux_val, aux_mask);
+	/* L2 configuration can only be changed if the cache is disabled */
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN))
+		if (data->of_parse)
+			data->of_parse(np, &aux_val, &aux_mask);
+
+	if (cache_id_part_number_from_dt)
+		cache_id = cache_id_part_number_from_dt;
+	else
+		cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
 
-	memcpy(&outer_cache, &data->outer_cache, sizeof(outer_cache));
+	__l2c_init(data, aux_val, aux_mask, cache_id);
 
 	return 0;
 }
diff --git a/arch/arm/mm/cache-nop.S b/arch/arm/mm/cache-nop.S
new file mode 100644
index 00000000000..8e12ddca003
--- /dev/null
+++ b/arch/arm/mm/cache-nop.S
@@ -0,0 +1,50 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+
+#include "proc-macros.S"
+
+ENTRY(nop_flush_icache_all)
+	mov	pc, lr
+ENDPROC(nop_flush_icache_all)
+
+	.globl nop_flush_kern_cache_all
+	.equ nop_flush_kern_cache_all, nop_flush_icache_all
+
+	.globl nop_flush_kern_cache_louis
+	.equ nop_flush_kern_cache_louis, nop_flush_icache_all
+
+	.globl nop_flush_user_cache_all
+	.equ nop_flush_user_cache_all, nop_flush_icache_all
+
+	.globl nop_flush_user_cache_range
+	.equ nop_flush_user_cache_range, nop_flush_icache_all
+
+	.globl nop_coherent_kern_range
+	.equ nop_coherent_kern_range, nop_flush_icache_all
+
+ENTRY(nop_coherent_user_range)
+	mov	r0, 0
+	mov	pc, lr
+ENDPROC(nop_coherent_user_range)
+
+	.globl nop_flush_kern_dcache_area
+	.equ nop_flush_kern_dcache_area, nop_flush_icache_all
+
+	.globl nop_dma_flush_range
+	.equ nop_dma_flush_range, nop_flush_icache_all
+
+	.globl nop_dma_map_area
+	.equ nop_dma_map_area, nop_flush_icache_all
+
+	.globl nop_dma_unmap_area
+	.equ nop_dma_unmap_area, nop_flush_icache_all
+
+	__INITDATA
+
+	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+	define_cache_functions nop
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
index 1be0f4e5e6e..b273739e635 100644
--- a/arch/arm/mm/cache-tauros2.c
+++ b/arch/arm/mm/cache-tauros2.c
@@ -33,7 +33,7 @@
  * outer cache operations into the kernel image if the kernel has been
  * configured to support a pre-v7 CPU.
  */
-#if __LINUX_ARM_ARCH__ < 7
+#ifdef CONFIG_CPU_32v5
 /*
  * Low-level cache maintenance operations.
  */
@@ -229,33 +229,6 @@ static void __init tauros2_internal_init(unsigned int features)
 	}
 #endif
 
-#ifdef CONFIG_CPU_32v6
-	/*
-	 * Check whether this CPU lacks support for the v7 hierarchical
-	 * cache ops.  (PJ4 is in its v6 personality mode if the MMFR3
-	 * register indicates no support for the v7 hierarchical cache
-	 * ops.)
-	 */
-	if (cpuid_scheme() && (read_mmfr3() & 0xf) == 0) {
-		/*
-		 * When Tauros2 is used in an ARMv6 system, the L2
-		 * enable bit is in the ARMv6 ARM-mandated position
-		 * (bit [26] of the System Control Register).
-		 */
-		if (!(get_cr() & 0x04000000)) {
-			printk(KERN_INFO "Tauros2: Enabling L2 cache.\n");
-			adjust_cr(0x04000000, 0x04000000);
-		}
-
-		mode = "ARMv6";
-		outer_cache.inv_range = tauros2_inv_range;
-		outer_cache.clean_range = tauros2_clean_range;
-		outer_cache.flush_range = tauros2_flush_range;
-		outer_cache.disable = tauros2_disable;
-		outer_cache.resume = tauros2_resume;
-	}
-#endif
-
 #ifdef CONFIG_CPU_32v7
 	/*
 	 * Check whether this CPU has support for the v7 hierarchical
diff --git a/arch/arm/mm/cache-tauros3.h b/arch/arm/mm/cache-tauros3.h
new file mode 100644
index 00000000000..02c0a97cbc0
--- /dev/null
+++ b/arch/arm/mm/cache-tauros3.h
@@ -0,0 +1,41 @@
+/*
+ * Marvell Tauros3 cache controller includes
+ *
+ * Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
+ *
+ * based on GPL'ed 2.6 kernel sources
+ *  (c) Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __ASM_ARM_HARDWARE_TAUROS3_H
+#define __ASM_ARM_HARDWARE_TAUROS3_H
+
+/*
+ * Marvell Tauros3 L2CC is compatible with PL310 r0p0
+ * but with PREFETCH_CTRL (r2p0) and an additional event counter.
+ * Also, there is AUX2_CTRL for some Marvell specific control.
+ */
+
+#define TAUROS3_EVENT_CNT2_CFG		0x224
+#define TAUROS3_EVENT_CNT2_VAL		0x228
+#define TAUROS3_INV_ALL			0x780
+#define TAUROS3_CLEAN_ALL		0x784
+#define TAUROS3_AUX2_CTRL		0x820
+
+/* Registers shifts and masks */
+#define TAUROS3_AUX2_CTRL_LINEFILL_BURST8_EN	(1 << 2)
+
+#endif
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
deleted file mode 100644
index 8a3fadece8d..00000000000
--- a/arch/arm/mm/cache-v3.S
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- *  linux/arch/arm/mm/cache-v3.S
- *
- *  Copyright (C) 1997-2002 Russell king
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/page.h>
-#include "proc-macros.S"
-
-/*
- *	flush_icache_all()
- *
- *	Unconditionally clean and invalidate the entire icache.
- */
-ENTRY(v3_flush_icache_all)
-	mov	pc, lr
-ENDPROC(v3_flush_icache_all)
-
-/*
- *	flush_user_cache_all()
- *
- *	Invalidate all cache entries in a particular address
- *	space.
- *
- *	- mm	- mm_struct describing address space
- */
-ENTRY(v3_flush_user_cache_all)
-	/* FALLTHROUGH */
-/*
- *	flush_kern_cache_all()
- *
- *	Clean and invalidate the entire cache.
- */
-ENTRY(v3_flush_kern_cache_all)
-	/* FALLTHROUGH */
-
-/*
- *	flush_user_cache_range(start, end, flags)
- *
- *	Invalidate a range of cache entries in the specified
- *	address space.
- *
- *	- start - start address (may not be aligned)
- *	- end	- end address (exclusive, may not be aligned)
- *	- flags	- vma_area_struct flags describing address space
- */
-ENTRY(v3_flush_user_cache_range)
-	mov	ip, #0
-	mcreq	p15, 0, ip, c7, c0, 0		@ flush ID cache
-	mov	pc, lr
-
-/*
- *	coherent_kern_range(start, end)
- *
- *	Ensure coherency between the Icache and the Dcache in the
- *	region described by start.  If you have non-snooping
- *	Harvard caches, you need to implement this function.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_coherent_kern_range)
-	/* FALLTHROUGH */
-
-/*
- *	coherent_user_range(start, end)
- *
- *	Ensure coherency between the Icache and the Dcache in the
- *	region described by start.  If you have non-snooping
- *	Harvard caches, you need to implement this function.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_coherent_user_range)
-	mov	r0, #0
-	mov	pc, lr
-
-/*
- *	flush_kern_dcache_area(void *page, size_t size)
- *
- *	Ensure no D cache aliasing occurs, either with itself or
- *	the I cache
- *
- *	- addr	- kernel address
- *	- size	- region size
- */
-ENTRY(v3_flush_kern_dcache_area)
-	/* FALLTHROUGH */
-
-/*
- *	dma_flush_range(start, end)
- *
- *	Clean and invalidate the specified virtual address range.
- *
- *	- start  - virtual start address
- *	- end	 - virtual end address
- */
-ENTRY(v3_dma_flush_range)
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c0, 0		@ flush ID cache
-	mov	pc, lr
-
-/*
- *	dma_unmap_area(start, size, dir)
- *	- start	- kernel virtual start address
- *	- size	- size of region
- *	- dir	- DMA direction
- */
-ENTRY(v3_dma_unmap_area)
-	teq	r2, #DMA_TO_DEVICE
-	bne	v3_dma_flush_range
-	/* FALLTHROUGH */
-
-/*
- *	dma_map_area(start, size, dir)
- *	- start	- kernel virtual start address
- *	- size	- size of region
- *	- dir	- DMA direction
- */
-ENTRY(v3_dma_map_area)
-	mov	pc, lr
-ENDPROC(v3_dma_unmap_area)
-ENDPROC(v3_dma_map_area)
-
-	.globl	v3_flush_kern_cache_louis
-	.equ	v3_flush_kern_cache_louis, v3_flush_kern_cache_all
-
-	__INITDATA
-
-	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
-	define_cache_functions v3
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 43e5d77be67..a7ba68f59f0 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -58,7 +58,7 @@ ENTRY(v4_flush_kern_cache_all)
 ENTRY(v4_flush_user_cache_range)
 #ifdef CONFIG_CPU_CP15
 	mov	ip, #0
-	mcreq	p15, 0, ip, c7, c7, 0		@ flush ID cache
+	mcr	p15, 0, ip, c7, c7, 0		@ flush ID cache
 	mov	pc, lr
 #else
 	/* FALLTHROUGH */
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 7539ec27506..615c99e38ba 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -19,6 +19,52 @@
 #include "proc-macros.S"
 
 /*
+ * The secondary kernel init calls v7_flush_dcache_all before it enables
+ * the L1; however, the L1 comes out of reset in an undefined state, so
+ * the clean + invalidate performed by v7_flush_dcache_all causes a bunch
+ * of cache lines with uninitialized data and uninitialized tags to get
+ * written out to memory, which does really unpleasant things to the main
+ * processor.  We fix this by performing an invalidate, rather than a
+ * clean + invalidate, before jumping into the kernel.
+ *
+ * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs
+ * to be called for both secondary cores startup and primary core resume
+ * procedures.
+ */
+ENTRY(v7_invalidate_l1)
+       mov     r0, #0
+       mcr     p15, 2, r0, c0, c0, 0
+       mrc     p15, 1, r0, c0, c0, 0
+
+       ldr     r1, =0x7fff
+       and     r2, r1, r0, lsr #13
+
+       ldr     r1, =0x3ff
+
+       and     r3, r1, r0, lsr #3      @ NumWays - 1
+       add     r2, r2, #1              @ NumSets
+
+       and     r0, r0, #0x7
+       add     r0, r0, #4      @ SetShift
+
+       clz     r1, r3          @ WayShift
+       add     r4, r3, #1      @ NumWays
+1:     sub     r2, r2, #1      @ NumSets--
+       mov     r3, r4          @ Temp = NumWays
+2:     subs    r3, r3, #1      @ Temp--
+       mov     r5, r3, lsl r1
+       mov     r6, r2, lsl r0
+       orr     r5, r5, r6      @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
+       mcr     p15, 0, r5, c7, c6, 2
+       bgt     2b
+       cmp     r2, #0
+       bgt     1b
+       dsb     st
+       isb
+       mov     pc, lr
+ENDPROC(v7_invalidate_l1)
+
+/*
  *	v7_flush_icache_all()
  *
  *	Flush the whole I-cache.
@@ -46,6 +92,14 @@ ENTRY(v7_flush_dcache_louis)
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
 	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
 	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
+#ifdef CONFIG_ARM_ERRATA_643719
+	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
+	ALT_UP(moveq	pc, lr)			@ LoUU is zero, so nothing to do
+	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
+	biceq	r2, r2, #0x0000000f             @ clear minor revision number
+	teqeq	r2, r1                          @ test for errata affected core and if so...
+	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
+#endif
 	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
 	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
 	moveq	pc, lr				@ return if level == 0
@@ -92,18 +146,18 @@ flush_levels:
 	ldr	r7, =0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
 loop1:
-	mov	r9, r4				@ create working copy of max way size
+	mov	r9, r7				@ create working copy of max index
 loop2:
- ARM(	orr	r11, r10, r9, lsl r5	)	@ factor way and cache number into r11
- THUMB(	lsl	r6, r9, r5		)
+ ARM(	orr	r11, r10, r4, lsl r5	)	@ factor way and cache number into r11
+ THUMB(	lsl	r6, r4, r5		)
  THUMB(	orr	r11, r10, r6		)	@ factor way and cache number into r11
- ARM(	orr	r11, r11, r7, lsl r2	)	@ factor index number into r11
- THUMB(	lsl	r6, r7, r2		)
+ ARM(	orr	r11, r11, r9, lsl r2	)	@ factor index number into r11
+ THUMB(	lsl	r6, r9, r2		)
  THUMB(	orr	r11, r11, r6		)	@ factor index number into r11
 	mcr	p15, 0, r11, c7, c14, 2		@ clean & invalidate by set/way
-	subs	r9, r9, #1			@ decrement the way
+	subs	r9, r9, #1			@ decrement the index
 	bge	loop2
-	subs	r7, r7, #1			@ decrement the index
+	subs	r4, r4, #1			@ decrement the way
 	bge	loop1
 skip:
 	add	r10, r10, #2			@ increment cache number
@@ -112,7 +166,7 @@ skip:
 finished:
 	mov	r10, #0				@ swith back to cache level 0
 	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
-	dsb
+	dsb	st
 	isb
 	mov	pc, lr
 ENDPROC(v7_flush_dcache_all)
@@ -228,7 +282,7 @@ ENTRY(v7_coherent_user_range)
 	add	r12, r12, r2
 	cmp	r12, r1
 	blo	1b
-	dsb
+	dsb	ishst
 	icache_line_size r2, r3
 	sub	r3, r2, #1
 	bic	r12, r0, r3
@@ -240,7 +294,7 @@ ENTRY(v7_coherent_user_range)
 	mov	r0, #0
 	ALT_SMP(mcr	p15, 0, r0, c7, c1, 6)	@ invalidate BTB Inner Shareable
 	ALT_UP(mcr	p15, 0, r0, c7, c5, 6)	@ invalidate BTB
-	dsb
+	dsb	ishst
 	isb
 	mov	pc, lr
 
@@ -281,7 +335,7 @@ ENTRY(v7_flush_kern_dcache_area)
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
 ENDPROC(v7_flush_kern_dcache_area)
 
@@ -314,7 +368,7 @@ v7_dma_inv_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
 ENDPROC(v7_dma_inv_range)
 
@@ -336,7 +390,7 @@ v7_dma_clean_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
 ENDPROC(v7_dma_clean_range)
 
@@ -358,7 +412,7 @@ ENTRY(v7_dma_flush_range)
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	st
 	mov	pc, lr
 ENDPROC(v7_dma_flush_range)
 
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index bc4a5e9ebb7..6eb97b3a748 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -20,6 +20,7 @@
 #include <asm/smp_plat.h>
 #include <asm/thread_notify.h>
 #include <asm/tlbflush.h>
+#include <asm/proc-fns.h>
 
 /*
  * On ARMv6, we have the following structure in the Context ID:
@@ -34,12 +35,12 @@
  * The ASID is used to tag entries in the CPU caches and TLBs.
  * The context ID is used by debuggers and trace logic, and
  * should be unique within all running processes.
+ *
+ * In big endian operation, the two 32 bit words are swapped if accessed
+ * by non-64-bit operations.
  */
 #define ASID_FIRST_VERSION	(1ULL << ASID_BITS)
-#define NUM_USER_ASIDS		(ASID_FIRST_VERSION - 1)
-
-#define ASID_TO_IDX(asid)	((asid & ~ASID_MASK) - 1)
-#define IDX_TO_ASID(idx)	((idx + 1) & ~ASID_MASK)
+#define NUM_USER_ASIDS		ASID_FIRST_VERSION
 
 static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
 static atomic64_t asid_generation = ATOMIC64_INIT(ASID_FIRST_VERSION);
@@ -49,27 +50,49 @@ static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
 static cpumask_t tlb_flush_pending;
 
-#ifdef CONFIG_ARM_LPAE
-static void cpu_set_reserved_ttbr0(void)
+#ifdef CONFIG_ARM_ERRATA_798181
+void a15_erratum_get_cpumask(int this_cpu, struct mm_struct *mm,
+			     cpumask_t *mask)
 {
-	unsigned long ttbl = __pa(swapper_pg_dir);
-	unsigned long ttbh = 0;
+	int cpu;
+	unsigned long flags;
+	u64 context_id, asid;
 
-	/*
-	 * Set TTBR0 to swapper_pg_dir which contains only global entries. The
-	 * ASID is set to 0.
-	 */
-	asm volatile(
-	"	mcrr	p15, 0, %0, %1, c2		@ set TTBR0\n"
-	:
-	: "r" (ttbl), "r" (ttbh));
-	isb();
+	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+	context_id = mm->context.id.counter;
+	for_each_online_cpu(cpu) {
+		if (cpu == this_cpu)
+			continue;
+		/*
+		 * We only need to send an IPI if the other CPUs are
+		 * running the same ASID as the one being invalidated.
+		 */
+		asid = per_cpu(active_asids, cpu).counter;
+		if (asid == 0)
+			asid = per_cpu(reserved_asids, cpu);
+		if (context_id == asid)
+			cpumask_set_cpu(cpu, mask);
+	}
+	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 }
+#endif
+
+#ifdef CONFIG_ARM_LPAE
+/*
+ * With LPAE, the ASID and page tables are updated atomicly, so there is
+ * no need for a reserved set of tables (the active ASID tracking prevents
+ * any issues across a rollover).
+ */
+#define cpu_set_reserved_ttbr0()
 #else
 static void cpu_set_reserved_ttbr0(void)
 {
 	u32 ttb;
-	/* Copy TTBR1 into TTBR0 */
+	/*
+	 * Copy TTBR1 into TTBR0.
+	 * This points at swapper_pg_dir, which contains only global
+	 * entries so any speculative walks are perfectly safe.
+	 */
 	asm volatile(
 	"	mrc	p15, 0, %0, c2, c0, 1		@ read TTBR1\n"
 	"	mcr	p15, 0, %0, c2, c0, 0		@ set TTBR0\n"
@@ -125,16 +148,22 @@ static void flush_context(unsigned int cpu)
 			asid = 0;
 		} else {
 			asid = atomic64_xchg(&per_cpu(active_asids, i), 0);
-			__set_bit(ASID_TO_IDX(asid), asid_map);
+			/*
+			 * If this CPU has already been through a
+			 * rollover, but hasn't run another task in
+			 * the meantime, we must preserve its reserved
+			 * ASID, as this is the only trace we have of
+			 * the process it is still running.
+			 */
+			if (asid == 0)
+				asid = per_cpu(reserved_asids, i);
+			__set_bit(asid & ~ASID_MASK, asid_map);
 		}
 		per_cpu(reserved_asids, i) = asid;
 	}
 
 	/* Queue a TLB invalidate and flush the I-cache if necessary. */
-	if (!tlb_ops_need_broadcast())
-		cpumask_set_cpu(cpu, &tlb_flush_pending);
-	else
-		cpumask_setall(&tlb_flush_pending);
+	cpumask_setall(&tlb_flush_pending);
 
 	if (icache_is_vivt_asid_tagged())
 		__flush_icache_all();
@@ -149,9 +178,10 @@ static int is_reserved_asid(u64 asid)
 	return 0;
 }
 
-static void new_context(struct mm_struct *mm, unsigned int cpu)
+static u64 new_context(struct mm_struct *mm, unsigned int cpu)
 {
-	u64 asid = mm->context.id;
+	static u32 cur_idx = 1;
+	u64 asid = atomic64_read(&mm->context.id);
 	u64 generation = atomic64_read(&asid_generation);
 
 	if (asid != 0 && is_reserved_asid(asid)) {
@@ -164,51 +194,65 @@ static void new_context(struct mm_struct *mm, unsigned int cpu)
 		/*
 		 * Allocate a free ASID. If we can't find one, take a
 		 * note of the currently active ASIDs and mark the TLBs
-		 * as requiring flushes.
+		 * as requiring flushes. We always count from ASID #1,
+		 * as we reserve ASID #0 to switch via TTBR0 and to
+		 * avoid speculative page table walks from hitting in
+		 * any partial walk caches, which could be populated
+		 * from overlapping level-1 descriptors used to map both
+		 * the module area and the userspace stack.
 		 */
-		asid = find_first_zero_bit(asid_map, NUM_USER_ASIDS);
+		asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx);
 		if (asid == NUM_USER_ASIDS) {
 			generation = atomic64_add_return(ASID_FIRST_VERSION,
 							 &asid_generation);
 			flush_context(cpu);
-			asid = find_first_zero_bit(asid_map, NUM_USER_ASIDS);
+			asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
 		}
 		__set_bit(asid, asid_map);
-		asid = generation | IDX_TO_ASID(asid);
+		cur_idx = asid;
+		asid |= generation;
 		cpumask_clear(mm_cpumask(mm));
 	}
 
-	mm->context.id = asid;
+	return asid;
 }
 
 void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
 {
 	unsigned long flags;
 	unsigned int cpu = smp_processor_id();
+	u64 asid;
 
 	if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq))
 		__check_vmalloc_seq(mm);
 
 	/*
-	 * Required during context switch to avoid speculative page table
-	 * walking with the wrong TTBR.
+	 * We cannot update the pgd and the ASID atomicly with classic
+	 * MMU, so switch exclusively to global mappings to avoid
+	 * speculative page table walking with the wrong TTBR.
 	 */
 	cpu_set_reserved_ttbr0();
 
-	if (!((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
-	    && atomic64_xchg(&per_cpu(active_asids, cpu), mm->context.id))
+	asid = atomic64_read(&mm->context.id);
+	if (!((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS)
+	    && atomic64_xchg(&per_cpu(active_asids, cpu), asid))
 		goto switch_mm_fastpath;
 
 	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
 	/* Check that our ASID belongs to the current generation. */
-	if ((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
-		new_context(mm, cpu);
-
-	atomic64_set(&per_cpu(active_asids, cpu), mm->context.id);
-	cpumask_set_cpu(cpu, mm_cpumask(mm));
+	asid = atomic64_read(&mm->context.id);
+	if ((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS) {
+		asid = new_context(mm, cpu);
+		atomic64_set(&mm->context.id, asid);
+	}
 
-	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
+	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) {
+		local_flush_bp_all();
 		local_flush_tlb_all();
+	}
+
+	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	cpumask_set_cpu(cpu, mm_cpumask(mm));
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath:
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 076c26d4386..1f88db06b13 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -9,6 +9,7 @@
  *
  *  DMA uncached mapping support.
  */
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -157,9 +158,47 @@ struct dma_map_ops arm_coherent_dma_ops = {
 };
 EXPORT_SYMBOL(arm_coherent_dma_ops);
 
+static int __dma_supported(struct device *dev, u64 mask, bool warn)
+{
+	unsigned long max_dma_pfn;
+
+	/*
+	 * If the mask allows for more memory than we can address,
+	 * and we actually have that much memory, then we must
+	 * indicate that DMA to this device is not supported.
+	 */
+	if (sizeof(mask) != sizeof(dma_addr_t) &&
+	    mask > (dma_addr_t)~0 &&
+	    dma_to_pfn(dev, ~0) < max_pfn) {
+		if (warn) {
+			dev_warn(dev, "Coherent DMA mask %#llx is larger than dma_addr_t allows\n",
+				 mask);
+			dev_warn(dev, "Driver did not use or check the return value from dma_set_coherent_mask()?\n");
+		}
+		return 0;
+	}
+
+	max_dma_pfn = min(max_pfn, arm_dma_pfn_limit);
+
+	/*
+	 * Translate the device's DMA mask to a PFN limit.  This
+	 * PFN number includes the page which we can DMA to.
+	 */
+	if (dma_to_pfn(dev, mask) < max_dma_pfn) {
+		if (warn)
+			dev_warn(dev, "Coherent DMA mask %#llx (pfn %#lx-%#lx) covers a smaller range of system memory than the DMA zone pfn 0x0-%#lx\n",
+				 mask,
+				 dma_to_pfn(dev, 0), dma_to_pfn(dev, mask) + 1,
+				 max_dma_pfn + 1);
+		return 0;
+	}
+
+	return 1;
+}
+
 static u64 get_coherent_dma_mask(struct device *dev)
 {
-	u64 mask = (u64)arm_dma_limit;
+	u64 mask = (u64)DMA_BIT_MASK(32);
 
 	if (dev) {
 		mask = dev->coherent_dma_mask;
@@ -173,12 +212,8 @@ static u64 get_coherent_dma_mask(struct device *dev)
 			return 0;
 		}
 
-		if ((~mask) & (u64)arm_dma_limit) {
-			dev_warn(dev, "coherent DMA mask %#llx is smaller "
-				 "than system GFP_DMA mask %#llx\n",
-				 mask, (u64)arm_dma_limit);
+		if (!__dma_supported(dev, mask, true))
 			return 0;
-		}
 	}
 
 	return mask;
@@ -186,13 +221,24 @@ static u64 get_coherent_dma_mask(struct device *dev)
 
 static void __dma_clear_buffer(struct page *page, size_t size)
 {
-	void *ptr;
 	/*
 	 * Ensure that the allocated pages are zeroed, and that any data
 	 * lurking in the kernel direct-mapped region is invalidated.
 	 */
-	ptr = page_address(page);
-	if (ptr) {
+	if (PageHighMem(page)) {
+		phys_addr_t base = __pfn_to_phys(page_to_pfn(page));
+		phys_addr_t end = base + size;
+		while (size > 0) {
+			void *ptr = kmap_atomic(page);
+			memset(ptr, 0, PAGE_SIZE);
+			dmac_flush_range(ptr, ptr + PAGE_SIZE);
+			kunmap_atomic(ptr);
+			page++;
+			size -= PAGE_SIZE;
+		}
+		outer_flush_range(base, end);
+	} else {
+		void *ptr = page_address(page);
 		memset(ptr, 0, size);
 		dmac_flush_range(ptr, ptr + size);
 		outer_flush_range(__pa(ptr), __pa(ptr) + size);
@@ -238,12 +284,10 @@ static void __dma_free_buffer(struct page *page, size_t size)
 }
 
 #ifdef CONFIG_MMU
-#ifdef CONFIG_HUGETLB_PAGE
-#error ARM Coherent DMA allocator does not (yet) support huge TLB
-#endif
 
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
-				     pgprot_t prot, struct page **ret_page);
+				     pgprot_t prot, struct page **ret_page,
+				     const void *caller);
 
 static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
 				 pgprot_t prot, struct page **ret_page,
@@ -329,7 +373,8 @@ void __init init_dma_coherent_pool_size(unsigned long size)
 static int __init atomic_pool_init(void)
 {
 	struct dma_pool *pool = &atomic_pool;
-	pgprot_t prot = pgprot_dmacoherent(pgprot_kernel);
+	pgprot_t prot = pgprot_dmacoherent(PAGE_KERNEL);
+	gfp_t gfp = GFP_KERNEL | GFP_DMA;
 	unsigned long nr_pages = pool->size >> PAGE_SHIFT;
 	unsigned long *bitmap;
 	struct page *page;
@@ -345,11 +390,12 @@ static int __init atomic_pool_init(void)
 	if (!pages)
 		goto no_pages;
 
-	if (IS_ENABLED(CONFIG_CMA))
-		ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page);
+	if (dev_get_cma_area(NULL))
+		ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
+					      atomic_pool_init);
 	else
-		ptr = __alloc_remap_buffer(NULL, pool->size, GFP_KERNEL, prot,
-					   &page, NULL);
+		ptr = __alloc_remap_buffer(NULL, pool->size, gfp, prot, &page,
+					   atomic_pool_init);
 	if (ptr) {
 		int i;
 
@@ -415,12 +461,21 @@ void __init dma_contiguous_remap(void)
 		map.type = MT_MEMORY_DMA_READY;
 
 		/*
-		 * Clear previous low-memory mapping
+		 * Clear previous low-memory mapping to ensure that the
+		 * TLB does not see any conflicting entries, then flush
+		 * the TLB of the old entries before creating new mappings.
+		 *
+		 * This ensures that any speculatively loaded TLB entries
+		 * (even though they may be rare) can not cause any problems,
+		 * and ensures that this code is architecturally compliant.
 		 */
 		for (addr = __phys_to_virt(start); addr < __phys_to_virt(end);
 		     addr += PMD_SIZE)
 			pmd_clear(pmd_off_k(addr));
 
+		flush_tlb_kernel_range(__phys_to_virt(start),
+				       __phys_to_virt(end));
+
 		iotable_init(&map, 1);
 	}
 }
@@ -441,7 +496,6 @@ static void __dma_remap(struct page *page, size_t size, pgprot_t prot)
 	unsigned end = start + size;
 
 	apply_to_page_range(&init_mm, start, size, __dma_update_pte, &prot);
-	dsb();
 	flush_tlb_kernel_range(start, end);
 }
 
@@ -542,27 +596,41 @@ static int __free_from_pool(void *start, size_t size)
 }
 
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
-				     pgprot_t prot, struct page **ret_page)
+				     pgprot_t prot, struct page **ret_page,
+				     const void *caller)
 {
 	unsigned long order = get_order(size);
 	size_t count = size >> PAGE_SHIFT;
 	struct page *page;
+	void *ptr;
 
 	page = dma_alloc_from_contiguous(dev, count, order);
 	if (!page)
 		return NULL;
 
 	__dma_clear_buffer(page, size);
-	__dma_remap(page, size, prot);
 
+	if (PageHighMem(page)) {
+		ptr = __dma_alloc_remap(page, size, GFP_KERNEL, prot, caller);
+		if (!ptr) {
+			dma_release_from_contiguous(dev, page, count);
+			return NULL;
+		}
+	} else {
+		__dma_remap(page, size, prot);
+		ptr = page_address(page);
+	}
 	*ret_page = page;
-	return page_address(page);
+	return ptr;
 }
 
 static void __free_from_contiguous(struct device *dev, struct page *page,
-				   size_t size)
+				   void *cpu_addr, size_t size)
 {
-	__dma_remap(page, size, pgprot_kernel);
+	if (PageHighMem(page))
+		__dma_free_remap(cpu_addr, size);
+	else
+		__dma_remap(page, size, PAGE_KERNEL);
 	dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
 }
 
@@ -583,9 +651,9 @@ static inline pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot)
 #define __get_dma_pgprot(attrs, prot)	__pgprot(0)
 #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c)	NULL
 #define __alloc_from_pool(size, ret_page)			NULL
-#define __alloc_from_contiguous(dev, size, prot, ret)		NULL
+#define __alloc_from_contiguous(dev, size, prot, ret, c)	NULL
 #define __free_from_pool(cpu_addr, size)			0
-#define __free_from_contiguous(dev, page, size)			do { } while (0)
+#define __free_from_contiguous(dev, page, cpu_addr, size)	do { } while (0)
 #define __dma_free_remap(cpu_addr, size)			do { } while (0)
 
 #endif	/* CONFIG_MMU */
@@ -640,12 +708,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 
 	if (is_coherent || nommu())
 		addr = __alloc_simple_buffer(dev, size, gfp, &page);
-	else if (gfp & GFP_ATOMIC)
+	else if (!(gfp & __GFP_WAIT))
 		addr = __alloc_from_pool(size, &page);
-	else if (!IS_ENABLED(CONFIG_CMA))
+	else if (!dev_get_cma_area(dev))
 		addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller);
 	else
-		addr = __alloc_from_contiguous(dev, size, prot, &page);
+		addr = __alloc_from_contiguous(dev, size, prot, &page, caller);
 
 	if (addr)
 		*handle = pfn_to_dma(dev, page_to_pfn(page));
@@ -660,7 +728,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		    gfp_t gfp, struct dma_attrs *attrs)
 {
-	pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	void *memory;
 
 	if (dma_alloc_from_coherent(dev, size, handle, &memory))
@@ -673,7 +741,7 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-	pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	void *memory;
 
 	if (dma_alloc_from_coherent(dev, size, handle, &memory))
@@ -731,7 +799,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
 		__dma_free_buffer(page, size);
 	} else if (__free_from_pool(cpu_addr, size)) {
 		return;
-	} else if (!IS_ENABLED(CONFIG_CMA)) {
+	} else if (!dev_get_cma_area(dev)) {
 		__dma_free_remap(cpu_addr, size);
 		__dma_free_buffer(page, size);
 	} else {
@@ -739,7 +807,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
 		 * Non-atomic allocations cannot be freed with IRQs disabled
 		 */
 		WARN_ON(irqs_disabled());
-		__free_from_contiguous(dev, page, size);
+		__free_from_contiguous(dev, page, cpu_addr, size);
 	}
 }
 
@@ -795,16 +863,17 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset,
 		if (PageHighMem(page)) {
 			if (len + offset > PAGE_SIZE)
 				len = PAGE_SIZE - offset;
-			vaddr = kmap_high_get(page);
-			if (vaddr) {
-				vaddr += offset;
-				op(vaddr, len, dir);
-				kunmap_high(page);
-			} else if (cache_is_vipt()) {
-				/* unmapped pages might still be cached */
+
+			if (cache_is_vipt_nonaliasing()) {
 				vaddr = kmap_atomic(page);
 				op(vaddr + offset, len, dir);
 				kunmap_atomic(vaddr);
+			} else {
+				vaddr = kmap_high_get(page);
+				if (vaddr) {
+					op(vaddr + offset, len, dir);
+					kunmap_high(page);
+				}
 			}
 		} else {
 			vaddr = page_address(page) + offset;
@@ -825,7 +894,7 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset,
 static void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
-	unsigned long paddr;
+	phys_addr_t paddr;
 
 	dma_cache_maint_page(page, off, size, dir, dmac_map_area);
 
@@ -841,20 +910,35 @@ static void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
 static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
-	unsigned long paddr = page_to_phys(page) + off;
+	phys_addr_t paddr = page_to_phys(page) + off;
 
 	/* FIXME: non-speculating: not required */
-	/* don't bother invalidating if DMA to device */
-	if (dir != DMA_TO_DEVICE)
+	/* in any case, don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
 		outer_inv_range(paddr, paddr + size);
 
-	dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
+		dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
+	}
 
 	/*
-	 * Mark the D-cache clean for this page to avoid extra flushing.
+	 * Mark the D-cache clean for these pages to avoid extra flushing.
 	 */
-	if (dir != DMA_TO_DEVICE && off == 0 && size >= PAGE_SIZE)
-		set_bit(PG_dcache_clean, &page->flags);
+	if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) {
+		unsigned long pfn;
+		size_t left = size;
+
+		pfn = page_to_pfn(page) + off / PAGE_SIZE;
+		off %= PAGE_SIZE;
+		if (off) {
+			pfn++;
+			left -= PAGE_SIZE - off;
+		}
+		while (left >= PAGE_SIZE) {
+			page = pfn_to_page(pfn++);
+			set_bit(PG_dcache_clean, &page->flags);
+			left -= PAGE_SIZE;
+		}
+	}
 }
 
 /**
@@ -965,9 +1049,7 @@ void arm_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
  */
 int dma_supported(struct device *dev, u64 mask)
 {
-	if (mask < (u64)arm_dma_limit)
-		return 0;
-	return 1;
+	return __dma_supported(dev, mask, false);
 }
 EXPORT_SYMBOL(dma_supported);
 
@@ -994,45 +1076,98 @@ fs_initcall(dma_debug_do_init);
 
 /* IOMMU */
 
+static int extend_iommu_mapping(struct dma_iommu_mapping *mapping);
+
 static inline dma_addr_t __alloc_iova(struct dma_iommu_mapping *mapping,
 				      size_t size)
 {
 	unsigned int order = get_order(size);
 	unsigned int align = 0;
 	unsigned int count, start;
+	size_t mapping_size = mapping->bits << PAGE_SHIFT;
 	unsigned long flags;
+	dma_addr_t iova;
+	int i;
 
-	count = ((PAGE_ALIGN(size) >> PAGE_SHIFT) +
-		 (1 << mapping->order) - 1) >> mapping->order;
+	if (order > CONFIG_ARM_DMA_IOMMU_ALIGNMENT)
+		order = CONFIG_ARM_DMA_IOMMU_ALIGNMENT;
 
-	if (order > mapping->order)
-		align = (1 << (order - mapping->order)) - 1;
+	count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	align = (1 << order) - 1;
 
 	spin_lock_irqsave(&mapping->lock, flags);
-	start = bitmap_find_next_zero_area(mapping->bitmap, mapping->bits, 0,
-					   count, align);
-	if (start > mapping->bits) {
-		spin_unlock_irqrestore(&mapping->lock, flags);
-		return DMA_ERROR_CODE;
+	for (i = 0; i < mapping->nr_bitmaps; i++) {
+		start = bitmap_find_next_zero_area(mapping->bitmaps[i],
+				mapping->bits, 0, count, align);
+
+		if (start > mapping->bits)
+			continue;
+
+		bitmap_set(mapping->bitmaps[i], start, count);
+		break;
 	}
 
-	bitmap_set(mapping->bitmap, start, count);
+	/*
+	 * No unused range found. Try to extend the existing mapping
+	 * and perform a second attempt to reserve an IO virtual
+	 * address range of size bytes.
+	 */
+	if (i == mapping->nr_bitmaps) {
+		if (extend_iommu_mapping(mapping)) {
+			spin_unlock_irqrestore(&mapping->lock, flags);
+			return DMA_ERROR_CODE;
+		}
+
+		start = bitmap_find_next_zero_area(mapping->bitmaps[i],
+				mapping->bits, 0, count, align);
+
+		if (start > mapping->bits) {
+			spin_unlock_irqrestore(&mapping->lock, flags);
+			return DMA_ERROR_CODE;
+		}
+
+		bitmap_set(mapping->bitmaps[i], start, count);
+	}
 	spin_unlock_irqrestore(&mapping->lock, flags);
 
-	return mapping->base + (start << (mapping->order + PAGE_SHIFT));
+	iova = mapping->base + (mapping_size * i);
+	iova += start << PAGE_SHIFT;
+
+	return iova;
 }
 
 static inline void __free_iova(struct dma_iommu_mapping *mapping,
 			       dma_addr_t addr, size_t size)
 {
-	unsigned int start = (addr - mapping->base) >>
-			     (mapping->order + PAGE_SHIFT);
-	unsigned int count = ((size >> PAGE_SHIFT) +
-			      (1 << mapping->order) - 1) >> mapping->order;
+	unsigned int start, count;
+	size_t mapping_size = mapping->bits << PAGE_SHIFT;
 	unsigned long flags;
+	dma_addr_t bitmap_base;
+	u32 bitmap_index;
+
+	if (!size)
+		return;
+
+	bitmap_index = (u32) (addr - mapping->base) / (u32) mapping_size;
+	BUG_ON(addr < mapping->base || bitmap_index > mapping->extensions);
+
+	bitmap_base = mapping->base + mapping_size * bitmap_index;
+
+	start = (addr - bitmap_base) >>	PAGE_SHIFT;
+
+	if (addr + size > bitmap_base + mapping_size) {
+		/*
+		 * The address range to be freed reaches into the iova
+		 * range of the next bitmap. This should not happen as
+		 * we don't allow this in __alloc_iova (at the
+		 * moment).
+		 */
+		BUG();
+	} else
+		count = size >> PAGE_SHIFT;
 
 	spin_lock_irqsave(&mapping->lock, flags);
-	bitmap_clear(mapping->bitmap, start, count);
+	bitmap_clear(mapping->bitmaps[bitmap_index], start, count);
 	spin_unlock_irqrestore(&mapping->lock, flags);
 }
 
@@ -1068,12 +1203,17 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 		return pages;
 	}
 
+	/*
+	 * IOMMU can map any pages, so himem can also be used here
+	 */
+	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
 	while (count) {
 		int j, order = __fls(count);
 
-		pages[i] = alloc_pages(gfp | __GFP_NOWARN, order);
+		pages[i] = alloc_pages(gfp, order);
 		while (!pages[i] && order)
-			pages[i] = alloc_pages(gfp | __GFP_NOWARN, --order);
+			pages[i] = alloc_pages(gfp, --order);
 		if (!pages[i])
 			goto error;
 
@@ -1182,7 +1322,8 @@ __iommu_create_mapping(struct device *dev, struct page **pages, size_t size)
 				break;
 
 		len = (j - i) << PAGE_SHIFT;
-		ret = iommu_map(mapping->domain, iova, phys, len, 0);
+		ret = iommu_map(mapping->domain, iova, phys, len,
+				IOMMU_READ|IOMMU_WRITE);
 		if (ret < 0)
 			goto fail;
 		iova += len;
@@ -1257,26 +1398,35 @@ err_mapping:
 	return NULL;
 }
 
-static void __iommu_free_atomic(struct device *dev, struct page **pages,
+static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
 				dma_addr_t handle, size_t size)
 {
 	__iommu_remove_mapping(dev, handle, size);
-	__free_from_pool(page_address(pages[0]), size);
+	__free_from_pool(cpu_addr, size);
 }
 
 static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
 	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-	pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	struct page **pages;
 	void *addr = NULL;
 
 	*handle = DMA_ERROR_CODE;
 	size = PAGE_ALIGN(size);
 
-	if (gfp & GFP_ATOMIC)
+	if (!(gfp & __GFP_WAIT))
 		return __iommu_alloc_atomic(dev, size, handle);
 
+	/*
+	 * Following is a work-around (a.k.a. hack) to prevent pages
+	 * with __GFP_COMP being passed to split_page() which cannot
+	 * handle them.  The real problem is that this flag probably
+	 * should be 0 on ARM as it is not supported on this
+	 * platform; see CONFIG_HUGETLBFS.
+	 */
+	gfp &= ~(__GFP_COMP);
+
 	pages = __iommu_alloc_buffer(dev, size, gfp, attrs);
 	if (!pages)
 		return NULL;
@@ -1335,16 +1485,17 @@ static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 			  dma_addr_t handle, struct dma_attrs *attrs)
 {
-	struct page **pages = __iommu_get_pages(cpu_addr, attrs);
+	struct page **pages;
 	size = PAGE_ALIGN(size);
 
-	if (!pages) {
-		WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+	if (__in_atomic_pool(cpu_addr, size)) {
+		__iommu_free_atomic(dev, cpu_addr, handle, size);
 		return;
 	}
 
-	if (__in_atomic_pool(cpu_addr, size)) {
-		__iommu_free_atomic(dev, pages, handle, size);
+	pages = __iommu_get_pages(cpu_addr, attrs);
+	if (!pages) {
+		WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
 		return;
 	}
 
@@ -1371,6 +1522,27 @@ static int arm_iommu_get_sgtable(struct device *dev, struct sg_table *sgt,
 					 GFP_KERNEL);
 }
 
+static int __dma_direction_to_prot(enum dma_data_direction dir)
+{
+	int prot;
+
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		prot = IOMMU_READ | IOMMU_WRITE;
+		break;
+	case DMA_TO_DEVICE:
+		prot = IOMMU_READ;
+		break;
+	case DMA_FROM_DEVICE:
+		prot = IOMMU_WRITE;
+		break;
+	default:
+		prot = 0;
+	}
+
+	return prot;
+}
+
 /*
  * Map a part of the scatter-gather list into contiguous io address space
  */
@@ -1384,6 +1556,7 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg,
 	int ret = 0;
 	unsigned int count;
 	struct scatterlist *s;
+	int prot;
 
 	size = PAGE_ALIGN(size);
 	*handle = DMA_ERROR_CODE;
@@ -1400,7 +1573,9 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg,
 			!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 			__dma_page_cpu_to_dev(sg_page(s), s->offset, s->length, dir);
 
-		ret = iommu_map(mapping->domain, iova, phys, len, 0);
+		prot = __dma_direction_to_prot(dir);
+
+		ret = iommu_map(mapping->domain, iova, phys, len, prot);
 		if (ret < 0)
 			goto fail;
 		count += len >> PAGE_SHIFT;
@@ -1599,13 +1774,15 @@ static dma_addr_t arm_coherent_iommu_map_page(struct device *dev, struct page *p
 {
 	struct dma_iommu_mapping *mapping = dev->archdata.mapping;
 	dma_addr_t dma_addr;
-	int ret, len = PAGE_ALIGN(size + offset);
+	int ret, prot, len = PAGE_ALIGN(size + offset);
 
 	dma_addr = __alloc_iova(mapping, len);
 	if (dma_addr == DMA_ERROR_CODE)
 		return dma_addr;
 
-	ret = iommu_map(mapping->domain, dma_addr, page_to_phys(page), len, 0);
+	prot = __dma_direction_to_prot(dir);
+
+	ret = iommu_map(mapping->domain, dma_addr, page_to_phys(page), len, prot);
 	if (ret < 0)
 		goto fail;
 
@@ -1732,6 +1909,8 @@ struct dma_map_ops iommu_ops = {
 	.unmap_sg		= arm_iommu_unmap_sg,
 	.sync_sg_for_cpu	= arm_iommu_sync_sg_for_cpu,
 	.sync_sg_for_device	= arm_iommu_sync_sg_for_device,
+
+	.set_dma_mask		= arm_dma_set_mask,
 };
 
 struct dma_map_ops iommu_coherent_ops = {
@@ -1745,14 +1924,15 @@ struct dma_map_ops iommu_coherent_ops = {
 
 	.map_sg		= arm_coherent_iommu_map_sg,
 	.unmap_sg	= arm_coherent_iommu_unmap_sg,
+
+	.set_dma_mask	= arm_dma_set_mask,
 };
 
 /**
  * arm_iommu_create_mapping
  * @bus: pointer to the bus holding the client device (for IOMMU calls)
  * @base: start address of the valid IO address space
- * @size: size of the valid IO address space
- * @order: accuracy of the IO addresses allocations
+ * @size: maximum size of the valid IO address space
  *
  * Creates a mapping structure which holds information about used/unused
  * IO address ranges, which is required to perform memory allocation and
@@ -1762,59 +1942,97 @@ struct dma_map_ops iommu_coherent_ops = {
  * arm_iommu_attach_device function.
  */
 struct dma_iommu_mapping *
-arm_iommu_create_mapping(struct bus_type *bus, dma_addr_t base, size_t size,
-			 int order)
+arm_iommu_create_mapping(struct bus_type *bus, dma_addr_t base, size_t size)
 {
-	unsigned int count = size >> (PAGE_SHIFT + order);
-	unsigned int bitmap_size = BITS_TO_LONGS(count) * sizeof(long);
+	unsigned int bits = size >> PAGE_SHIFT;
+	unsigned int bitmap_size = BITS_TO_LONGS(bits) * sizeof(long);
 	struct dma_iommu_mapping *mapping;
+	int extensions = 1;
 	int err = -ENOMEM;
 
-	if (!count)
+	if (!bitmap_size)
 		return ERR_PTR(-EINVAL);
 
+	if (bitmap_size > PAGE_SIZE) {
+		extensions = bitmap_size / PAGE_SIZE;
+		bitmap_size = PAGE_SIZE;
+	}
+
 	mapping = kzalloc(sizeof(struct dma_iommu_mapping), GFP_KERNEL);
 	if (!mapping)
 		goto err;
 
-	mapping->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-	if (!mapping->bitmap)
+	mapping->bitmap_size = bitmap_size;
+	mapping->bitmaps = kzalloc(extensions * sizeof(unsigned long *),
+				GFP_KERNEL);
+	if (!mapping->bitmaps)
 		goto err2;
 
+	mapping->bitmaps[0] = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!mapping->bitmaps[0])
+		goto err3;
+
+	mapping->nr_bitmaps = 1;
+	mapping->extensions = extensions;
 	mapping->base = base;
 	mapping->bits = BITS_PER_BYTE * bitmap_size;
-	mapping->order = order;
+
 	spin_lock_init(&mapping->lock);
 
 	mapping->domain = iommu_domain_alloc(bus);
 	if (!mapping->domain)
-		goto err3;
+		goto err4;
 
 	kref_init(&mapping->kref);
 	return mapping;
+err4:
+	kfree(mapping->bitmaps[0]);
 err3:
-	kfree(mapping->bitmap);
+	kfree(mapping->bitmaps);
 err2:
 	kfree(mapping);
 err:
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(arm_iommu_create_mapping);
 
 static void release_iommu_mapping(struct kref *kref)
 {
+	int i;
 	struct dma_iommu_mapping *mapping =
 		container_of(kref, struct dma_iommu_mapping, kref);
 
 	iommu_domain_free(mapping->domain);
-	kfree(mapping->bitmap);
+	for (i = 0; i < mapping->nr_bitmaps; i++)
+		kfree(mapping->bitmaps[i]);
+	kfree(mapping->bitmaps);
 	kfree(mapping);
 }
 
+static int extend_iommu_mapping(struct dma_iommu_mapping *mapping)
+{
+	int next_bitmap;
+
+	if (mapping->nr_bitmaps > mapping->extensions)
+		return -EINVAL;
+
+	next_bitmap = mapping->nr_bitmaps;
+	mapping->bitmaps[next_bitmap] = kzalloc(mapping->bitmap_size,
+						GFP_ATOMIC);
+	if (!mapping->bitmaps[next_bitmap])
+		return -ENOMEM;
+
+	mapping->nr_bitmaps++;
+
+	return 0;
+}
+
 void arm_iommu_release_mapping(struct dma_iommu_mapping *mapping)
 {
 	if (mapping)
 		kref_put(&mapping->kref, release_iommu_mapping);
 }
+EXPORT_SYMBOL_GPL(arm_iommu_release_mapping);
 
 /**
  * arm_iommu_attach_device
@@ -1843,5 +2061,32 @@ int arm_iommu_attach_device(struct device *dev,
 	pr_debug("Attached IOMMU controller to %s device.\n", dev_name(dev));
 	return 0;
 }
+EXPORT_SYMBOL_GPL(arm_iommu_attach_device);
+
+/**
+ * arm_iommu_detach_device
+ * @dev: valid struct device pointer
+ *
+ * Detaches the provided device from a previously attached map.
+ * This voids the dma operations (dma_map_ops pointer)
+ */
+void arm_iommu_detach_device(struct device *dev)
+{
+	struct dma_iommu_mapping *mapping;
+
+	mapping = to_dma_iommu_mapping(dev);
+	if (!mapping) {
+		dev_warn(dev, "Not attached\n");
+		return;
+	}
+
+	iommu_detach_device(mapping->domain, dev);
+	kref_put(&mapping->kref, release_iommu_mapping);
+	dev->archdata.mapping = NULL;
+	set_dma_ops(dev, NULL);
+
+	pr_debug("Detached IOMMU controller from %s device.\n", dev_name(dev));
+}
+EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
 
 #endif
diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c
new file mode 100644
index 00000000000..c508f41a43b
--- /dev/null
+++ b/arch/arm/mm/dump.c
@@ -0,0 +1,365 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * Derived from x86 implementation:
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+
+struct addr_marker {
+	unsigned long start_address;
+	const char *name;
+};
+
+static struct addr_marker address_markers[] = {
+	{ MODULES_VADDR,	"Modules" },
+	{ PAGE_OFFSET,		"Kernel Mapping" },
+	{ 0,			"vmalloc() Area" },
+	{ VMALLOC_END,		"vmalloc() End" },
+	{ FIXADDR_START,	"Fixmap Area" },
+	{ CONFIG_VECTORS_BASE,	"Vectors" },
+	{ CONFIG_VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" },
+	{ -1,			NULL },
+};
+
+struct pg_state {
+	struct seq_file *seq;
+	const struct addr_marker *marker;
+	unsigned long start_address;
+	unsigned level;
+	u64 current_prot;
+};
+
+struct prot_bits {
+	u64		mask;
+	u64		val;
+	const char	*set;
+	const char	*clear;
+};
+
+static const struct prot_bits pte_bits[] = {
+	{
+		.mask	= L_PTE_USER,
+		.val	= L_PTE_USER,
+		.set	= "USR",
+		.clear	= "   ",
+	}, {
+		.mask	= L_PTE_RDONLY,
+		.val	= L_PTE_RDONLY,
+		.set	= "ro",
+		.clear	= "RW",
+	}, {
+		.mask	= L_PTE_XN,
+		.val	= L_PTE_XN,
+		.set	= "NX",
+		.clear	= "x ",
+	}, {
+		.mask	= L_PTE_SHARED,
+		.val	= L_PTE_SHARED,
+		.set	= "SHD",
+		.clear	= "   ",
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_UNCACHED,
+		.set	= "SO/UNCACHED",
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_BUFFERABLE,
+		.set	= "MEM/BUFFERABLE/WC",
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_WRITETHROUGH,
+		.set	= "MEM/CACHED/WT",
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_WRITEBACK,
+		.set	= "MEM/CACHED/WBRA",
+#ifndef CONFIG_ARM_LPAE
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_MINICACHE,
+		.set	= "MEM/MINICACHE",
+#endif
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_WRITEALLOC,
+		.set	= "MEM/CACHED/WBWA",
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_DEV_SHARED,
+		.set	= "DEV/SHARED",
+#ifndef CONFIG_ARM_LPAE
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_DEV_NONSHARED,
+		.set	= "DEV/NONSHARED",
+#endif
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_DEV_WC,
+		.set	= "DEV/WC",
+	}, {
+		.mask	= L_PTE_MT_MASK,
+		.val	= L_PTE_MT_DEV_CACHED,
+		.set	= "DEV/CACHED",
+	},
+};
+
+static const struct prot_bits section_bits[] = {
+#ifdef CONFIG_ARM_LPAE
+	{
+		.mask	= PMD_SECT_USER,
+		.val	= PMD_SECT_USER,
+		.set	= "USR",
+	}, {
+		.mask	= PMD_SECT_RDONLY,
+		.val	= PMD_SECT_RDONLY,
+		.set	= "ro",
+		.clear	= "RW",
+#elif __LINUX_ARM_ARCH__ >= 6
+	{
+		.mask	= PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val	= PMD_SECT_APX | PMD_SECT_AP_WRITE,
+		.set	= "    ro",
+	}, {
+		.mask	= PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val	= PMD_SECT_AP_WRITE,
+		.set	= "    RW",
+	}, {
+		.mask	= PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val	= PMD_SECT_AP_READ,
+		.set	= "USR ro",
+	}, {
+		.mask	= PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val	= PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.set	= "USR RW",
+#else /* ARMv4/ARMv5  */
+	/* These are approximate */
+	{
+		.mask   = PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val    = 0,
+		.set    = "    ro",
+	}, {
+		.mask   = PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val    = PMD_SECT_AP_WRITE,
+		.set    = "    RW",
+	}, {
+		.mask   = PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val    = PMD_SECT_AP_READ,
+		.set    = "USR ro",
+	}, {
+		.mask   = PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.val    = PMD_SECT_AP_READ | PMD_SECT_AP_WRITE,
+		.set    = "USR RW",
+#endif
+	}, {
+		.mask	= PMD_SECT_XN,
+		.val	= PMD_SECT_XN,
+		.set	= "NX",
+		.clear	= "x ",
+	}, {
+		.mask	= PMD_SECT_S,
+		.val	= PMD_SECT_S,
+		.set	= "SHD",
+		.clear	= "   ",
+	},
+};
+
+struct pg_level {
+	const struct prot_bits *bits;
+	size_t num;
+	u64 mask;
+};
+
+static struct pg_level pg_level[] = {
+	{
+	}, { /* pgd */
+	}, { /* pud */
+	}, { /* pmd */
+		.bits	= section_bits,
+		.num	= ARRAY_SIZE(section_bits),
+	}, { /* pte */
+		.bits	= pte_bits,
+		.num	= ARRAY_SIZE(pte_bits),
+	},
+};
+
+static void dump_prot(struct pg_state *st, const struct prot_bits *bits, size_t num)
+{
+	unsigned i;
+
+	for (i = 0; i < num; i++, bits++) {
+		const char *s;
+
+		if ((st->current_prot & bits->mask) == bits->val)
+			s = bits->set;
+		else
+			s = bits->clear;
+
+		if (s)
+			seq_printf(st->seq, " %s", s);
+	}
+}
+
+static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u64 val)
+{
+	static const char units[] = "KMGTPE";
+	u64 prot = val & pg_level[level].mask;
+
+	if (addr < USER_PGTABLES_CEILING)
+		return;
+
+	if (!st->level) {
+		st->level = level;
+		st->current_prot = prot;
+		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+	} else if (prot != st->current_prot || level != st->level ||
+		   addr >= st->marker[1].start_address) {
+		const char *unit = units;
+		unsigned long delta;
+
+		if (st->current_prot) {
+			seq_printf(st->seq, "0x%08lx-0x%08lx   ",
+				   st->start_address, addr);
+
+			delta = (addr - st->start_address) >> 10;
+			while (!(delta & 1023) && unit[1]) {
+				delta >>= 10;
+				unit++;
+			}
+			seq_printf(st->seq, "%9lu%c", delta, *unit);
+			if (pg_level[st->level].bits)
+				dump_prot(st, pg_level[st->level].bits, pg_level[st->level].num);
+			seq_printf(st->seq, "\n");
+		}
+
+		if (addr >= st->marker[1].start_address) {
+			st->marker++;
+			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+		}
+		st->start_address = addr;
+		st->current_prot = prot;
+		st->level = level;
+	}
+}
+
+static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
+{
+	pte_t *pte = pte_offset_kernel(pmd, 0);
+	unsigned long addr;
+	unsigned i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+		addr = start + i * PAGE_SIZE;
+		note_page(st, addr, 4, pte_val(*pte));
+	}
+}
+
+static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
+{
+	pmd_t *pmd = pmd_offset(pud, 0);
+	unsigned long addr;
+	unsigned i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+		addr = start + i * PMD_SIZE;
+		if (pmd_none(*pmd) || pmd_large(*pmd) || !pmd_present(*pmd))
+			note_page(st, addr, 3, pmd_val(*pmd));
+		else
+			walk_pte(st, pmd, addr);
+
+		if (SECTION_SIZE < PMD_SIZE && pmd_large(pmd[1]))
+			note_page(st, addr + SECTION_SIZE, 3, pmd_val(pmd[1]));
+	}
+}
+
+static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+	pud_t *pud = pud_offset(pgd, 0);
+	unsigned long addr;
+	unsigned i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+		addr = start + i * PUD_SIZE;
+		if (!pud_none(*pud)) {
+			walk_pmd(st, pud, addr);
+		} else {
+			note_page(st, addr, 2, pud_val(*pud));
+		}
+	}
+}
+
+static void walk_pgd(struct seq_file *m)
+{
+	pgd_t *pgd = swapper_pg_dir;
+	struct pg_state st;
+	unsigned long addr;
+	unsigned i, pgdoff = USER_PGTABLES_CEILING / PGDIR_SIZE;
+
+	memset(&st, 0, sizeof(st));
+	st.seq = m;
+	st.marker = address_markers;
+
+	pgd += pgdoff;
+
+	for (i = pgdoff; i < PTRS_PER_PGD; i++, pgd++) {
+		addr = i * PGDIR_SIZE;
+		if (!pgd_none(*pgd)) {
+			walk_pud(&st, pgd, addr);
+		} else {
+			note_page(&st, addr, 1, pgd_val(*pgd));
+		}
+	}
+
+	note_page(&st, 0, 0, 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+	walk_pgd(m);
+	return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+	.open		= ptdump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int ptdump_init(void)
+{
+	struct dentry *pe;
+	unsigned i, j;
+
+	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+		if (pg_level[i].bits)
+			for (j = 0; j < pg_level[i].num; j++)
+				pg_level[i].mask |= pg_level[i].bits[j].mask;
+
+	address_markers[2].start_address = VMALLOC_START;
+
+	pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
+				 &ptdump_fops);
+	return pe ? 0 : -ENOMEM;
+}
+__initcall(ptdump_init);
diff --git a/arch/arm/mm/extable.c b/arch/arm/mm/extable.c
index 9d285626bc7..312e15e6d00 100644
--- a/arch/arm/mm/extable.c
+++ b/arch/arm/mm/extable.c
@@ -9,8 +9,13 @@ int fixup_exception(struct pt_regs *regs)
 	const struct exception_table_entry *fixup;
 
 	fixup = search_exception_tables(instruction_pointer(regs));
-	if (fixup)
+	if (fixup) {
 		regs->ARM_pc = fixup->fixup;
+#ifdef CONFIG_THUMB2_KERNEL
+		/* Clear the IT state to avoid nasty surprises in the fixup */
+		regs->ARM_cpsr &= ~PSR_IT_MASK;
+#endif
+	}
 
 	return fixup != NULL;
 }
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 2a5907b5c8d..ff379ac115d 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -65,7 +65,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
 	return ret;
 }
 
-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
 /*
  * If we are using split PTE locks, then we need to take the page
  * lock here.  Otherwise we are using shared mm->page_table_lock
@@ -84,10 +84,10 @@ static inline void do_pte_unlock(spinlock_t *ptl)
 {
 	spin_unlock(ptl);
 }
-#else /* !USE_SPLIT_PTLOCKS */
+#else /* !USE_SPLIT_PTE_PTLOCKS */
 static inline void do_pte_lock(spinlock_t *ptl) {}
 static inline void do_pte_unlock(spinlock_t *ptl) {}
-#endif /* USE_SPLIT_PTLOCKS */
+#endif /* USE_SPLIT_PTE_PTLOCKS */
 
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
 	unsigned long pfn)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 5dbf13f954f..eb8830a4c5e 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	int fault, sig, code;
-	int write = fsr & FSR_WRITE;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	if (notify_page_fault(regs, fsr))
 		return 0;
@@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (fsr & FSR_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * As per x86, we may deadlock here.  However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -349,6 +352,13 @@ retry:
 	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we
+	 * have no context to handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -359,13 +369,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to
@@ -491,12 +494,14 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
  * Some section permission faults need to be handled gracefully.
  * They can happen due to a __{get,put}_user during an oops.
  */
+#ifndef CONFIG_ARM_LPAE
 static int
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	do_bad_area(addr, fsr, regs);
 	return 0;
 }
+#endif /* CONFIG_ARM_LPAE */
 
 /*
  * This abort handler always returns "fault".
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 1c8f7f56417..43d54f5b26b 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -17,6 +17,7 @@
 #include <asm/highmem.h>
 #include <asm/smp_plat.h>
 #include <asm/tlbflush.h>
+#include <linux/hugetlb.h>
 
 #include "mm.h"
 
@@ -103,17 +104,20 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsig
 #define flush_icache_alias(pfn,vaddr,len)	do { } while (0)
 #endif
 
+#define FLAG_PA_IS_EXEC 1
+#define FLAG_PA_CORE_IN_MM 2
+
 static void flush_ptrace_access_other(void *args)
 {
 	__flush_icache_all();
 }
 
-static
-void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
-			 unsigned long uaddr, void *kaddr, unsigned long len)
+static inline
+void __flush_ptrace_access(struct page *page, unsigned long uaddr, void *kaddr,
+			   unsigned long len, unsigned int flags)
 {
 	if (cache_is_vivt()) {
-		if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
+		if (flags & FLAG_PA_CORE_IN_MM) {
 			unsigned long addr = (unsigned long)kaddr;
 			__cpuc_coherent_kern_range(addr, addr + len);
 		}
@@ -127,7 +131,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 	}
 
 	/* VIPT non-aliasing D-cache */
-	if (vma->vm_flags & VM_EXEC) {
+	if (flags & FLAG_PA_IS_EXEC) {
 		unsigned long addr = (unsigned long)kaddr;
 		if (icache_is_vipt_aliasing())
 			flush_icache_alias(page_to_pfn(page), uaddr, len);
@@ -139,6 +143,26 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 	}
 }
 
+static
+void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
+			 unsigned long uaddr, void *kaddr, unsigned long len)
+{
+	unsigned int flags = 0;
+	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm)))
+		flags |= FLAG_PA_CORE_IN_MM;
+	if (vma->vm_flags & VM_EXEC)
+		flags |= FLAG_PA_IS_EXEC;
+	__flush_ptrace_access(page, uaddr, kaddr, len, flags);
+}
+
+void flush_uprobe_xol_access(struct page *page, unsigned long uaddr,
+			     void *kaddr, unsigned long len)
+{
+	unsigned int flags = FLAG_PA_CORE_IN_MM|FLAG_PA_IS_EXEC;
+
+	__flush_ptrace_access(page, uaddr, kaddr, len, flags);
+}
+
 /*
  * Copy user data from/to a page which is mapped into a different
  * processes address space.  Really, we want to allow our "user
@@ -168,17 +192,24 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page)
 	 * coherent with the kernels mapping.
 	 */
 	if (!PageHighMem(page)) {
-		__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
+		size_t page_size = PAGE_SIZE << compound_order(page);
+		__cpuc_flush_dcache_area(page_address(page), page_size);
 	} else {
-		void *addr = kmap_high_get(page);
-		if (addr) {
-			__cpuc_flush_dcache_area(addr, PAGE_SIZE);
-			kunmap_high(page);
-		} else if (cache_is_vipt()) {
-			/* unmapped pages might still be cached */
-			addr = kmap_atomic(page);
-			__cpuc_flush_dcache_area(addr, PAGE_SIZE);
-			kunmap_atomic(addr);
+		unsigned long i;
+		if (cache_is_vipt_nonaliasing()) {
+			for (i = 0; i < (1 << compound_order(page)); i++) {
+				void *addr = kmap_atomic(page + i);
+				__cpuc_flush_dcache_area(addr, PAGE_SIZE);
+				kunmap_atomic(addr);
+			}
+		} else {
+			for (i = 0; i < (1 << compound_order(page)); i++) {
+				void *addr = kmap_high_get(page + i);
+				if (addr) {
+					__cpuc_flush_dcache_area(addr, PAGE_SIZE);
+					kunmap_high(page + i);
+				}
+			}
 		}
 	}
 
@@ -284,7 +315,7 @@ void flush_dcache_page(struct page *page)
 	mapping = page_mapping(page);
 
 	if (!cache_ops_need_broadcast() &&
-	    mapping && !mapping_mapped(mapping))
+	    mapping && !page_mapped(page))
 		clear_bit(PG_dcache_clean, &page->flags);
 	else {
 		__flush_dcache_page(mapping, page);
@@ -298,6 +329,39 @@ void flush_dcache_page(struct page *page)
 EXPORT_SYMBOL(flush_dcache_page);
 
 /*
+ * Ensure cache coherency for the kernel mapping of this page. We can
+ * assume that the page is pinned via kmap.
+ *
+ * If the page only exists in the page cache and there are no user
+ * space mappings, this is a no-op since the page was already marked
+ * dirty at creation.  Otherwise, we need to flush the dirty kernel
+ * cache lines directly.
+ */
+void flush_kernel_dcache_page(struct page *page)
+{
+	if (cache_is_vivt() || cache_is_vipt_aliasing()) {
+		struct address_space *mapping;
+
+		mapping = page_mapping(page);
+
+		if (!mapping || mapping_mapped(mapping)) {
+			void *addr;
+
+			addr = page_address(page);
+			/*
+			 * kmap_atomic() doesn't set the page virtual
+			 * address for highmem pages, and
+			 * kunmap_atomic() takes care of cache
+			 * flushing already.
+			 */
+			if (!IS_ENABLED(CONFIG_HIGHMEM) || addr)
+				__cpuc_flush_dcache_area(addr, PAGE_SIZE);
+		}
+	}
+}
+EXPORT_SYMBOL(flush_kernel_dcache_page);
+
+/*
  * Flush an anonymous page so that users of get_user_pages()
  * can safely access the data.  The expected sequence is:
  *
diff --git a/arch/arm/mm/fsr-3level.c b/arch/arm/mm/fsr-3level.c
index 05a4e943183..ab4409a2307 100644
--- a/arch/arm/mm/fsr-3level.c
+++ b/arch/arm/mm/fsr-3level.c
@@ -9,11 +9,11 @@ static struct fsr_info fsr_info[] = {
 	{ do_page_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	},
 	{ do_bad,		SIGBUS,  0,		"reserved access flag fault"	},
 	{ do_bad,		SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault"	},
-	{ do_bad,		SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault"	},
+	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault"	},
 	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault"	},
 	{ do_bad,		SIGBUS,  0,		"reserved permission fault"	},
 	{ do_bad,		SIGSEGV, SEGV_ACCERR,	"level 1 permission fault"	},
-	{ do_sect_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault"	},
+	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault"	},
 	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault"	},
 	{ do_bad,		SIGBUS,  0,		"synchronous external abort"	},
 	{ do_bad,		SIGBUS,  0,		"asynchronous external abort"	},
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index 21b9e1bf9b7..45aeaaca905 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -18,6 +18,21 @@
 #include <asm/tlbflush.h>
 #include "mm.h"
 
+pte_t *fixmap_page_table;
+
+static inline void set_fixmap_pte(int idx, pte_t pte)
+{
+	unsigned long vaddr = __fix_to_virt(idx);
+	set_pte_ext(fixmap_page_table + idx, pte, 0);
+	local_flush_tlb_kernel_page(vaddr);
+}
+
+static inline pte_t get_fixmap_pte(unsigned long vaddr)
+{
+	unsigned long idx = __virt_to_fix(vaddr);
+	return *(fixmap_page_table + idx);
+}
+
 void *kmap(struct page *page)
 {
 	might_sleep();
@@ -63,20 +78,20 @@ void *kmap_atomic(struct page *page)
 	type = kmap_atomic_idx_push();
 
 	idx = type + KM_TYPE_NR * smp_processor_id();
-	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	vaddr = __fix_to_virt(idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
 	/*
 	 * With debugging enabled, kunmap_atomic forces that entry to 0.
 	 * Make sure it was indeed properly unmapped.
 	 */
-	BUG_ON(!pte_none(get_top_pte(vaddr)));
+	BUG_ON(!pte_none(*(fixmap_page_table + idx)));
 #endif
 	/*
 	 * When debugging is off, kunmap_atomic leaves the previous mapping
 	 * in place, so the contained TLB flush ensures the TLB is updated
 	 * with the new mapping.
 	 */
-	set_top_pte(vaddr, mk_pte(page, kmap_prot));
+	set_fixmap_pte(idx, mk_pte(page, kmap_prot));
 
 	return (void *)vaddr;
 }
@@ -94,8 +109,8 @@ void __kunmap_atomic(void *kvaddr)
 		if (cache_is_vivt())
 			__cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
 #ifdef CONFIG_DEBUG_HIGHMEM
-		BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-		set_top_pte(vaddr, __pte(0));
+		BUG_ON(vaddr != __fix_to_virt(idx));
+		set_fixmap_pte(idx, __pte(0));
 #else
 		(void) idx;  /* to kill a warning */
 #endif
@@ -117,11 +132,11 @@ void *kmap_atomic_pfn(unsigned long pfn)
 
 	type = kmap_atomic_idx_push();
 	idx = type + KM_TYPE_NR * smp_processor_id();
-	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	vaddr = __fix_to_virt(idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
-	BUG_ON(!pte_none(get_top_pte(vaddr)));
+	BUG_ON(!pte_none(*(fixmap_page_table + idx)));
 #endif
-	set_top_pte(vaddr, pfn_pte(pfn, kmap_prot));
+	set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
 
 	return (void *)vaddr;
 }
@@ -133,5 +148,5 @@ struct page *kmap_atomic_to_page(const void *ptr)
 	if (vaddr < FIXADDR_START)
 		return virt_to_page(ptr);
 
-	return pte_page(get_top_pte(vaddr));
+	return pte_page(get_fixmap_pte(vaddr));
 }
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
new file mode 100644
index 00000000000..66781bf3407
--- /dev/null
+++ b/arch/arm/mm/hugetlbpage.c
@@ -0,0 +1,58 @@
+/*
+ * arch/arm/mm/hugetlbpage.c
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+/*
+ * On ARM, huge pages are backed by pmd's rather than pte's, so we do a lot
+ * of type casting from pmd_t * to pte_t *.
+ */
+
+struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
+			      int write)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+	return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
+}
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 99db769307e..c447ec70e86 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -1,4 +1,6 @@
+#include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 
 #include <asm/cputype.h>
 #include <asm/idmap.h>
@@ -7,7 +9,13 @@
 #include <asm/sections.h>
 #include <asm/system_info.h>
 
+/*
+ * Note: accesses outside of the kernel image and the identity map area
+ * are not supported on any CPU using the idmap tables as its current
+ * page tables.
+ */
 pgd_t *idmap_pgd;
+phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
 
 #ifdef CONFIG_ARM_LPAE
 static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
@@ -22,6 +30,13 @@ static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 			pr_warning("Failed to allocate identity pmd.\n");
 			return;
 		}
+		/*
+		 * Copy the original PMD to ensure that the PMD entries for
+		 * the kernel image are preserved.
+		 */
+		if (!pud_none(*pud))
+			memcpy(pmd, pmd_offset(pud, 0),
+			       PTRS_PER_PMD * sizeof(pmd_t));
 		pud_populate(&init_mm, pud, pmd);
 		pmd += pmd_index(addr);
 	} else
@@ -59,11 +74,18 @@ static void idmap_add_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 	} while (pud++, addr = next, addr != end);
 }
 
-static void identity_mapping_add(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void identity_mapping_add(pgd_t *pgd, const char *text_start,
+				 const char *text_end, unsigned long prot)
 {
-	unsigned long prot, next;
+	unsigned long addr, end;
+	unsigned long next;
+
+	addr = virt_to_idmap(text_start);
+	end = virt_to_idmap(text_end);
+	pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
+
+	prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
 
-	prot = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
 	if (cpu_architecture() <= CPU_ARCH_ARMv5TEJ && !cpu_is_xscale())
 		prot |= PMD_BIT4;
 
@@ -78,19 +100,12 @@ extern char  __idmap_text_start[], __idmap_text_end[];
 
 static int __init init_static_idmap(void)
 {
-	phys_addr_t idmap_start, idmap_end;
-
 	idmap_pgd = pgd_alloc(&init_mm);
 	if (!idmap_pgd)
 		return -ENOMEM;
 
-	/* Add an identity mapping for the physical address of the section. */
-	idmap_start = virt_to_phys((void *)__idmap_text_start);
-	idmap_end = virt_to_phys((void *)__idmap_text_end);
-
-	pr_info("Setting up static identity map for 0x%llx - 0x%llx\n",
-		(long long)idmap_start, (long long)idmap_end);
-	identity_mapping_add(idmap_pgd, idmap_start, idmap_end);
+	identity_mapping_add(idmap_pgd, __idmap_text_start,
+			     __idmap_text_end, 0);
 
 	/* Flush L1 for the hardware to see this page table content */
 	flush_cache_louis();
@@ -108,6 +123,7 @@ void setup_mm_for_reboot(void)
 {
 	/* Switch to the identity mapping. */
 	cpu_switch_mm(idmap_pgd, &init_mm);
+	local_flush_bp_all();
 
 #ifdef CONFIG_CPU_HAS_ASID
 	/*
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index ad722f1208a..659c75d808d 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/dma-contiguous.h>
 #include <linux/sizes.h>
 
+#include <asm/cp15.h>
 #include <asm/mach-types.h>
 #include <asm/memblock.h>
 #include <asm/prom.h>
@@ -36,12 +37,21 @@
 
 #include "mm.h"
 
-static unsigned long phys_initrd_start __initdata = 0;
+#ifdef CONFIG_CPU_CP15_MMU
+unsigned long __init __clear_cr(unsigned long mask)
+{
+	cr_alignment = cr_alignment & ~mask;
+	return cr_alignment;
+}
+#endif
+
+static phys_addr_t phys_initrd_start __initdata = 0;
 static unsigned long phys_initrd_size __initdata = 0;
 
 static int __init early_initrd(char *p)
 {
-	unsigned long start, size;
+	phys_addr_t start;
+	unsigned long size;
 	char *endp;
 
 	start = memparse(p, &endp);
@@ -75,37 +85,26 @@ static int __init parse_tag_initrd2(const struct tag *tag)
 
 __tagtable(ATAG_INITRD2, parse_tag_initrd2);
 
-#ifdef CONFIG_OF_FLATTREE
-void __init early_init_dt_setup_initrd_arch(unsigned long start, unsigned long end)
-{
-	phys_initrd_start = start;
-	phys_initrd_size = end - start;
-}
-#endif /* CONFIG_OF_FLATTREE */
-
 /*
  * This keeps memory configuration data used by a couple memory
  * initialization functions, as well as show_mem() for the skipping
  * of holes in the memory map.  It is populated by arm_add_memory().
  */
-struct meminfo meminfo;
-
 void show_mem(unsigned int filter)
 {
 	int free = 0, total = 0, reserved = 0;
-	int shared = 0, cached = 0, slab = 0, i;
-	struct meminfo * mi = &meminfo;
+	int shared = 0, cached = 0, slab = 0;
+	struct memblock_region *reg;
 
 	printk("Mem-info:\n");
 	show_free_areas(filter);
 
-	for_each_bank (i, mi) {
-		struct membank *bank = &mi->bank[i];
+	for_each_memblock (memory, reg) {
 		unsigned int pfn1, pfn2;
 		struct page *page, *end;
 
-		pfn1 = bank_pfn_start(bank);
-		pfn2 = bank_pfn_end(bank);
+		pfn1 = memblock_region_memory_base_pfn(reg);
+		pfn2 = memblock_region_memory_end_pfn(reg);
 
 		page = pfn_to_page(pfn1);
 		end  = pfn_to_page(pfn2 - 1) + 1;
@@ -122,8 +121,9 @@ void show_mem(unsigned int filter)
 				free++;
 			else
 				shared += page_count(page) - 1;
-			page++;
-		} while (page < end);
+			pfn1++;
+			page = pfn_to_page(pfn1);
+		} while (pfn1 < pfn2);
 	}
 
 	printk("%d pages of RAM\n", total);
@@ -137,73 +137,14 @@ void show_mem(unsigned int filter)
 static void __init find_limits(unsigned long *min, unsigned long *max_low,
 			       unsigned long *max_high)
 {
-	struct meminfo *mi = &meminfo;
-	int i;
-
-	/* This assumes the meminfo array is properly sorted */
-	*min = bank_pfn_start(&mi->bank[0]);
-	for_each_bank (i, mi)
-		if (mi->bank[i].highmem)
-				break;
-	*max_low = bank_pfn_end(&mi->bank[i - 1]);
-	*max_high = bank_pfn_end(&mi->bank[mi->nr_banks - 1]);
-}
-
-static void __init arm_bootmem_init(unsigned long start_pfn,
-	unsigned long end_pfn)
-{
-	struct memblock_region *reg;
-	unsigned int boot_pages;
-	phys_addr_t bitmap;
-	pg_data_t *pgdat;
-
-	/*
-	 * Allocate the bootmem bitmap page.  This must be in a region
-	 * of memory which has already been mapped.
-	 */
-	boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
-	bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
-				__pfn_to_phys(end_pfn));
-
-	/*
-	 * Initialise the bootmem allocator, handing the
-	 * memory banks over to bootmem.
-	 */
-	node_set_online(0);
-	pgdat = NODE_DATA(0);
-	init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
-
-	/* Free the lowmem regions from memblock into bootmem. */
-	for_each_memblock(memory, reg) {
-		unsigned long start = memblock_region_memory_base_pfn(reg);
-		unsigned long end = memblock_region_memory_end_pfn(reg);
-
-		if (end >= end_pfn)
-			end = end_pfn;
-		if (start >= end)
-			break;
-
-		free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
-	}
-
-	/* Reserve the lowmem memblock reserved regions in bootmem. */
-	for_each_memblock(reserved, reg) {
-		unsigned long start = memblock_region_reserved_base_pfn(reg);
-		unsigned long end = memblock_region_reserved_end_pfn(reg);
-
-		if (end >= end_pfn)
-			end = end_pfn;
-		if (start >= end)
-			break;
-
-		reserve_bootmem(__pfn_to_phys(start),
-			        (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
-	}
+	*max_low = PFN_DOWN(memblock_get_current_limit());
+	*min = PFN_UP(memblock_start_of_DRAM());
+	*max_high = PFN_DOWN(memblock_end_of_DRAM());
 }
 
 #ifdef CONFIG_ZONE_DMA
 
-unsigned long arm_dma_zone_size __read_mostly;
+phys_addr_t arm_dma_zone_size __read_mostly;
 EXPORT_SYMBOL(arm_dma_zone_size);
 
 /*
@@ -213,6 +154,7 @@ EXPORT_SYMBOL(arm_dma_zone_size);
  * so a successful GFP_DMA allocation will always satisfy this.
  */
 phys_addr_t arm_dma_limit;
+unsigned long arm_dma_pfn_limit;
 
 static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole,
 	unsigned long dma_size)
@@ -227,7 +169,7 @@ static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole,
 }
 #endif
 
-void __init setup_dma_zone(struct machine_desc *mdesc)
+void __init setup_dma_zone(const struct machine_desc *mdesc)
 {
 #ifdef CONFIG_ZONE_DMA
 	if (mdesc->dma_zone_size) {
@@ -235,10 +177,11 @@ void __init setup_dma_zone(struct machine_desc *mdesc)
 		arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1;
 	} else
 		arm_dma_limit = 0xffffffff;
+	arm_dma_pfn_limit = arm_dma_limit >> PAGE_SHIFT;
 #endif
 }
 
-static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
+static void __init zone_sizes_init(unsigned long min, unsigned long max_low,
 	unsigned long max_high)
 {
 	unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
@@ -331,13 +274,8 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
 	return phys;
 }
 
-void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
+void __init arm_memblock_init(const struct machine_desc *mdesc)
 {
-	int i;
-
-	for (i = 0; i < mi->nr_banks; i++)
-		memblock_add(mi->bank[i].start, mi->bank[i].size);
-
 	/* Register the kernel text, kernel data and initrd with memblock. */
 #ifdef CONFIG_XIP_KERNEL
 	memblock_reserve(__pa(_sdata), _end - _sdata);
@@ -345,16 +283,22 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 	memblock_reserve(__pa(_stext), _end - _stext);
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
+	/* FDT scan will populate initrd_start */
+	if (initrd_start && !phys_initrd_size) {
+		phys_initrd_start = __virt_to_phys(initrd_start);
+		phys_initrd_size = initrd_end - initrd_start;
+	}
+	initrd_start = initrd_end = 0;
 	if (phys_initrd_size &&
 	    !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
-		pr_err("INITRD: 0x%08lx+0x%08lx is not a memory region - disabling initrd\n",
-		       phys_initrd_start, phys_initrd_size);
+		pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region - disabling initrd\n",
+		       (u64)phys_initrd_start, phys_initrd_size);
 		phys_initrd_start = phys_initrd_size = 0;
 	}
 	if (phys_initrd_size &&
 	    memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
-		pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region - disabling initrd\n",
-		       phys_initrd_start, phys_initrd_size);
+		pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region - disabling initrd\n",
+		       (u64)phys_initrd_start, phys_initrd_size);
 		phys_initrd_start = phys_initrd_size = 0;
 	}
 	if (phys_initrd_size) {
@@ -367,12 +311,13 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 #endif
 
 	arm_mm_memblock_reserve();
-	arm_dt_memblock_reserve();
 
 	/* reserve any platform specific memblock areas */
 	if (mdesc->reserve)
 		mdesc->reserve();
 
+	early_init_fdt_scan_reserved_mem();
+
 	/*
 	 * reserve memory for DMA contigouos allocations,
 	 * must come from DMA area inside low memory
@@ -380,7 +325,6 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 	dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
 
 	arm_memblock_steal_permitted = false;
-	memblock_allow_resize();
 	memblock_dump_all();
 }
 
@@ -388,12 +332,11 @@ void __init bootmem_init(void)
 {
 	unsigned long min, max_low, max_high;
 
+	memblock_allow_resize();
 	max_low = max_high = 0;
 
 	find_limits(&min, &max_low, &max_high);
 
-	arm_bootmem_init(min, max_low);
-
 	/*
 	 * Sparsemem tries to allocate bootmem in memory_present(),
 	 * so must be done after the fixed reservations
@@ -410,36 +353,16 @@ void __init bootmem_init(void)
 	 * the sparse mem_map arrays initialized by sparse_init()
 	 * for memmap_init_zone(), otherwise all PFNs are invalid.
 	 */
-	arm_bootmem_free(min, max_low, max_high);
+	zone_sizes_init(min, max_low, max_high);
 
 	/*
 	 * This doesn't seem to be used by the Linux memory manager any
 	 * more, but is used by ll_rw_block.  If we can get rid of it, we
 	 * also get rid of some of the stuff above as well.
-	 *
-	 * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
-	 * the system, not the maximum PFN.
 	 */
-	max_low_pfn = max_low - PHYS_PFN_OFFSET;
-	max_pfn = max_high - PHYS_PFN_OFFSET;
-}
-
-static inline int free_area(unsigned long pfn, unsigned long end, char *s)
-{
-	unsigned int pages = 0, size = (end - pfn) << (PAGE_SHIFT - 10);
-
-	for (; pfn < end; pfn++) {
-		struct page *page = pfn_to_page(pfn);
-		ClearPageReserved(page);
-		init_page_count(page);
-		__free_page(page);
-		pages++;
-	}
-
-	if (size && s)
-		printk(KERN_INFO "Freeing %s memory: %dK\n", s, size);
-
-	return pages;
+	min_low_pfn = min;
+	max_low_pfn = max_low;
+	max_pfn = max_high;
 }
 
 /*
@@ -457,7 +380,7 @@ static inline void
 free_memmap(unsigned long start_pfn, unsigned long end_pfn)
 {
 	struct page *start_pg, *end_pg;
-	unsigned long pg, pgend;
+	phys_addr_t pg, pgend;
 
 	/*
 	 * Convert start_pfn/end_pfn to a struct page pointer.
@@ -469,75 +392,82 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
 	 * Convert to physical addresses, and
 	 * round start upwards and end downwards.
 	 */
-	pg = (unsigned long)PAGE_ALIGN(__pa(start_pg));
-	pgend = (unsigned long)__pa(end_pg) & PAGE_MASK;
+	pg = PAGE_ALIGN(__pa(start_pg));
+	pgend = __pa(end_pg) & PAGE_MASK;
 
 	/*
 	 * If there are free pages between these,
 	 * free the section of the memmap array.
 	 */
 	if (pg < pgend)
-		free_bootmem(pg, pgend - pg);
+		memblock_free_early(pg, pgend - pg);
 }
 
 /*
  * The mem_map array can get very big.  Free the unused area of the memory map.
  */
-static void __init free_unused_memmap(struct meminfo *mi)
+static void __init free_unused_memmap(void)
 {
-	unsigned long bank_start, prev_bank_end = 0;
-	unsigned int i;
+	unsigned long start, prev_end = 0;
+	struct memblock_region *reg;
 
 	/*
 	 * This relies on each bank being in address order.
 	 * The banks are sorted previously in bootmem_init().
 	 */
-	for_each_bank(i, mi) {
-		struct membank *bank = &mi->bank[i];
-
-		bank_start = bank_pfn_start(bank);
+	for_each_memblock(memory, reg) {
+		start = memblock_region_memory_base_pfn(reg);
 
 #ifdef CONFIG_SPARSEMEM
 		/*
 		 * Take care not to free memmap entries that don't exist
 		 * due to SPARSEMEM sections which aren't present.
 		 */
-		bank_start = min(bank_start,
-				 ALIGN(prev_bank_end, PAGES_PER_SECTION));
+		start = min(start,
+				 ALIGN(prev_end, PAGES_PER_SECTION));
 #else
 		/*
 		 * Align down here since the VM subsystem insists that the
 		 * memmap entries are valid from the bank start aligned to
 		 * MAX_ORDER_NR_PAGES.
 		 */
-		bank_start = round_down(bank_start, MAX_ORDER_NR_PAGES);
+		start = round_down(start, MAX_ORDER_NR_PAGES);
 #endif
 		/*
 		 * If we had a previous bank, and there is a space
 		 * between the current bank and the previous, free it.
 		 */
-		if (prev_bank_end && prev_bank_end < bank_start)
-			free_memmap(prev_bank_end, bank_start);
+		if (prev_end && prev_end < start)
+			free_memmap(prev_end, start);
 
 		/*
 		 * Align up here since the VM subsystem insists that the
 		 * memmap entries are valid from the bank end aligned to
 		 * MAX_ORDER_NR_PAGES.
 		 */
-		prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES);
+		prev_end = ALIGN(memblock_region_memory_end_pfn(reg),
+				 MAX_ORDER_NR_PAGES);
 	}
 
 #ifdef CONFIG_SPARSEMEM
-	if (!IS_ALIGNED(prev_bank_end, PAGES_PER_SECTION))
-		free_memmap(prev_bank_end,
-			    ALIGN(prev_bank_end, PAGES_PER_SECTION));
+	if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
+		free_memmap(prev_end,
+			    ALIGN(prev_end, PAGES_PER_SECTION));
 #endif
 }
 
+#ifdef CONFIG_HIGHMEM
+static inline void free_area_high(unsigned long pfn, unsigned long end)
+{
+	for (; pfn < end; pfn++)
+		free_highmem_page(pfn_to_page(pfn));
+}
+#endif
+
 static void __init free_highpages(void)
 {
 #ifdef CONFIG_HIGHMEM
-	unsigned long max_low = max_low_pfn + PHYS_PFN_OFFSET;
+	unsigned long max_low = max_low_pfn;
 	struct memblock_region *mem, *res;
 
 	/* set highmem page free */
@@ -569,8 +499,7 @@ static void __init free_highpages(void)
 			if (res_end > end)
 				res_end = end;
 			if (res_start != start)
-				totalhigh_pages += free_area(start, res_start,
-							     NULL);
+				free_area_high(start, res_start);
 			start = res_end;
 			if (start == end)
 				break;
@@ -578,9 +507,8 @@ static void __init free_highpages(void)
 
 		/* And now free anything which remains */
 		if (start < end)
-			totalhigh_pages += free_area(start, end, NULL);
+			free_area_high(start, end);
 	}
-	totalram_pages += totalhigh_pages;
 #endif
 }
 
@@ -591,71 +519,26 @@ static void __init free_highpages(void)
  */
 void __init mem_init(void)
 {
-	unsigned long reserved_pages, free_pages;
-	struct memblock_region *reg;
-	int i;
 #ifdef CONFIG_HAVE_TCM
 	/* These pointers are filled in on TCM detection */
 	extern u32 dtcm_end;
 	extern u32 itcm_end;
 #endif
 
-	max_mapnr   = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
+	set_max_mapnr(pfn_to_page(max_pfn) - mem_map);
 
 	/* this will put all unused low memory onto the freelists */
-	free_unused_memmap(&meminfo);
-
-	totalram_pages += free_all_bootmem();
+	free_unused_memmap();
+	free_all_bootmem();
 
 #ifdef CONFIG_SA1111
 	/* now that our DMA memory is actually so designated, we can free it */
-	totalram_pages += free_area(PHYS_PFN_OFFSET,
-				    __phys_to_pfn(__pa(swapper_pg_dir)), NULL);
+	free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
 #endif
 
 	free_highpages();
 
-	reserved_pages = free_pages = 0;
-
-	for_each_bank(i, &meminfo) {
-		struct membank *bank = &meminfo.bank[i];
-		unsigned int pfn1, pfn2;
-		struct page *page, *end;
-
-		pfn1 = bank_pfn_start(bank);
-		pfn2 = bank_pfn_end(bank);
-
-		page = pfn_to_page(pfn1);
-		end  = pfn_to_page(pfn2 - 1) + 1;
-
-		do {
-			if (PageReserved(page))
-				reserved_pages++;
-			else if (!page_count(page))
-				free_pages++;
-			page++;
-		} while (page < end);
-	}
-
-	/*
-	 * Since our memory may not be contiguous, calculate the
-	 * real number of pages we have in this system
-	 */
-	printk(KERN_INFO "Memory:");
-	num_physpages = 0;
-	for_each_memblock(memory, reg) {
-		unsigned long pages = memblock_region_memory_end_pfn(reg) -
-			memblock_region_memory_base_pfn(reg);
-		num_physpages += pages;
-		printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
-	}
-	printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));
-
-	printk(KERN_NOTICE "Memory: %luk/%luk available, %luk reserved, %luK highmem\n",
-		nr_free_pages() << (PAGE_SHIFT-10),
-		free_pages << (PAGE_SHIFT-10),
-		reserved_pages << (PAGE_SHIFT-10),
-		totalhigh_pages << (PAGE_SHIFT-10));
+	mem_init_print_info(NULL);
 
 #define MLK(b, t) b, t, ((t) - (b)) >> 10
 #define MLM(b, t) b, t, ((t) - (b)) >> 20
@@ -721,7 +604,7 @@ void __init mem_init(void)
 	BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE	> PAGE_OFFSET);
 #endif
 
-	if (PAGE_SIZE >= 16384 && num_physpages <= 128) {
+	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
 		extern int sysctl_overcommit_memory;
 		/*
 		 * On a machine this small we won't get
@@ -738,16 +621,12 @@ void free_initmem(void)
 	extern char __tcm_start, __tcm_end;
 
 	poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start);
-	totalram_pages += free_area(__phys_to_pfn(__pa(&__tcm_start)),
-				    __phys_to_pfn(__pa(&__tcm_end)),
-				    "TCM link");
+	free_reserved_area(&__tcm_start, &__tcm_end, -1, "TCM link");
 #endif
 
 	poison_init_mem(__init_begin, __init_end - __init_begin);
 	if (!machine_is_integrator() && !machine_is_cintegrator())
-		totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)),
-					    __phys_to_pfn(__pa(__init_end)),
-					    "init");
+		free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -758,9 +637,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd) {
 		poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
-		totalram_pages += free_area(__phys_to_pfn(__pa(start)),
-					    __phys_to_pfn(__pa(end)),
-					    "initrd");
+		free_reserved_area((void *)start, (void *)end, -1, "initrd");
 	}
 }
 
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 88fd86cf3d9..d1e5ad7ab3b 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -39,6 +39,70 @@
 #include <asm/mach/pci.h>
 #include "mm.h"
 
+
+LIST_HEAD(static_vmlist);
+
+static struct static_vm *find_static_vm_paddr(phys_addr_t paddr,
+			size_t size, unsigned int mtype)
+{
+	struct static_vm *svm;
+	struct vm_struct *vm;
+
+	list_for_each_entry(svm, &static_vmlist, list) {
+		vm = &svm->vm;
+		if (!(vm->flags & VM_ARM_STATIC_MAPPING))
+			continue;
+		if ((vm->flags & VM_ARM_MTYPE_MASK) != VM_ARM_MTYPE(mtype))
+			continue;
+
+		if (vm->phys_addr > paddr ||
+			paddr + size - 1 > vm->phys_addr + vm->size - 1)
+			continue;
+
+		return svm;
+	}
+
+	return NULL;
+}
+
+struct static_vm *find_static_vm_vaddr(void *vaddr)
+{
+	struct static_vm *svm;
+	struct vm_struct *vm;
+
+	list_for_each_entry(svm, &static_vmlist, list) {
+		vm = &svm->vm;
+
+		/* static_vmlist is ascending order */
+		if (vm->addr > vaddr)
+			break;
+
+		if (vm->addr <= vaddr && vm->addr + vm->size > vaddr)
+			return svm;
+	}
+
+	return NULL;
+}
+
+void __init add_static_vm_early(struct static_vm *svm)
+{
+	struct static_vm *curr_svm;
+	struct vm_struct *vm;
+	void *vaddr;
+
+	vm = &svm->vm;
+	vm_area_add_early(vm);
+	vaddr = vm->addr;
+
+	list_for_each_entry(curr_svm, &static_vmlist, list) {
+		vm = &curr_svm->vm;
+
+		if (vm->addr > vaddr)
+			break;
+	}
+	list_add_tail(&svm->list, &curr_svm->list);
+}
+
 int ioremap_page(unsigned long virt, unsigned long phys,
 		 const struct mem_type *mtype)
 {
@@ -197,13 +261,14 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
 	const struct mem_type *type;
 	int err;
 	unsigned long addr;
- 	struct vm_struct * area;
+	struct vm_struct *area;
+	phys_addr_t paddr = __pfn_to_phys(pfn);
 
 #ifndef CONFIG_ARM_LPAE
 	/*
 	 * High mappings must be supersection aligned
 	 */
-	if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SUPERSECTION_MASK))
+	if (pfn >= 0x100000 && (paddr & ~SUPERSECTION_MASK))
 		return NULL;
 #endif
 
@@ -219,24 +284,16 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
 	/*
 	 * Try to reuse one of the static mapping whenever possible.
 	 */
-	read_lock(&vmlist_lock);
-	for (area = vmlist; area; area = area->next) {
-		if (!size || (sizeof(phys_addr_t) == 4 && pfn >= 0x100000))
-			break;
-		if (!(area->flags & VM_ARM_STATIC_MAPPING))
-			continue;
-		if ((area->flags & VM_ARM_MTYPE_MASK) != VM_ARM_MTYPE(mtype))
-			continue;
-		if (__phys_to_pfn(area->phys_addr) > pfn ||
-		    __pfn_to_phys(pfn) + size-1 > area->phys_addr + area->size-1)
-			continue;
-		/* we can drop the lock here as we know *area is static */
-		read_unlock(&vmlist_lock);
-		addr = (unsigned long)area->addr;
-		addr += __pfn_to_phys(pfn) - area->phys_addr;
-		return (void __iomem *) (offset + addr);
+	if (size && !(sizeof(phys_addr_t) == 4 && pfn >= 0x100000)) {
+		struct static_vm *svm;
+
+		svm = find_static_vm_paddr(paddr, size, mtype);
+		if (svm) {
+			addr = (unsigned long)svm->vm.addr;
+			addr += paddr - svm->vm.phys_addr;
+			return (void __iomem *) (offset + addr);
+		}
 	}
-	read_unlock(&vmlist_lock);
 
 	/*
 	 * Don't allow RAM to be mapped - this causes problems with ARMv6+
@@ -248,21 +305,21 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
  	if (!area)
  		return NULL;
  	addr = (unsigned long)area->addr;
-	area->phys_addr = __pfn_to_phys(pfn);
+	area->phys_addr = paddr;
 
 #if !defined(CONFIG_SMP) && !defined(CONFIG_ARM_LPAE)
 	if (DOMAIN_IO == 0 &&
 	    (((cpu_architecture() >= CPU_ARCH_ARMv6) && (get_cr() & CR_XP)) ||
 	       cpu_is_xsc3()) && pfn >= 0x100000 &&
-	       !((__pfn_to_phys(pfn) | size | addr) & ~SUPERSECTION_MASK)) {
+	       !((paddr | size | addr) & ~SUPERSECTION_MASK)) {
 		area->flags |= VM_ARM_SECTION_MAPPING;
 		err = remap_area_supersections(addr, pfn, size, type);
-	} else if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) {
+	} else if (!((paddr | size | addr) & ~PMD_MASK)) {
 		area->flags |= VM_ARM_SECTION_MAPPING;
 		err = remap_area_sections(addr, pfn, size, type);
 	} else
 #endif
-		err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn),
+		err = ioremap_page_range(addr, addr + size, paddr,
 					 __pgprot(type->prot_pte));
 
 	if (err) {
@@ -274,10 +331,10 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
 	return (void __iomem *) (offset + addr);
 }
 
-void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
+void __iomem *__arm_ioremap_caller(phys_addr_t phys_addr, size_t size,
 	unsigned int mtype, void *caller)
 {
-	unsigned long last_addr;
+	phys_addr_t last_addr;
  	unsigned long offset = phys_addr & ~PAGE_MASK;
  	unsigned long pfn = __phys_to_pfn(phys_addr);
 
@@ -310,12 +367,12 @@ __arm_ioremap_pfn(unsigned long pfn, unsigned long offset, size_t size,
 }
 EXPORT_SYMBOL(__arm_ioremap_pfn);
 
-void __iomem * (*arch_ioremap_caller)(unsigned long, size_t,
+void __iomem * (*arch_ioremap_caller)(phys_addr_t, size_t,
 				      unsigned int, void *) =
 	__arm_ioremap_caller;
 
 void __iomem *
-__arm_ioremap(unsigned long phys_addr, size_t size, unsigned int mtype)
+__arm_ioremap(phys_addr_t phys_addr, size_t size, unsigned int mtype)
 {
 	return arch_ioremap_caller(phys_addr, size, mtype,
 		__builtin_return_address(0));
@@ -330,14 +387,14 @@ EXPORT_SYMBOL(__arm_ioremap);
  * CONFIG_GENERIC_ALLOCATOR for allocating external memory.
  */
 void __iomem *
-__arm_ioremap_exec(unsigned long phys_addr, size_t size, bool cached)
+__arm_ioremap_exec(phys_addr_t phys_addr, size_t size, bool cached)
 {
 	unsigned int mtype;
 
 	if (cached)
-		mtype = MT_MEMORY;
+		mtype = MT_MEMORY_RWX;
 	else
-		mtype = MT_MEMORY_NONCACHED;
+		mtype = MT_MEMORY_RWX_NONCACHED;
 
 	return __arm_ioremap_caller(phys_addr, size, mtype,
 			__builtin_return_address(0));
@@ -346,34 +403,28 @@ __arm_ioremap_exec(unsigned long phys_addr, size_t size, bool cached)
 void __iounmap(volatile void __iomem *io_addr)
 {
 	void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
-	struct vm_struct *vm;
+	struct static_vm *svm;
+
+	/* If this is a static mapping, we must leave it alone */
+	svm = find_static_vm_vaddr(addr);
+	if (svm)
+		return;
 
-	read_lock(&vmlist_lock);
-	for (vm = vmlist; vm; vm = vm->next) {
-		if (vm->addr > addr)
-			break;
-		if (!(vm->flags & VM_IOREMAP))
-			continue;
-		/* If this is a static mapping we must leave it alone */
-		if ((vm->flags & VM_ARM_STATIC_MAPPING) &&
-		    (vm->addr <= addr) && (vm->addr + vm->size > addr)) {
-			read_unlock(&vmlist_lock);
-			return;
-		}
 #if !defined(CONFIG_SMP) && !defined(CONFIG_ARM_LPAE)
+	{
+		struct vm_struct *vm;
+
+		vm = find_vm_area(addr);
+
 		/*
 		 * If this is a section based mapping we need to handle it
 		 * specially as the VM subsystem does not know how to handle
 		 * such a beast.
 		 */
-		if ((vm->addr == addr) &&
-		    (vm->flags & VM_ARM_SECTION_MAPPING)) {
+		if (vm && (vm->flags & VM_ARM_SECTION_MAPPING))
 			unmap_area_sections((unsigned long)vm->addr, vm->size);
-			break;
-		}
-#endif
 	}
-	read_unlock(&vmlist_lock);
+#endif
 
 	vunmap(addr);
 }
@@ -387,6 +438,13 @@ void __arm_iounmap(volatile void __iomem *io_addr)
 EXPORT_SYMBOL(__arm_iounmap);
 
 #ifdef CONFIG_PCI
+static int pci_ioremap_mem_type = MT_DEVICE;
+
+void pci_ioremap_set_mem_type(int mem_type)
+{
+	pci_ioremap_mem_type = mem_type;
+}
+
 int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr)
 {
 	BUG_ON(offset + SZ_64K > IO_SPACE_LIMIT);
@@ -394,7 +452,7 @@ int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr)
 	return ioremap_page_range(PCI_IO_VIRT_BASE + offset,
 				  PCI_IO_VIRT_BASE + offset + SZ_64K,
 				  phys_addr,
-				  __pgprot(get_mem_type(MT_DEVICE)->prot_pte));
+				  __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte));
 }
 EXPORT_SYMBOL_GPL(pci_ioremap_io);
 #endif
diff --git a/arch/arm/mm/l2c-common.c b/arch/arm/mm/l2c-common.c
new file mode 100644
index 00000000000..10a3cf28c36
--- /dev/null
+++ b/arch/arm/mm/l2c-common.c
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2010 ARM Ltd.
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/bug.h>
+#include <linux/smp.h>
+#include <asm/outercache.h>
+
+void outer_disable(void)
+{
+	WARN_ON(!irqs_disabled());
+	WARN_ON(num_online_cpus() > 1);
+
+	if (outer_cache.disable)
+		outer_cache.disable();
+}
diff --git a/arch/arm/mm/l2c-l2x0-resume.S b/arch/arm/mm/l2c-l2x0-resume.S
new file mode 100644
index 00000000000..99b05f21a59
--- /dev/null
+++ b/arch/arm/mm/l2c-l2x0-resume.S
@@ -0,0 +1,58 @@
+/*
+ * L2C-310 early resume code.  This can be used by platforms to restore
+ * the settings of their L2 cache controller before restoring the
+ * processor state.
+ *
+ * This code can only be used to if you are running in the secure world.
+ */
+#include <linux/linkage.h>
+#include <asm/hardware/cache-l2x0.h>
+
+	.text
+
+ENTRY(l2c310_early_resume)
+	adr	r0, 1f
+	ldr	r2, [r0]
+	add	r0, r2, r0
+
+	ldmia	r0, {r1, r2, r3, r4, r5, r6, r7, r8}
+	@ r1 = phys address of L2C-310 controller
+	@ r2 = aux_ctrl
+	@ r3 = tag_latency
+	@ r4 = data_latency
+	@ r5 = filter_start
+	@ r6 = filter_end
+	@ r7 = prefetch_ctrl
+	@ r8 = pwr_ctrl
+
+	@ Check that the address has been initialised
+	teq	r1, #0
+	moveq	pc, lr
+
+	@ The prefetch and power control registers are revision dependent
+	@ and can be written whether or not the L2 cache is enabled
+	ldr	r0, [r1, #L2X0_CACHE_ID]
+	and	r0, r0, #L2X0_CACHE_ID_RTL_MASK
+	cmp	r0, #L310_CACHE_ID_RTL_R2P0
+	strcs	r7, [r1, #L310_PREFETCH_CTRL]
+	cmp	r0, #L310_CACHE_ID_RTL_R3P0
+	strcs	r8, [r1, #L310_POWER_CTRL]
+
+	@ Don't setup the L2 cache if it is already enabled
+	ldr	r0, [r1, #L2X0_CTRL]
+	tst	r0, #L2X0_CTRL_EN
+	movne	pc, lr
+
+	str	r3, [r1, #L310_TAG_LATENCY_CTRL]
+	str	r4, [r1, #L310_DATA_LATENCY_CTRL]
+	str	r6, [r1, #L310_ADDR_FILTER_END]
+	str	r5, [r1, #L310_ADDR_FILTER_START]
+
+	str	r2, [r1, #L2X0_AUX_CTRL]
+	mov	r9, #L2X0_CTRL_EN
+	str	r9, [r1, #L2X0_CTRL]
+	mov	pc, lr
+ENDPROC(l2c310_early_resume)
+
+	.align
+1:	.long	l2x0_saved_regs - .
diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h
index a8ee92da354..ce727d47275 100644
--- a/arch/arm/mm/mm.h
+++ b/arch/arm/mm/mm.h
@@ -1,4 +1,8 @@
 #ifdef CONFIG_MMU
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+
+#include <asm/pgtable.h>
 
 /* the upper-most page table pointer */
 extern pmd_t *top_pmd;
@@ -36,6 +40,7 @@ static inline pmd_t *pmd_off_k(unsigned long virt)
 
 struct mem_type {
 	pteval_t prot_pte;
+	pteval_t prot_pte_s2;
 	pmdval_t prot_l1;
 	pmdval_t prot_sect;
 	unsigned int domain;
@@ -65,12 +70,24 @@ extern void __flush_dcache_page(struct address_space *mapping, struct page *page
 /* consistent regions used by dma_alloc_attrs() */
 #define VM_ARM_DMA_CONSISTENT	0x20000000
 
+
+struct static_vm {
+	struct vm_struct vm;
+	struct list_head list;
+};
+
+extern struct list_head static_vmlist;
+extern struct static_vm *find_static_vm_vaddr(void *vaddr);
+extern __init void add_static_vm_early(struct static_vm *svm);
+
 #endif
 
 #ifdef CONFIG_ZONE_DMA
 extern phys_addr_t arm_dma_limit;
+extern unsigned long arm_dma_pfn_limit;
 #else
 #define arm_dma_limit ((phys_addr_t)~0)
+#define arm_dma_pfn_limit (~0ul >> PAGE_SHIFT)
 #endif
 
 extern phys_addr_t arm_lowmem_limit;
@@ -78,3 +95,5 @@ extern phys_addr_t arm_lowmem_limit;
 void __init bootmem_init(void);
 void arm_mm_memblock_reserve(void);
 void dma_contiguous_remap(void);
+
+unsigned long __clear_cr(unsigned long mask);
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 10062ceadd1..5e85ed37136 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -146,7 +146,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
-	info.low_limit = PAGE_SIZE;
+	info.low_limit = FIRST_USER_ADDRESS;
 	info.high_limit = mm->mmap_base;
 	info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
@@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base(random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
 
@@ -204,13 +202,11 @@ int valid_phys_addr_range(phys_addr_t addr, size_t size)
 }
 
 /*
- * We don't use supersection mappings for mmap() on /dev/mem, which
- * means that we can't map the memory area above the 4G barrier into
- * userspace.
+ * Do not allow /dev/mem mappings beyond the supported physical range.
  */
 int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 {
-	return !(pfn + (size >> PAGE_SHIFT) > 0x00100000);
+	return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index ce328c7f5c9..6e3ba8d112a 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -22,18 +22,23 @@
 #include <asm/cputype.h>
 #include <asm/sections.h>
 #include <asm/cachetype.h>
+#include <asm/sections.h>
 #include <asm/setup.h>
 #include <asm/smp_plat.h>
 #include <asm/tlb.h>
 #include <asm/highmem.h>
 #include <asm/system_info.h>
 #include <asm/traps.h>
+#include <asm/procinfo.h>
+#include <asm/memory.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
 #include <asm/mach/pci.h>
+#include <asm/fixmap.h>
 
 #include "mm.h"
+#include "tcm.h"
 
 /*
  * empty_zero_page is a special page that is used for
@@ -57,6 +62,9 @@ static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
 static unsigned int ecc_mask __initdata = 0;
 pgprot_t pgprot_user;
 pgprot_t pgprot_kernel;
+pgprot_t pgprot_hyp_device;
+pgprot_t pgprot_s2;
+pgprot_t pgprot_s2_device;
 
 EXPORT_SYMBOL(pgprot_user);
 EXPORT_SYMBOL(pgprot_kernel);
@@ -66,59 +74,98 @@ struct cachepolicy {
 	unsigned int	cr_mask;
 	pmdval_t	pmd;
 	pteval_t	pte;
+	pteval_t	pte_s2;
 };
 
+#ifdef CONFIG_ARM_LPAE
+#define s2_policy(policy)	policy
+#else
+#define s2_policy(policy)	0
+#endif
+
 static struct cachepolicy cache_policies[] __initdata = {
 	{
 		.policy		= "uncached",
 		.cr_mask	= CR_W|CR_C,
 		.pmd		= PMD_SECT_UNCACHED,
 		.pte		= L_PTE_MT_UNCACHED,
+		.pte_s2		= s2_policy(L_PTE_S2_MT_UNCACHED),
 	}, {
 		.policy		= "buffered",
 		.cr_mask	= CR_C,
 		.pmd		= PMD_SECT_BUFFERED,
 		.pte		= L_PTE_MT_BUFFERABLE,
+		.pte_s2		= s2_policy(L_PTE_S2_MT_UNCACHED),
 	}, {
 		.policy		= "writethrough",
 		.cr_mask	= 0,
 		.pmd		= PMD_SECT_WT,
 		.pte		= L_PTE_MT_WRITETHROUGH,
+		.pte_s2		= s2_policy(L_PTE_S2_MT_WRITETHROUGH),
 	}, {
 		.policy		= "writeback",
 		.cr_mask	= 0,
 		.pmd		= PMD_SECT_WB,
 		.pte		= L_PTE_MT_WRITEBACK,
+		.pte_s2		= s2_policy(L_PTE_S2_MT_WRITEBACK),
 	}, {
 		.policy		= "writealloc",
 		.cr_mask	= 0,
 		.pmd		= PMD_SECT_WBWA,
 		.pte		= L_PTE_MT_WRITEALLOC,
+		.pte_s2		= s2_policy(L_PTE_S2_MT_WRITEBACK),
 	}
 };
 
+#ifdef CONFIG_CPU_CP15
+static unsigned long initial_pmd_value __initdata = 0;
+
 /*
- * These are useful for identifying cache coherency
- * problems by allowing the cache or the cache and
- * writebuffer to be turned off.  (Note: the write
- * buffer should not be on and the cache off).
+ * Initialise the cache_policy variable with the initial state specified
+ * via the "pmd" value.  This is used to ensure that on ARMv6 and later,
+ * the C code sets the page tables up with the same policy as the head
+ * assembly code, which avoids an illegal state where the TLBs can get
+ * confused.  See comments in early_cachepolicy() for more information.
  */
-static int __init early_cachepolicy(char *p)
+void __init init_default_cache_policy(unsigned long pmd)
 {
 	int i;
 
+	initial_pmd_value = pmd;
+
+	pmd &= PMD_SECT_TEX(1) | PMD_SECT_BUFFERABLE | PMD_SECT_CACHEABLE;
+
+	for (i = 0; i < ARRAY_SIZE(cache_policies); i++)
+		if (cache_policies[i].pmd == pmd) {
+			cachepolicy = i;
+			break;
+		}
+
+	if (i == ARRAY_SIZE(cache_policies))
+		pr_err("ERROR: could not find cache policy\n");
+}
+
+/*
+ * These are useful for identifying cache coherency problems by allowing
+ * the cache or the cache and writebuffer to be turned off.  (Note: the
+ * write buffer should not be on and the cache off).
+ */
+static int __init early_cachepolicy(char *p)
+{
+	int i, selected = -1;
+
 	for (i = 0; i < ARRAY_SIZE(cache_policies); i++) {
 		int len = strlen(cache_policies[i].policy);
 
 		if (memcmp(p, cache_policies[i].policy, len) == 0) {
-			cachepolicy = i;
-			cr_alignment &= ~cache_policies[i].cr_mask;
-			cr_no_alignment &= ~cache_policies[i].cr_mask;
+			selected = i;
 			break;
 		}
 	}
-	if (i == ARRAY_SIZE(cache_policies))
-		printk(KERN_ERR "ERROR: unknown or unsupported cache policy\n");
+
+	if (selected == -1)
+		pr_err("ERROR: unknown or unsupported cache policy\n");
+
 	/*
 	 * This restriction is partly to do with the way we boot; it is
 	 * unpredictable to have memory mapped using two different sets of
@@ -126,12 +173,18 @@ static int __init early_cachepolicy(char *p)
 	 * change these attributes once the initial assembly has setup the
 	 * page tables.
 	 */
-	if (cpu_architecture() >= CPU_ARCH_ARMv6) {
-		printk(KERN_WARNING "Only cachepolicy=writeback supported on ARMv6 and later\n");
-		cachepolicy = CPOLICY_WRITEBACK;
+	if (cpu_architecture() >= CPU_ARCH_ARMv6 && selected != cachepolicy) {
+		pr_warn("Only cachepolicy=%s supported on ARMv6 and later\n",
+			cache_policies[cachepolicy].policy);
+		return 0;
+	}
+
+	if (selected != cachepolicy) {
+		unsigned long cr = __clear_cr(cache_policies[selected].cr_mask);
+		cachepolicy = selected;
+		flush_cache_all();
+		set_cr(cr);
 	}
-	flush_cache_all();
-	set_cr(cr_alignment);
 	return 0;
 }
 early_param("cachepolicy", early_cachepolicy);
@@ -166,42 +219,33 @@ static int __init early_ecc(char *p)
 early_param("ecc", early_ecc);
 #endif
 
-static int __init noalign_setup(char *__unused)
+#else /* ifdef CONFIG_CPU_CP15 */
+
+static int __init early_cachepolicy(char *p)
 {
-	cr_alignment &= ~CR_A;
-	cr_no_alignment &= ~CR_A;
-	set_cr(cr_alignment);
-	return 1;
+	pr_warning("cachepolicy kernel parameter not supported without cp15\n");
 }
-__setup("noalign", noalign_setup);
+early_param("cachepolicy", early_cachepolicy);
 
-#ifndef CONFIG_SMP
-void adjust_cr(unsigned long mask, unsigned long set)
+static int __init noalign_setup(char *__unused)
 {
-	unsigned long flags;
-
-	mask &= ~CR_A;
-
-	set &= mask;
-
-	local_irq_save(flags);
-
-	cr_no_alignment = (cr_no_alignment & ~mask) | set;
-	cr_alignment = (cr_alignment & ~mask) | set;
-
-	set_cr((get_cr() & ~mask) | set);
-
-	local_irq_restore(flags);
+	pr_warning("noalign kernel parameter not supported without cp15\n");
 }
-#endif
+__setup("noalign", noalign_setup);
+
+#endif /* ifdef CONFIG_CPU_CP15 / else */
 
 #define PROT_PTE_DEVICE		L_PTE_PRESENT|L_PTE_YOUNG|L_PTE_DIRTY|L_PTE_XN
+#define PROT_PTE_S2_DEVICE	PROT_PTE_DEVICE
 #define PROT_SECT_DEVICE	PMD_TYPE_SECT|PMD_SECT_AP_WRITE
 
 static struct mem_type mem_types[] = {
 	[MT_DEVICE] = {		  /* Strongly ordered / ARMv6 shared device */
 		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
 				  L_PTE_SHARED,
+		.prot_pte_s2	= s2_policy(PROT_PTE_S2_DEVICE) |
+				  s2_policy(L_PTE_S2_MT_DEV_SHARED) |
+				  L_PTE_SHARED,
 		.prot_l1	= PMD_TYPE_TABLE,
 		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_S,
 		.domain		= DOMAIN_IO,
@@ -252,36 +296,43 @@ static struct mem_type mem_types[] = {
 		.prot_l1   = PMD_TYPE_TABLE,
 		.domain    = DOMAIN_USER,
 	},
-	[MT_MEMORY] = {
+	[MT_MEMORY_RWX] = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
 		.prot_l1   = PMD_TYPE_TABLE,
 		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
 		.domain    = DOMAIN_KERNEL,
 	},
+	[MT_MEMORY_RW] = {
+		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
+			     L_PTE_XN,
+		.prot_l1   = PMD_TYPE_TABLE,
+		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
+		.domain    = DOMAIN_KERNEL,
+	},
 	[MT_ROM] = {
 		.prot_sect = PMD_TYPE_SECT,
 		.domain    = DOMAIN_KERNEL,
 	},
-	[MT_MEMORY_NONCACHED] = {
+	[MT_MEMORY_RWX_NONCACHED] = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 				L_PTE_MT_BUFFERABLE,
 		.prot_l1   = PMD_TYPE_TABLE,
 		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
 		.domain    = DOMAIN_KERNEL,
 	},
-	[MT_MEMORY_DTCM] = {
+	[MT_MEMORY_RW_DTCM] = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 				L_PTE_XN,
 		.prot_l1   = PMD_TYPE_TABLE,
 		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
 		.domain    = DOMAIN_KERNEL,
 	},
-	[MT_MEMORY_ITCM] = {
+	[MT_MEMORY_RWX_ITCM] = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
 		.prot_l1   = PMD_TYPE_TABLE,
 		.domain    = DOMAIN_KERNEL,
 	},
-	[MT_MEMORY_SO] = {
+	[MT_MEMORY_RW_SO] = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 				L_PTE_MT_UNCACHED | L_PTE_XN,
 		.prot_l1   = PMD_TYPE_TABLE,
@@ -290,7 +341,8 @@ static struct mem_type mem_types[] = {
 		.domain    = DOMAIN_KERNEL,
 	},
 	[MT_MEMORY_DMA_READY] = {
-		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
+		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
+				L_PTE_XN,
 		.prot_l1   = PMD_TYPE_TABLE,
 		.domain    = DOMAIN_KERNEL,
 	},
@@ -302,6 +354,44 @@ const struct mem_type *get_mem_type(unsigned int type)
 }
 EXPORT_SYMBOL(get_mem_type);
 
+#define PTE_SET_FN(_name, pteop) \
+static int pte_set_##_name(pte_t *ptep, pgtable_t token, unsigned long addr, \
+			void *data) \
+{ \
+	pte_t pte = pteop(*ptep); \
+\
+	set_pte_ext(ptep, pte, 0); \
+	return 0; \
+} \
+
+#define SET_MEMORY_FN(_name, callback) \
+int set_memory_##_name(unsigned long addr, int numpages) \
+{ \
+	unsigned long start = addr; \
+	unsigned long size = PAGE_SIZE*numpages; \
+	unsigned end = start + size; \
+\
+	if (start < MODULES_VADDR || start >= MODULES_END) \
+		return -EINVAL;\
+\
+	if (end < MODULES_VADDR || end >= MODULES_END) \
+		return -EINVAL; \
+\
+	apply_to_page_range(&init_mm, start, size, callback, NULL); \
+	flush_tlb_kernel_range(start, end); \
+	return 0;\
+}
+
+PTE_SET_FN(ro, pte_wrprotect)
+PTE_SET_FN(rw, pte_mkwrite)
+PTE_SET_FN(x, pte_mkexec)
+PTE_SET_FN(nx, pte_mknexec)
+
+SET_MEMORY_FN(ro, pte_set_ro)
+SET_MEMORY_FN(rw, pte_set_rw)
+SET_MEMORY_FN(x, pte_set_x)
+SET_MEMORY_FN(nx, pte_set_nx)
+
 /*
  * Adjust the PMD section entries according to the CPU in use.
  */
@@ -310,6 +400,7 @@ static void __init build_mem_type_table(void)
 	struct cachepolicy *cp;
 	unsigned int cr = get_cr();
 	pteval_t user_pgprot, kern_pgprot, vecs_pgprot;
+	pteval_t hyp_device_pgprot, s2_pgprot, s2_device_pgprot;
 	int cpu_arch = cpu_architecture();
 	int i;
 
@@ -327,8 +418,17 @@ static void __init build_mem_type_table(void)
 			cachepolicy = CPOLICY_WRITEBACK;
 		ecc_mask = 0;
 	}
-	if (is_smp())
-		cachepolicy = CPOLICY_WRITEALLOC;
+
+	if (is_smp()) {
+		if (cachepolicy != CPOLICY_WRITEALLOC) {
+			pr_warn("Forcing write-allocate cache policy for SMP\n");
+			cachepolicy = CPOLICY_WRITEALLOC;
+		}
+		if (!(initial_pmd_value & PMD_SECT_S)) {
+			pr_warn("Forcing shared mappings for SMP\n");
+			initial_pmd_value |= PMD_SECT_S;
+		}
+	}
 
 	/*
 	 * Strip out features not present on earlier architectures.
@@ -374,6 +474,9 @@ static void __init build_mem_type_table(void)
 			mem_types[MT_DEVICE_NONSHARED].prot_sect |= PMD_SECT_XN;
 			mem_types[MT_DEVICE_CACHED].prot_sect |= PMD_SECT_XN;
 			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_XN;
+
+			/* Also setup NX memory mapping */
+			mem_types[MT_MEMORY_RW].prot_sect |= PMD_SECT_XN;
 		}
 		if (cpu_arch >= CPU_ARCH_ARMv7 && (cr & CR_TRE)) {
 			/*
@@ -421,6 +524,19 @@ static void __init build_mem_type_table(void)
 	 */
 	cp = &cache_policies[cachepolicy];
 	vecs_pgprot = kern_pgprot = user_pgprot = cp->pte;
+	s2_pgprot = cp->pte_s2;
+	hyp_device_pgprot = mem_types[MT_DEVICE].prot_pte;
+	s2_device_pgprot = mem_types[MT_DEVICE].prot_pte_s2;
+
+	/*
+	 * We don't use domains on ARMv6 (since this causes problems with
+	 * v6/v7 kernels), so we must use a separate memory type for user
+	 * r/o, kernel r/w to map the vectors page.
+	 */
+#ifndef CONFIG_ARM_LPAE
+	if (cpu_arch == CPU_ARCH_ARMv6)
+		vecs_pgprot |= L_PTE_MT_VECTORS;
+#endif
 
 	/*
 	 * ARMv6 and above have extended page tables.
@@ -436,23 +552,27 @@ static void __init build_mem_type_table(void)
 		mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
 #endif
 
-		if (is_smp()) {
-			/*
-			 * Mark memory with the "shared" attribute
-			 * for SMP systems
-			 */
+		/*
+		 * If the initial page tables were created with the S bit
+		 * set, then we need to do the same here for the same
+		 * reasons given in early_cachepolicy().
+		 */
+		if (initial_pmd_value & PMD_SECT_S) {
 			user_pgprot |= L_PTE_SHARED;
 			kern_pgprot |= L_PTE_SHARED;
 			vecs_pgprot |= L_PTE_SHARED;
+			s2_pgprot |= L_PTE_SHARED;
 			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_S;
 			mem_types[MT_DEVICE_WC].prot_pte |= L_PTE_SHARED;
 			mem_types[MT_DEVICE_CACHED].prot_sect |= PMD_SECT_S;
 			mem_types[MT_DEVICE_CACHED].prot_pte |= L_PTE_SHARED;
-			mem_types[MT_MEMORY].prot_sect |= PMD_SECT_S;
-			mem_types[MT_MEMORY].prot_pte |= L_PTE_SHARED;
+			mem_types[MT_MEMORY_RWX].prot_sect |= PMD_SECT_S;
+			mem_types[MT_MEMORY_RWX].prot_pte |= L_PTE_SHARED;
+			mem_types[MT_MEMORY_RW].prot_sect |= PMD_SECT_S;
+			mem_types[MT_MEMORY_RW].prot_pte |= L_PTE_SHARED;
 			mem_types[MT_MEMORY_DMA_READY].prot_pte |= L_PTE_SHARED;
-			mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_S;
-			mem_types[MT_MEMORY_NONCACHED].prot_pte |= L_PTE_SHARED;
+			mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |= PMD_SECT_S;
+			mem_types[MT_MEMORY_RWX_NONCACHED].prot_pte |= L_PTE_SHARED;
 		}
 	}
 
@@ -463,15 +583,15 @@ static void __init build_mem_type_table(void)
 	if (cpu_arch >= CPU_ARCH_ARMv6) {
 		if (cpu_arch >= CPU_ARCH_ARMv7 && (cr & CR_TRE)) {
 			/* Non-cacheable Normal is XCB = 001 */
-			mem_types[MT_MEMORY_NONCACHED].prot_sect |=
+			mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |=
 				PMD_SECT_BUFFERED;
 		} else {
 			/* For both ARMv6 and non-TEX-remapping ARMv7 */
-			mem_types[MT_MEMORY_NONCACHED].prot_sect |=
+			mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |=
 				PMD_SECT_TEX(1);
 		}
 	} else {
-		mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_BUFFERABLE;
+		mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |= PMD_SECT_BUFFERABLE;
 	}
 
 #ifdef CONFIG_ARM_LPAE
@@ -498,13 +618,18 @@ static void __init build_mem_type_table(void)
 	pgprot_user   = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot);
 	pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
 				 L_PTE_DIRTY | kern_pgprot);
+	pgprot_s2  = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | s2_pgprot);
+	pgprot_s2_device  = __pgprot(s2_device_pgprot);
+	pgprot_hyp_device  = __pgprot(hyp_device_pgprot);
 
 	mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
 	mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;
-	mem_types[MT_MEMORY].prot_sect |= ecc_mask | cp->pmd;
-	mem_types[MT_MEMORY].prot_pte |= kern_pgprot;
+	mem_types[MT_MEMORY_RWX].prot_sect |= ecc_mask | cp->pmd;
+	mem_types[MT_MEMORY_RWX].prot_pte |= kern_pgprot;
+	mem_types[MT_MEMORY_RW].prot_sect |= ecc_mask | cp->pmd;
+	mem_types[MT_MEMORY_RW].prot_pte |= kern_pgprot;
 	mem_types[MT_MEMORY_DMA_READY].prot_pte |= kern_pgprot;
-	mem_types[MT_MEMORY_NONCACHED].prot_sect |= ecc_mask;
+	mem_types[MT_MEMORY_RWX_NONCACHED].prot_sect |= ecc_mask;
 	mem_types[MT_ROM].prot_sect |= cp->pmd;
 
 	switch (cp->pmd) {
@@ -516,8 +641,8 @@ static void __init build_mem_type_table(void)
 		mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WB;
 		break;
 	}
-	printk("Memory policy: ECC %sabled, Data cache %s\n",
-		ecc_mask ? "en" : "dis", cp->policy);
+	pr_info("Memory policy: %sData cache %s\n",
+		ecc_mask ? "ECC enabled, " : "", cp->policy);
 
 	for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
 		struct mem_type *t = &mem_types[i];
@@ -576,50 +701,74 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static void __init alloc_init_section(pud_t *pud, unsigned long addr,
-				      unsigned long end, phys_addr_t phys,
-				      const struct mem_type *type)
+static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
+			unsigned long end, phys_addr_t phys,
+			const struct mem_type *type)
 {
-	pmd_t *pmd = pmd_offset(pud, addr);
+	pmd_t *p = pmd;
 
+#ifndef CONFIG_ARM_LPAE
 	/*
-	 * Try a section mapping - end, addr and phys must all be aligned
-	 * to a section boundary.  Note that PMDs refer to the individual
-	 * L1 entries, whereas PGDs refer to a group of L1 entries making
-	 * up one logical pointer to an L2 table.
+	 * In classic MMU format, puds and pmds are folded in to
+	 * the pgds. pmd_offset gives the PGD entry. PGDs refer to a
+	 * group of L1 entries making up one logical pointer to
+	 * an L2 table (2MB), where as PMDs refer to the individual
+	 * L1 entries (1MB). Hence increment to get the correct
+	 * offset for odd 1MB sections.
+	 * (See arch/arm/include/asm/pgtable-2level.h)
 	 */
-	if (type->prot_sect && ((addr | end | phys) & ~SECTION_MASK) == 0) {
-		pmd_t *p = pmd;
-
-#ifndef CONFIG_ARM_LPAE
-		if (addr & SECTION_SIZE)
-			pmd++;
+	if (addr & SECTION_SIZE)
+		pmd++;
 #endif
+	do {
+		*pmd = __pmd(phys | type->prot_sect);
+		phys += SECTION_SIZE;
+	} while (pmd++, addr += SECTION_SIZE, addr != end);
 
-		do {
-			*pmd = __pmd(phys | type->prot_sect);
-			phys += SECTION_SIZE;
-		} while (pmd++, addr += SECTION_SIZE, addr != end);
+	flush_pmd_entry(p);
+}
 
-		flush_pmd_entry(p);
-	} else {
+static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
+				      unsigned long end, phys_addr_t phys,
+				      const struct mem_type *type)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	unsigned long next;
+
+	do {
 		/*
-		 * No need to loop; pte's aren't interested in the
-		 * individual L1 entries.
+		 * With LPAE, we must loop over to map
+		 * all the pmds for the given range.
 		 */
-		alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
-	}
+		next = pmd_addr_end(addr, end);
+
+		/*
+		 * Try a section mapping - addr, next and phys must all be
+		 * aligned to a section boundary.
+		 */
+		if (type->prot_sect &&
+				((addr | next | phys) & ~SECTION_MASK) == 0) {
+			__map_init_section(pmd, addr, next, phys, type);
+		} else {
+			alloc_init_pte(pmd, addr, next,
+						__phys_to_pfn(phys), type);
+		}
+
+		phys += next - addr;
+
+	} while (pmd++, addr = next, addr != end);
 }
 
 static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
-	unsigned long end, unsigned long phys, const struct mem_type *type)
+				  unsigned long end, phys_addr_t phys,
+				  const struct mem_type *type)
 {
 	pud_t *pud = pud_offset(pgd, addr);
 	unsigned long next;
 
 	do {
 		next = pud_addr_end(addr, end);
-		alloc_init_section(pud, addr, next, phys, type);
+		alloc_init_pmd(pud, addr, next, phys, type);
 		phys += next - addr;
 	} while (pud++, addr = next, addr != end);
 }
@@ -757,21 +906,24 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
 {
 	struct map_desc *md;
 	struct vm_struct *vm;
+	struct static_vm *svm;
 
 	if (!nr)
 		return;
 
-	vm = early_alloc_aligned(sizeof(*vm) * nr, __alignof__(*vm));
+	svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));
 
 	for (md = io_desc; nr; md++, nr--) {
 		create_mapping(md);
+
+		vm = &svm->vm;
 		vm->addr = (void *)(md->virtual & PAGE_MASK);
 		vm->size = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
 		vm->phys_addr = __pfn_to_phys(md->pfn);
 		vm->flags = VM_IOREMAP | VM_ARM_STATIC_MAPPING;
 		vm->flags |= VM_ARM_MTYPE(md->type);
 		vm->caller = iotable_init;
-		vm_area_add_early(vm++);
+		add_static_vm_early(svm++);
 	}
 }
 
@@ -779,13 +931,16 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size,
 				  void *caller)
 {
 	struct vm_struct *vm;
+	struct static_vm *svm;
+
+	svm = early_alloc_aligned(sizeof(*svm), __alignof__(*svm));
 
-	vm = early_alloc_aligned(sizeof(*vm), __alignof__(*vm));
+	vm = &svm->vm;
 	vm->addr = (void *)addr;
 	vm->size = size;
 	vm->flags = VM_IOREMAP | VM_ARM_EMPTY_MAPPING;
 	vm->caller = caller;
-	vm_area_add_early(vm);
+	add_static_vm_early(svm);
 }
 
 #ifndef CONFIG_ARM_LPAE
@@ -810,14 +965,13 @@ static void __init pmd_empty_section_gap(unsigned long addr)
 
 static void __init fill_pmd_gaps(void)
 {
+	struct static_vm *svm;
 	struct vm_struct *vm;
 	unsigned long addr, next = 0;
 	pmd_t *pmd;
 
-	/* we're still single threaded hence no lock needed here */
-	for (vm = vmlist; vm; vm = vm->next) {
-		if (!(vm->flags & (VM_ARM_STATIC_MAPPING | VM_ARM_EMPTY_MAPPING)))
-			continue;
+	list_for_each_entry(svm, &static_vmlist, list) {
+		vm = &svm->vm;
 		addr = (unsigned long)vm->addr;
 		if (addr < next)
 			continue;
@@ -857,19 +1011,12 @@ static void __init fill_pmd_gaps(void)
 #if defined(CONFIG_PCI) && !defined(CONFIG_NEED_MACH_IO_H)
 static void __init pci_reserve_io(void)
 {
-	struct vm_struct *vm;
-	unsigned long addr;
+	struct static_vm *svm;
 
-	/* we're still single threaded hence no lock needed here */
-	for (vm = vmlist; vm; vm = vm->next) {
-		if (!(vm->flags & VM_ARM_STATIC_MAPPING))
-			continue;
-		addr = (unsigned long)vm->addr;
-		addr &= ~(SZ_2M - 1);
-		if (addr == PCI_IO_VIRT_BASE)
-			return;
+	svm = find_static_vm_vaddr((void *)PCI_IO_VIRT_BASE);
+	if (svm)
+		return;
 
-	}
 	vm_reserve_area_early(PCI_IO_VIRT_BASE, SZ_2M, pci_reserve_io);
 }
 #else
@@ -888,7 +1035,7 @@ void __init debug_ll_io_init(void)
 	map.virtual &= PAGE_MASK;
 	map.length = PAGE_SIZE;
 	map.type = MT_DEVICE;
-	create_mapping(&map);
+	iotable_init(&map, 1);
 }
 #endif
 
@@ -927,113 +1074,85 @@ phys_addr_t arm_lowmem_limit __initdata = 0;
 
 void __init sanity_check_meminfo(void)
 {
-	int i, j, highmem = 0;
+	phys_addr_t memblock_limit = 0;
+	int highmem = 0;
+	phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
+	struct memblock_region *reg;
 
-	for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
-		struct membank *bank = &meminfo.bank[j];
-		*bank = meminfo.bank[i];
+	for_each_memblock(memory, reg) {
+		phys_addr_t block_start = reg->base;
+		phys_addr_t block_end = reg->base + reg->size;
+		phys_addr_t size_limit = reg->size;
 
-		if (bank->start > ULONG_MAX)
+		if (reg->base >= vmalloc_limit)
 			highmem = 1;
+		else
+			size_limit = vmalloc_limit - reg->base;
 
-#ifdef CONFIG_HIGHMEM
-		if (__va(bank->start) >= vmalloc_min ||
-		    __va(bank->start) < (void *)PAGE_OFFSET)
-			highmem = 1;
 
-		bank->highmem = highmem;
+		if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
 
-		/*
-		 * Split those memory banks which are partially overlapping
-		 * the vmalloc area greatly simplifying things later.
-		 */
-		if (!highmem && __va(bank->start) < vmalloc_min &&
-		    bank->size > vmalloc_min - __va(bank->start)) {
-			if (meminfo.nr_banks >= NR_BANKS) {
-				printk(KERN_CRIT "NR_BANKS too low, "
-						 "ignoring high memory\n");
-			} else {
-				memmove(bank + 1, bank,
-					(meminfo.nr_banks - i) * sizeof(*bank));
-				meminfo.nr_banks++;
-				i++;
-				bank[1].size -= vmalloc_min - __va(bank->start);
-				bank[1].start = __pa(vmalloc_min - 1) + 1;
-				bank[1].highmem = highmem = 1;
-				j++;
+			if (highmem) {
+				pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
+					&block_start, &block_end);
+				memblock_remove(reg->base, reg->size);
+				continue;
 			}
-			bank->size = vmalloc_min - __va(bank->start);
-		}
-#else
-		bank->highmem = highmem;
 
-		/*
-		 * Highmem banks not allowed with !CONFIG_HIGHMEM.
-		 */
-		if (highmem) {
-			printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx "
-			       "(!CONFIG_HIGHMEM).\n",
-			       (unsigned long long)bank->start,
-			       (unsigned long long)bank->start + bank->size - 1);
-			continue;
-		}
-
-		/*
-		 * Check whether this memory bank would entirely overlap
-		 * the vmalloc area.
-		 */
-		if (__va(bank->start) >= vmalloc_min ||
-		    __va(bank->start) < (void *)PAGE_OFFSET) {
-			printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx "
-			       "(vmalloc region overlap).\n",
-			       (unsigned long long)bank->start,
-			       (unsigned long long)bank->start + bank->size - 1);
-			continue;
-		}
+			if (reg->size > size_limit) {
+				phys_addr_t overlap_size = reg->size - size_limit;
 
-		/*
-		 * Check whether this memory bank would partially overlap
-		 * the vmalloc area.
-		 */
-		if (__va(bank->start + bank->size - 1) >= vmalloc_min ||
-		    __va(bank->start + bank->size - 1) <= __va(bank->start)) {
-			unsigned long newsize = vmalloc_min - __va(bank->start);
-			printk(KERN_NOTICE "Truncating RAM at %.8llx-%.8llx "
-			       "to -%.8llx (vmalloc region overlap).\n",
-			       (unsigned long long)bank->start,
-			       (unsigned long long)bank->start + bank->size - 1,
-			       (unsigned long long)bank->start + newsize - 1);
-			bank->size = newsize;
+				pr_notice("Truncating RAM at %pa-%pa to -%pa",
+				      &block_start, &block_end, &vmalloc_limit);
+				memblock_remove(vmalloc_limit, overlap_size);
+				block_end = vmalloc_limit;
+			}
 		}
-#endif
-		if (!bank->highmem && bank->start + bank->size > arm_lowmem_limit)
-			arm_lowmem_limit = bank->start + bank->size;
 
-		j++;
-	}
-#ifdef CONFIG_HIGHMEM
-	if (highmem) {
-		const char *reason = NULL;
+		if (!highmem) {
+			if (block_end > arm_lowmem_limit) {
+				if (reg->size > size_limit)
+					arm_lowmem_limit = vmalloc_limit;
+				else
+					arm_lowmem_limit = block_end;
+			}
 
-		if (cache_is_vipt_aliasing()) {
 			/*
-			 * Interactions between kmap and other mappings
-			 * make highmem support with aliasing VIPT caches
-			 * rather difficult.
+			 * Find the first non-section-aligned page, and point
+			 * memblock_limit at it. This relies on rounding the
+			 * limit down to be section-aligned, which happens at
+			 * the end of this function.
+			 *
+			 * With this algorithm, the start or end of almost any
+			 * bank can be non-section-aligned. The only exception
+			 * is that the start of the bank 0 must be section-
+			 * aligned, since otherwise memory would need to be
+			 * allocated when mapping the start of bank 0, which
+			 * occurs before any free memory is mapped.
 			 */
-			reason = "with VIPT aliasing cache";
-		}
-		if (reason) {
-			printk(KERN_CRIT "HIGHMEM is not supported %s, ignoring high memory\n",
-				reason);
-			while (j > 0 && meminfo.bank[j - 1].highmem)
-				j--;
+			if (!memblock_limit) {
+				if (!IS_ALIGNED(block_start, SECTION_SIZE))
+					memblock_limit = block_start;
+				else if (!IS_ALIGNED(block_end, SECTION_SIZE))
+					memblock_limit = arm_lowmem_limit;
+			}
+
 		}
 	}
-#endif
-	meminfo.nr_banks = j;
+
 	high_memory = __va(arm_lowmem_limit - 1) + 1;
-	memblock_set_current_limit(arm_lowmem_limit);
+
+	/*
+	 * Round the memblock limit down to a section size.  This
+	 * helps to ensure that we will allocate memory from the
+	 * last full section, which should be mapped.
+	 */
+	if (memblock_limit)
+		memblock_limit = round_down(memblock_limit, SECTION_SIZE);
+	if (!memblock_limit)
+		memblock_limit = arm_lowmem_limit;
+
+	memblock_set_current_limit(memblock_limit);
 }
 
 static inline void prepare_page_table(void)
@@ -1105,7 +1224,7 @@ void __init arm_mm_memblock_reserve(void)
  * called function.  This means you can't use any function or debugging
  * method which may touch any device, otherwise the kernel _will_ crash.
  */
-static void __init devicemaps_init(struct machine_desc *mdesc)
+static void __init devicemaps_init(const struct machine_desc *mdesc)
 {
 	struct map_desc map;
 	unsigned long addr;
@@ -1114,7 +1233,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
 	/*
 	 * Allocate the vector page early.
 	 */
-	vectors = early_alloc(PAGE_SIZE);
+	vectors = early_alloc(PAGE_SIZE * 2);
 
 	early_trap_init(vectors);
 
@@ -1159,20 +1278,34 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
 	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
 	map.virtual = 0xffff0000;
 	map.length = PAGE_SIZE;
+#ifdef CONFIG_KUSER_HELPERS
 	map.type = MT_HIGH_VECTORS;
+#else
+	map.type = MT_LOW_VECTORS;
+#endif
 	create_mapping(&map);
 
 	if (!vectors_high()) {
 		map.virtual = 0;
+		map.length = PAGE_SIZE * 2;
 		map.type = MT_LOW_VECTORS;
 		create_mapping(&map);
 	}
 
+	/* Now create a kernel read-only mapping */
+	map.pfn += 1;
+	map.virtual = 0xffff0000 + PAGE_SIZE;
+	map.length = PAGE_SIZE;
+	map.type = MT_LOW_VECTORS;
+	create_mapping(&map);
+
 	/*
 	 * Ask the machine support to map in the statically mapped devices.
 	 */
 	if (mdesc->map_io)
 		mdesc->map_io();
+	else
+		debug_ll_io_init();
 	fill_pmd_gaps();
 
 	/* Reserve fixed i/o space in VMALLOC region */
@@ -1193,12 +1326,17 @@ static void __init kmap_init(void)
 #ifdef CONFIG_HIGHMEM
 	pkmap_page_table = early_pte_alloc(pmd_off_k(PKMAP_BASE),
 		PKMAP_BASE, _PAGE_KERNEL_TABLE);
+
+	fixmap_page_table = early_pte_alloc(pmd_off_k(FIXADDR_START),
+		FIXADDR_START, _PAGE_KERNEL_TABLE);
 #endif
 }
 
 static void __init map_lowmem(void)
 {
 	struct memblock_region *reg;
+	unsigned long kernel_x_start = round_down(__pa(_stext), SECTION_SIZE);
+	unsigned long kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
 
 	/* Map all the lowmem memory banks. */
 	for_each_memblock(memory, reg) {
@@ -1211,31 +1349,138 @@ static void __init map_lowmem(void)
 		if (start >= end)
 			break;
 
-		map.pfn = __phys_to_pfn(start);
-		map.virtual = __phys_to_virt(start);
-		map.length = end - start;
-		map.type = MT_MEMORY;
+		if (end < kernel_x_start || start >= kernel_x_end) {
+			map.pfn = __phys_to_pfn(start);
+			map.virtual = __phys_to_virt(start);
+			map.length = end - start;
+			map.type = MT_MEMORY_RWX;
 
-		create_mapping(&map);
+			create_mapping(&map);
+		} else {
+			/* This better cover the entire kernel */
+			if (start < kernel_x_start) {
+				map.pfn = __phys_to_pfn(start);
+				map.virtual = __phys_to_virt(start);
+				map.length = kernel_x_start - start;
+				map.type = MT_MEMORY_RW;
+
+				create_mapping(&map);
+			}
+
+			map.pfn = __phys_to_pfn(kernel_x_start);
+			map.virtual = __phys_to_virt(kernel_x_start);
+			map.length = kernel_x_end - kernel_x_start;
+			map.type = MT_MEMORY_RWX;
+
+			create_mapping(&map);
+
+			if (kernel_x_end < end) {
+				map.pfn = __phys_to_pfn(kernel_x_end);
+				map.virtual = __phys_to_virt(kernel_x_end);
+				map.length = end - kernel_x_end;
+				map.type = MT_MEMORY_RW;
+
+				create_mapping(&map);
+			}
+		}
+	}
+}
+
+#ifdef CONFIG_ARM_LPAE
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+			      struct proc_info_list *procinfo)
+{
+	pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
+	unsigned long map_start, map_end;
+	pgd_t *pgd0, *pgdk;
+	pud_t *pud0, *pudk, *pud_start;
+	pmd_t *pmd0, *pmdk;
+	phys_addr_t phys;
+	int i;
+
+	if (!(mdesc->init_meminfo))
+		return;
+
+	/* remap kernel code and data */
+	map_start = init_mm.start_code & PMD_MASK;
+	map_end   = ALIGN(init_mm.brk, PMD_SIZE);
+
+	/* get a handle on things... */
+	pgd0 = pgd_offset_k(0);
+	pud_start = pud0 = pud_offset(pgd0, 0);
+	pmd0 = pmd_offset(pud0, 0);
+
+	pgdk = pgd_offset_k(map_start);
+	pudk = pud_offset(pgdk, map_start);
+	pmdk = pmd_offset(pudk, map_start);
+
+	mdesc->init_meminfo();
+
+	/* Run the patch stub to update the constants */
+	fixup_pv_table(&__pv_table_begin,
+		(&__pv_table_end - &__pv_table_begin) << 2);
+
+	/*
+	 * Cache cleaning operations for self-modifying code
+	 * We should clean the entries by MVA but running a
+	 * for loop over every pv_table entry pointer would
+	 * just complicate the code.
+	 */
+	flush_cache_louis();
+	dsb(ishst);
+	isb();
+
+	/* remap level 1 table */
+	for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
+		set_pud(pud0,
+			__pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
+		pmd0 += PTRS_PER_PMD;
 	}
+
+	/* remap pmds for kernel mapping */
+	phys = __pa(map_start);
+	do {
+		*pmdk++ = __pmd(phys | pmdprot);
+		phys += PMD_SIZE;
+	} while (phys < map_end);
+
+	flush_cache_all();
+	cpu_switch_mm(pgd0, &init_mm);
+	cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+	local_flush_bp_all();
+	local_flush_tlb_all();
 }
 
+#else
+
+void __init early_paging_init(const struct machine_desc *mdesc,
+			      struct proc_info_list *procinfo)
+{
+	if (mdesc->init_meminfo)
+		mdesc->init_meminfo();
+}
+
+#endif
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
  */
-void __init paging_init(struct machine_desc *mdesc)
+void __init paging_init(const struct machine_desc *mdesc)
 {
 	void *zero_page;
 
-	memblock_set_current_limit(arm_lowmem_limit);
-
 	build_mem_type_table();
 	prepare_page_table();
 	map_lowmem();
 	dma_contiguous_remap();
 	devicemaps_init(mdesc);
 	kmap_init();
+	tcm_init();
 
 	top_pmd = pmd_off_k(0xffff0000);
 
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index d51225f90ae..a014dfacd5c 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -8,6 +8,7 @@
 #include <linux/pagemap.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
+#include <linux/kernel.h>
 
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
@@ -15,32 +16,310 @@
 #include <asm/setup.h>
 #include <asm/traps.h>
 #include <asm/mach/arch.h>
+#include <asm/cputype.h>
+#include <asm/mpu.h>
+#include <asm/procinfo.h>
 
 #include "mm.h"
 
+#ifdef CONFIG_ARM_MPU
+struct mpu_rgn_info mpu_rgn_info;
+
+/* Region number */
+static void rgnr_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c2, 0" : : "r" (v));
+}
+
+/* Data-side / unified region attributes */
+
+/* Region access control register */
+static void dracr_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c1, 4" : : "r" (v));
+}
+
+/* Region size register */
+static void drsr_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c1, 2" : : "r" (v));
+}
+
+/* Region base address register */
+static void drbar_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c1, 0" : : "r" (v));
+}
+
+static u32 drbar_read(void)
+{
+	u32 v;
+	asm("mrc        p15, 0, %0, c6, c1, 0" : "=r" (v));
+	return v;
+}
+/* Optional instruction-side region attributes */
+
+/* I-side Region access control register */
+static void iracr_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c1, 5" : : "r" (v));
+}
+
+/* I-side Region size register */
+static void irsr_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c1, 3" : : "r" (v));
+}
+
+/* I-side Region base address register */
+static void irbar_write(u32 v)
+{
+	asm("mcr        p15, 0, %0, c6, c1, 1" : : "r" (v));
+}
+
+static unsigned long irbar_read(void)
+{
+	unsigned long v;
+	asm("mrc        p15, 0, %0, c6, c1, 1" : "=r" (v));
+	return v;
+}
+
+/* MPU initialisation functions */
+void __init sanity_check_meminfo_mpu(void)
+{
+	int i;
+	phys_addr_t phys_offset = PHYS_OFFSET;
+	phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size;
+	struct memblock_region *reg;
+	bool first = true;
+	phys_addr_t mem_start;
+	phys_addr_t mem_end;
+
+	for_each_memblock(memory, reg) {
+		if (first) {
+			/*
+			 * Initially only use memory continuous from
+			 * PHYS_OFFSET */
+			if (reg->base != phys_offset)
+				panic("First memory bank must be contiguous from PHYS_OFFSET");
+
+			mem_start = reg->base;
+			mem_end = reg->base + reg->size;
+			specified_mem_size = reg->size;
+			first = false;
+		} else {
+			/*
+			 * memblock auto merges contiguous blocks, remove
+			 * all blocks afterwards
+			 */
+			pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n",
+				  &mem_start, &reg->base);
+			memblock_remove(reg->base, reg->size);
+		}
+	}
+
+	/*
+	 * MPU has curious alignment requirements: Size must be power of 2, and
+	 * region start must be aligned to the region size
+	 */
+	if (phys_offset != 0)
+		pr_info("PHYS_OFFSET != 0 => MPU Region size constrained by alignment requirements\n");
+
+	/*
+	 * Maximum aligned region might overflow phys_addr_t if phys_offset is
+	 * 0. Hence we keep everything below 4G until we take the smaller of
+	 * the aligned_region_size and rounded_mem_size, one of which is
+	 * guaranteed to be smaller than the maximum physical address.
+	 */
+	aligned_region_size = (phys_offset - 1) ^ (phys_offset);
+	/* Find the max power-of-two sized region that fits inside our bank */
+	rounded_mem_size = (1 <<  __fls(specified_mem_size)) - 1;
+
+	/* The actual region size is the smaller of the two */
+	aligned_region_size = aligned_region_size < rounded_mem_size
+				? aligned_region_size + 1
+				: rounded_mem_size + 1;
+
+	if (aligned_region_size != specified_mem_size) {
+		pr_warn("Truncating memory from %pa to %pa (MPU region constraints)",
+				&specified_mem_size, &aligned_region_size);
+		memblock_remove(mem_start + aligned_region_size,
+				specified_mem_size - aligned_round_size);
+
+		mem_end = mem_start + aligned_region_size;
+	}
+
+	pr_debug("MPU Region from %pa size %pa (end %pa))\n",
+		&phys_offset, &aligned_region_size, &mem_end);
+
+}
+
+static int mpu_present(void)
+{
+	return ((read_cpuid_ext(CPUID_EXT_MMFR0) & MMFR0_PMSA) == MMFR0_PMSAv7);
+}
+
+static int mpu_max_regions(void)
+{
+	/*
+	 * We don't support a different number of I/D side regions so if we
+	 * have separate instruction and data memory maps then return
+	 * whichever side has a smaller number of supported regions.
+	 */
+	u32 dregions, iregions, mpuir;
+	mpuir = read_cpuid(CPUID_MPUIR);
+
+	dregions = iregions = (mpuir & MPUIR_DREGION_SZMASK) >> MPUIR_DREGION;
+
+	/* Check for separate d-side and i-side memory maps */
+	if (mpuir & MPUIR_nU)
+		iregions = (mpuir & MPUIR_IREGION_SZMASK) >> MPUIR_IREGION;
+
+	/* Use the smallest of the two maxima */
+	return min(dregions, iregions);
+}
+
+static int mpu_iside_independent(void)
+{
+	/* MPUIR.nU specifies whether there is *not* a unified memory map */
+	return read_cpuid(CPUID_MPUIR) & MPUIR_nU;
+}
+
+static int mpu_min_region_order(void)
+{
+	u32 drbar_result, irbar_result;
+	/* We've kept a region free for this probing */
+	rgnr_write(MPU_PROBE_REGION);
+	isb();
+	/*
+	 * As per ARM ARM, write 0xFFFFFFFC to DRBAR to find the minimum
+	 * region order
+	*/
+	drbar_write(0xFFFFFFFC);
+	drbar_result = irbar_result = drbar_read();
+	drbar_write(0x0);
+	/* If the MPU is non-unified, we use the larger of the two minima*/
+	if (mpu_iside_independent()) {
+		irbar_write(0xFFFFFFFC);
+		irbar_result = irbar_read();
+		irbar_write(0x0);
+	}
+	isb(); /* Ensure that MPU region operations have completed */
+	/* Return whichever result is larger */
+	return __ffs(max(drbar_result, irbar_result));
+}
+
+static int mpu_setup_region(unsigned int number, phys_addr_t start,
+			unsigned int size_order, unsigned int properties)
+{
+	u32 size_data;
+
+	/* We kept a region free for probing resolution of MPU regions*/
+	if (number > mpu_max_regions() || number == MPU_PROBE_REGION)
+		return -ENOENT;
+
+	if (size_order > 32)
+		return -ENOMEM;
+
+	if (size_order < mpu_min_region_order())
+		return -ENOMEM;
+
+	/* Writing N to bits 5:1 (RSR_SZ)  specifies region size 2^N+1 */
+	size_data = ((size_order - 1) << MPU_RSR_SZ) | 1 << MPU_RSR_EN;
+
+	dsb(); /* Ensure all previous data accesses occur with old mappings */
+	rgnr_write(number);
+	isb();
+	drbar_write(start);
+	dracr_write(properties);
+	isb(); /* Propagate properties before enabling region */
+	drsr_write(size_data);
+
+	/* Check for independent I-side registers */
+	if (mpu_iside_independent()) {
+		irbar_write(start);
+		iracr_write(properties);
+		isb();
+		irsr_write(size_data);
+	}
+	isb();
+
+	/* Store region info (we treat i/d side the same, so only store d) */
+	mpu_rgn_info.rgns[number].dracr = properties;
+	mpu_rgn_info.rgns[number].drbar = start;
+	mpu_rgn_info.rgns[number].drsr = size_data;
+	return 0;
+}
+
+/*
+* Set up default MPU regions, doing nothing if there is no MPU
+*/
+void __init mpu_setup(void)
+{
+	int region_err;
+	if (!mpu_present())
+		return;
+
+	region_err = mpu_setup_region(MPU_RAM_REGION, PHYS_OFFSET,
+					ilog2(meminfo.bank[0].size),
+					MPU_AP_PL1RW_PL0RW | MPU_RGN_NORMAL);
+	if (region_err) {
+		panic("MPU region initialization failure! %d", region_err);
+	} else {
+		pr_info("Using ARMv7 PMSA Compliant MPU. "
+			 "Region independence: %s, Max regions: %d\n",
+			mpu_iside_independent() ? "Yes" : "No",
+			mpu_max_regions());
+	}
+}
+#else
+static void sanity_check_meminfo_mpu(void) {}
+static void __init mpu_setup(void) {}
+#endif /* CONFIG_ARM_MPU */
+
 void __init arm_mm_memblock_reserve(void)
 {
+#ifndef CONFIG_CPU_V7M
 	/*
 	 * Register the exception vector page.
 	 * some architectures which the DRAM is the exception vector to trap,
 	 * alloc_page breaks with error, although it is not NULL, but "0."
 	 */
 	memblock_reserve(CONFIG_VECTORS_BASE, PAGE_SIZE);
+#else /* ifndef CONFIG_CPU_V7M */
+	/*
+	 * There is no dedicated vector page on V7-M. So nothing needs to be
+	 * reserved here.
+	 */
+#endif
 }
 
 void __init sanity_check_meminfo(void)
 {
-	phys_addr_t end = bank_phys_end(&meminfo.bank[meminfo.nr_banks - 1]);
+	phys_addr_t end;
+	sanity_check_meminfo_mpu();
+	end = memblock_end_of_DRAM();
 	high_memory = __va(end - 1) + 1;
+	memblock_set_current_limit(end);
+}
+
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+			      struct proc_info_list *procinfo)
+{
 }
 
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
  */
-void __init paging_init(struct machine_desc *mdesc)
+void __init paging_init(const struct machine_desc *mdesc)
 {
 	early_trap_init((void *)CONFIG_VECTORS_BASE);
+	mpu_setup();
 	bootmem_init();
 }
 
@@ -57,6 +336,12 @@ void flush_dcache_page(struct page *page)
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
+void flush_kernel_dcache_page(struct page *page)
+{
+	__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
+}
+EXPORT_SYMBOL(flush_kernel_dcache_page);
+
 void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
 		       unsigned long uaddr, void *dst, const void *src,
 		       unsigned long len)
@@ -81,16 +366,16 @@ void __iomem *__arm_ioremap_pfn_caller(unsigned long pfn, unsigned long offset,
 	return __arm_ioremap_pfn(pfn, offset, size, mtype);
 }
 
-void __iomem *__arm_ioremap(unsigned long phys_addr, size_t size,
+void __iomem *__arm_ioremap(phys_addr_t phys_addr, size_t size,
 			    unsigned int mtype)
 {
 	return (void __iomem *)phys_addr;
 }
 EXPORT_SYMBOL(__arm_ioremap);
 
-void __iomem * (*arch_ioremap_caller)(unsigned long, size_t, unsigned int, void *);
+void __iomem * (*arch_ioremap_caller)(phys_addr_t, size_t, unsigned int, void *);
 
-void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
+void __iomem *__arm_ioremap_caller(phys_addr_t phys_addr, size_t size,
 				   unsigned int mtype, void *caller)
 {
 	return __arm_ioremap(phys_addr, size, mtype);
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 0acb089d0f7..249379535be 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -23,7 +23,7 @@
 #define __pgd_alloc()	kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL)
 #define __pgd_free(pgd)	kfree(pgd)
 #else
-#define __pgd_alloc()	(pgd_t *)__get_free_pages(GFP_KERNEL, 2)
+#define __pgd_alloc()	(pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_REPEAT, 2)
 #define __pgd_free(pgd)	free_pages((unsigned long)pgd, 2)
 #endif
 
@@ -87,7 +87,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 		init_pud = pud_offset(init_pgd, 0);
 		init_pmd = pmd_offset(init_pud, 0);
 		init_pte = pte_offset_map(init_pmd, 0);
-		set_pte_ext(new_pte, *init_pte, 0);
+		set_pte_ext(new_pte + 0, init_pte[0], 0);
+		set_pte_ext(new_pte + 1, init_pte[1], 0);
 		pte_unmap(init_pte);
 		pte_unmap(new_pte);
 	}
diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
index 2bb61e703d6..d1a2d05971e 100644
--- a/arch/arm/mm/proc-arm1020.S
+++ b/arch/arm/mm/proc-arm1020.S
@@ -443,8 +443,6 @@ ENTRY(cpu_arm1020_set_pte_ext)
 #endif /* CONFIG_MMU */
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__arm1020_setup, #function
 __arm1020_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index 8f96aa40f51..9d89405c3d0 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -425,8 +425,6 @@ ENTRY(cpu_arm1020e_set_pte_ext)
 #endif /* CONFIG_MMU */
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__arm1020e_setup, #function
 __arm1020e_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index 8ebe4a469a2..6f01a0ae3b3 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -407,8 +407,6 @@ ENTRY(cpu_arm1022_set_pte_ext)
 #endif /* CONFIG_MMU */
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__arm1022_setup, #function
 __arm1022_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 093fc7e520c..4799a24b43e 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -396,9 +396,6 @@ ENTRY(cpu_arm1026_set_pte_ext)
 #endif /* CONFIG_MMU */
 	mov	pc, lr
 
-
-	__CPUINIT
-
 	.type	__arm1026_setup, #function
 __arm1026_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm720.S b/arch/arm/mm/proc-arm720.S
index 0ac908c7ade..d42c37f9f5b 100644
--- a/arch/arm/mm/proc-arm720.S
+++ b/arch/arm/mm/proc-arm720.S
@@ -116,8 +116,6 @@ ENTRY(cpu_arm720_reset)
 ENDPROC(cpu_arm720_reset)
 		.popsection
 
-	__CPUINIT
-
 	.type	__arm710_setup, #function
 __arm710_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm740.S b/arch/arm/mm/proc-arm740.S
index dc5de5d53f2..9b0ae90cbf1 100644
--- a/arch/arm/mm/proc-arm740.S
+++ b/arch/arm/mm/proc-arm740.S
@@ -60,8 +60,6 @@ ENTRY(cpu_arm740_reset)
 ENDPROC(cpu_arm740_reset)
 	.popsection
 
-	__CPUINIT
-
 	.type	__arm740_setup, #function
 __arm740_setup:
 	mov	r0, #0
@@ -77,24 +75,27 @@ __arm740_setup:
 	mcr	p15, 0, r0, c6,	c0		@ set area 0, default
 
 	ldr	r0, =(CONFIG_DRAM_BASE & 0xFFFFF000) @ base[31:12] of RAM
-	ldr	r1, =(CONFIG_DRAM_SIZE >> 12)	@ size of RAM (must be >= 4KB)
-	mov	r2, #10				@ 11 is the minimum (4KB)
-1:	add	r2, r2, #1			@ area size *= 2
-	mov	r1, r1, lsr #1
+	ldr	r3, =(CONFIG_DRAM_SIZE >> 12)	@ size of RAM (must be >= 4KB)
+	mov	r4, #10				@ 11 is the minimum (4KB)
+1:	add	r4, r4, #1			@ area size *= 2
+	movs	r3, r3, lsr #1
 	bne	1b				@ count not zero r-shift
-	orr	r0, r0, r2, lsl #1		@ the area register value
+	orr	r0, r0, r4, lsl #1		@ the area register value
 	orr	r0, r0, #1			@ set enable bit
 	mcr	p15, 0, r0, c6,	c1		@ set area 1, RAM
 
 	ldr	r0, =(CONFIG_FLASH_MEM_BASE & 0xFFFFF000) @ base[31:12] of FLASH
-	ldr	r1, =(CONFIG_FLASH_SIZE >> 12)	@ size of FLASH (must be >= 4KB)
-	mov	r2, #10				@ 11 is the minimum (4KB)
-1:	add	r2, r2, #1			@ area size *= 2
-	mov	r1, r1, lsr #1
+	ldr	r3, =(CONFIG_FLASH_SIZE >> 12)	@ size of FLASH (must be >= 4KB)
+	cmp	r3, #0
+	moveq	r0, #0
+	beq	2f
+	mov	r4, #10				@ 11 is the minimum (4KB)
+1:	add	r4, r4, #1			@ area size *= 2
+	movs	r3, r3, lsr #1
 	bne	1b				@ count not zero r-shift
-	orr	r0, r0, r2, lsl #1		@ the area register value
+	orr	r0, r0, r4, lsl #1		@ the area register value
 	orr	r0, r0, #1			@ set enable bit
-	mcr	p15, 0, r0, c6,	c2		@ set area 2, ROM/FLASH
+2:	mcr	p15, 0, r0, c6,	c2		@ set area 2, ROM/FLASH
 
 	mov	r0, #0x06
 	mcr	p15, 0, r0, c2, c0		@ Region 1&2 cacheable
@@ -137,13 +138,14 @@ __arm740_proc_info:
 	.long	0x41807400
 	.long	0xfffffff0
 	.long	0
+	.long	0
 	b	__arm740_setup
 	.long	cpu_arch_name
 	.long	cpu_elf_name
-	.long	HWCAP_SWP | HWCAP_HALF | HWCAP_26BIT
+	.long	HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_26BIT
 	.long	cpu_arm740_name
 	.long	arm740_processor_functions
 	.long	0
 	.long	0
-	.long	v3_cache_fns			@ cache model
+	.long	v4_cache_fns			@ cache model
 	.size	__arm740_proc_info, . - __arm740_proc_info
diff --git a/arch/arm/mm/proc-arm7tdmi.S b/arch/arm/mm/proc-arm7tdmi.S
index 6ddea3e464b..f6cc3f63ce3 100644
--- a/arch/arm/mm/proc-arm7tdmi.S
+++ b/arch/arm/mm/proc-arm7tdmi.S
@@ -51,8 +51,6 @@ ENTRY(cpu_arm7tdmi_reset)
 ENDPROC(cpu_arm7tdmi_reset)
 		.popsection
 
-		__CPUINIT
-
 		.type	__arm7tdmi_setup, #function
 __arm7tdmi_setup:
 		mov	pc, lr
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 2c3b9421ab5..549557df6d5 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -387,7 +387,7 @@ ENTRY(cpu_arm920_set_pte_ext)
 /* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */
 .globl	cpu_arm920_suspend_size
 .equ	cpu_arm920_suspend_size, 4 * 3
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_arm920_do_suspend)
 	stmfd	sp!, {r4 - r6, lr}
 	mrc	p15, 0, r4, c13, c0, 0	@ PID
@@ -410,8 +410,6 @@ ENTRY(cpu_arm920_do_resume)
 ENDPROC(cpu_arm920_do_resume)
 #endif
 
-	__CPUINIT
-
 	.type	__arm920_setup, #function
 __arm920_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index 4464c49d744..2a758b06c6f 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -388,8 +388,6 @@ ENTRY(cpu_arm922_set_pte_ext)
 #endif /* CONFIG_MMU */
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__arm922_setup, #function
 __arm922_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index 281eb9b9c1d..ba0d58e1a2a 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -438,8 +438,6 @@ ENTRY(cpu_arm925_set_pte_ext)
 #endif /* CONFIG_MMU */
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__arm925_setup, #function
 __arm925_setup:
 	mov	r0, #0
@@ -504,6 +502,7 @@ __\name\()_proc_info:
 	.long	\cpu_val
 	.long	\cpu_mask
 	.long   PMD_TYPE_SECT | \
+		PMD_SECT_CACHEABLE | \
 		PMD_BIT4 | \
 		PMD_SECT_AP_WRITE | \
 		PMD_SECT_AP_READ
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index f1803f7e297..0f098f407c9 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -402,7 +402,7 @@ ENTRY(cpu_arm926_set_pte_ext)
 /* Suspend/resume support: taken from arch/arm/plat-s3c24xx/sleep.S */
 .globl	cpu_arm926_suspend_size
 .equ	cpu_arm926_suspend_size, 4 * 3
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_arm926_do_suspend)
 	stmfd	sp!, {r4 - r6, lr}
 	mrc	p15, 0, r4, c13, c0, 0	@ PID
@@ -425,8 +425,6 @@ ENTRY(cpu_arm926_do_resume)
 ENDPROC(cpu_arm926_do_resume)
 #endif
 
-	__CPUINIT
-
 	.type	__arm926_setup, #function
 __arm926_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index 8da189d4a40..1c39a704ff6 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -273,8 +273,6 @@ ENDPROC(arm940_dma_unmap_area)
 	@ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
 	define_cache_functions arm940
 
-	__CPUINIT
-
 	.type	__arm940_setup, #function
 __arm940_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index f666cf34075..0289cd905e7 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -326,8 +326,6 @@ ENTRY(cpu_arm946_dcache_clean_area)
 	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__arm946_setup, #function
 __arm946_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-arm9tdmi.S b/arch/arm/mm/proc-arm9tdmi.S
index 8881391dfb9..f51197ba754 100644
--- a/arch/arm/mm/proc-arm9tdmi.S
+++ b/arch/arm/mm/proc-arm9tdmi.S
@@ -51,8 +51,6 @@ ENTRY(cpu_arm9tdmi_reset)
 ENDPROC(cpu_arm9tdmi_reset)
 		.popsection
 
-		__CPUINIT
-
 		.type	__arm9tdmi_setup, #function
 __arm9tdmi_setup:
 		mov	pc, lr
diff --git a/arch/arm/mm/proc-fa526.S b/arch/arm/mm/proc-fa526.S
index d217e9795d7..2dfc0f1d3bf 100644
--- a/arch/arm/mm/proc-fa526.S
+++ b/arch/arm/mm/proc-fa526.S
@@ -81,7 +81,6 @@ ENDPROC(cpu_fa526_reset)
  */
 	.align	4
 ENTRY(cpu_fa526_do_idle)
-	mcr	p15, 0, r0, c7, c0, 4		@ Wait for interrupt
 	mov	pc, lr
 
 
@@ -136,8 +135,6 @@ ENTRY(cpu_fa526_set_pte_ext)
 #endif
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__fa526_setup, #function
 __fa526_setup:
 	/* On return of this routine, r0 must carry correct flags for CFG register */
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index 4106b09e0c2..db79b62c92f 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -514,7 +514,31 @@ ENTRY(cpu_feroceon_set_pte_ext)
 #endif
 	mov	pc, lr
 
-	__CPUINIT
+/* Suspend/resume support: taken from arch/arm/mm/proc-arm926.S */
+.globl	cpu_feroceon_suspend_size
+.equ	cpu_feroceon_suspend_size, 4 * 3
+#ifdef CONFIG_ARM_CPU_SUSPEND
+ENTRY(cpu_feroceon_do_suspend)
+	stmfd	sp!, {r4 - r6, lr}
+	mrc	p15, 0, r4, c13, c0, 0	@ PID
+	mrc	p15, 0, r5, c3, c0, 0	@ Domain ID
+	mrc	p15, 0, r6, c1, c0, 0	@ Control register
+	stmia	r0, {r4 - r6}
+	ldmfd	sp!, {r4 - r6, pc}
+ENDPROC(cpu_feroceon_do_suspend)
+
+ENTRY(cpu_feroceon_do_resume)
+	mov	ip, #0
+	mcr	p15, 0, ip, c8, c7, 0	@ invalidate I+D TLBs
+	mcr	p15, 0, ip, c7, c7, 0	@ invalidate I+D caches
+	ldmia	r0, {r4 - r6}
+	mcr	p15, 0, r4, c13, c0, 0	@ PID
+	mcr	p15, 0, r5, c3, c0, 0	@ Domain ID
+	mcr	p15, 0, r1, c2, c0, 0	@ TTB address
+	mov	r0, r6			@ control register
+	b	cpu_resume_mmu
+ENDPROC(cpu_feroceon_do_resume)
+#endif
 
 	.type	__feroceon_setup, #function
 __feroceon_setup:
diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index eb6aa73bc8b..ee1d8059395 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S
@@ -38,9 +38,14 @@
 
 /*
  * mmid - get context id from mm pointer (mm->context.id)
+ * note, this field is 64bit, so in big-endian the two words are swapped too.
  */
 	.macro	mmid, rd, rn
+#ifdef __ARMEB__
+	ldr	\rd, [\rn, #MM_CONTEXT_ID + 4 ]
+#else
 	ldr	\rd, [\rn, #MM_CONTEXT_ID]
+#endif
 	.endm
 
 /*
@@ -107,13 +112,9 @@
  *  100x   1   0   1	r/o	no acc
  *  10x0   1   0   1	r/o	no acc
  *  1011   0   0   1	r/w	no acc
- *  110x   0   1   0	r/w	r/o
- *  11x0   0   1   0	r/w	r/o
- *  1111   0   1   1	r/w	r/w
- *
- * If !CONFIG_CPU_USE_DOMAINS, the following permissions are changed:
  *  110x   1   1   1	r/o	r/o
  *  11x0   1   1   1	r/o	r/o
+ *  1111   0   1   1	r/w	r/w
  */
 	.macro	armv6_mt_table pfx
 \pfx\()_mt_table:
@@ -132,7 +133,7 @@
 	.long	PTE_EXT_TEX(2)					@ L_PTE_MT_DEV_NONSHARED
 	.long	0x00						@ unused
 	.long	0x00						@ unused
-	.long	0x00						@ unused
+	.long	PTE_CACHEABLE | PTE_BUFFERABLE | PTE_EXT_APX	@ L_PTE_MT_VECTORS
 	.endm
 
 	.macro	armv6_set_pte_ext pfx
@@ -153,24 +154,21 @@
 
 	tst	r1, #L_PTE_USER
 	orrne	r3, r3, #PTE_EXT_AP1
-#ifdef CONFIG_CPU_USE_DOMAINS
-	@ allow kernel read/write access to read-only user pages
 	tstne	r3, #PTE_EXT_APX
-	bicne	r3, r3, #PTE_EXT_APX | PTE_EXT_AP0
-#endif
+
+	@ user read-only -> kernel read-only
+	bicne	r3, r3, #PTE_EXT_AP0
 
 	tst	r1, #L_PTE_XN
 	orrne	r3, r3, #PTE_EXT_XN
 
-	orr	r3, r3, r2
+	eor	r3, r3, r2
 
 	tst	r1, #L_PTE_YOUNG
 	tstne	r1, #L_PTE_PRESENT
 	moveq	r3, #0
-#ifndef CONFIG_CPU_USE_DOMAINS
 	tstne	r1, #L_PTE_NONE
 	movne	r3, #0
-#endif
 
 	str	r3, [r0]
 	mcr	p15, 0, r0, c7, c10, 1		@ flush_pte
@@ -328,3 +326,8 @@ ENTRY(\name\()_tlb_fns)
 	.endif
 	.size	\name\()_tlb_fns, . - \name\()_tlb_fns
 .endm
+
+.macro globl_equ x, y
+	.globl	\x
+	.equ	\x, \y
+.endm
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index 82f9cdc751d..40acba59573 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -350,7 +350,7 @@ ENTRY(cpu_mohawk_set_pte_ext)
 
 .globl	cpu_mohawk_suspend_size
 .equ	cpu_mohawk_suspend_size, 4 * 6
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_mohawk_do_suspend)
 	stmfd	sp!, {r4 - r9, lr}
 	mrc	p14, 0, r4, c6, c0, 0	@ clock configuration, for turbo mode
@@ -383,8 +383,6 @@ ENTRY(cpu_mohawk_do_resume)
 ENDPROC(cpu_mohawk_do_resume)
 #endif
 
-	__CPUINIT
-
 	.type	__mohawk_setup, #function
 __mohawk_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-sa110.S b/arch/arm/mm/proc-sa110.S
index 775d70fba93..c45319c8f1d 100644
--- a/arch/arm/mm/proc-sa110.S
+++ b/arch/arm/mm/proc-sa110.S
@@ -159,8 +159,6 @@ ENTRY(cpu_sa110_set_pte_ext)
 #endif
 	mov	pc, lr
 
-	__CPUINIT
-
 	.type	__sa110_setup, #function
 __sa110_setup:
 	mov	r10, #0
diff --git a/arch/arm/mm/proc-sa1100.S b/arch/arm/mm/proc-sa1100.S
index 3aa0da11fd8..09d241ae2db 100644
--- a/arch/arm/mm/proc-sa1100.S
+++ b/arch/arm/mm/proc-sa1100.S
@@ -172,7 +172,7 @@ ENTRY(cpu_sa1100_set_pte_ext)
 
 .globl	cpu_sa1100_suspend_size
 .equ	cpu_sa1100_suspend_size, 4 * 3
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_sa1100_do_suspend)
 	stmfd	sp!, {r4 - r6, lr}
 	mrc	p15, 0, r4, c3, c0, 0		@ domain ID
@@ -198,8 +198,6 @@ ENTRY(cpu_sa1100_do_resume)
 ENDPROC(cpu_sa1100_do_resume)
 #endif
 
-	__CPUINIT
-
 	.type	__sa1100_setup, #function
 __sa1100_setup:
 	mov	r0, #0
diff --git a/arch/arm/mm/proc-syms.c b/arch/arm/mm/proc-syms.c
index 3e6210b4d6d..054b491ff76 100644
--- a/arch/arm/mm/proc-syms.c
+++ b/arch/arm/mm/proc-syms.c
@@ -17,7 +17,9 @@
 
 #ifndef MULTI_CPU
 EXPORT_SYMBOL(cpu_dcache_clean_area);
+#ifdef CONFIG_MMU
 EXPORT_SYMBOL(cpu_set_pte_ext);
+#endif
 #else
 EXPORT_SYMBOL(processor);
 #endif
diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S
index 09c5233f4df..32b3558321c 100644
--- a/arch/arm/mm/proc-v6.S
+++ b/arch/arm/mm/proc-v6.S
@@ -80,12 +80,10 @@ ENTRY(cpu_v6_do_idle)
 	mov	pc, lr
 
 ENTRY(cpu_v6_dcache_clean_area)
-#ifndef TLB_CAN_READ_FROM_L1_CACHE
 1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	subs	r1, r1, #D_CACHE_LINE_SIZE
 	bhi	1b
-#endif
 	mov	pc, lr
 
 /*
@@ -101,7 +99,7 @@ ENTRY(cpu_v6_dcache_clean_area)
 ENTRY(cpu_v6_switch_mm)
 #ifdef CONFIG_MMU
 	mov	r2, #0
-	ldr	r1, [r1, #MM_CONTEXT_ID]	@ get mm->context.id
+	mmid	r1, r1				@ get mm->context.id
 	ALT_SMP(orr	r0, r0, #TTB_FLAGS_SMP)
 	ALT_UP(orr	r0, r0, #TTB_FLAGS_UP)
 	mcr	p15, 0, r2, c7, c5, 6		@ flush BTAC/BTB
@@ -138,12 +136,14 @@ ENTRY(cpu_v6_set_pte_ext)
 /* Suspend/resume support: taken from arch/arm/mach-s3c64xx/sleep.S */
 .globl	cpu_v6_suspend_size
 .equ	cpu_v6_suspend_size, 4 * 6
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_v6_do_suspend)
 	stmfd	sp!, {r4 - r9, lr}
 	mrc	p15, 0, r4, c13, c0, 0	@ FCSE/PID
+#ifdef CONFIG_MMU
 	mrc	p15, 0, r5, c3, c0, 0	@ Domain ID
 	mrc	p15, 0, r6, c2, c0, 1	@ Translation table base 1
+#endif
 	mrc	p15, 0, r7, c1, c0, 1	@ auxiliary control register
 	mrc	p15, 0, r8, c1, c0, 2	@ co-processor access control
 	mrc	p15, 0, r9, c1, c0, 0	@ control register
@@ -160,14 +160,16 @@ ENTRY(cpu_v6_do_resume)
 	mcr	p15, 0, ip, c13, c0, 1	@ set reserved context ID
 	ldmia	r0, {r4 - r9}
 	mcr	p15, 0, r4, c13, c0, 0	@ FCSE/PID
+#ifdef CONFIG_MMU
 	mcr	p15, 0, r5, c3, c0, 0	@ Domain ID
 	ALT_SMP(orr	r1, r1, #TTB_FLAGS_SMP)
 	ALT_UP(orr	r1, r1, #TTB_FLAGS_UP)
 	mcr	p15, 0, r1, c2, c0, 0	@ Translation table base 0
 	mcr	p15, 0, r6, c2, c0, 1	@ Translation table base 1
+	mcr	p15, 0, ip, c2, c0, 2	@ TTB control register
+#endif
 	mcr	p15, 0, r7, c1, c0, 1	@ auxiliary control register
 	mcr	p15, 0, r8, c1, c0, 2	@ co-processor access control
-	mcr	p15, 0, ip, c2, c0, 2	@ TTB control register
 	mcr	p15, 0, ip, c7, c5, 4	@ ISB
 	mov	r0, r9			@ control register
 	b	cpu_resume_mmu
@@ -178,8 +180,6 @@ ENDPROC(cpu_v6_do_resume)
 
 	.align
 
-	__CPUINIT
-
 /*
  *	__v6_setup
  *
@@ -208,7 +208,6 @@ __v6_setup:
 	mcr	p15, 0, r0, c7, c14, 0		@ clean+invalidate D cache
 	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
 	mcr	p15, 0, r0, c7, c15, 0		@ clean+invalidate cache
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 #ifdef CONFIG_MMU
 	mcr	p15, 0, r0, c8, c7, 0		@ invalidate I + D TLBs
 	mcr	p15, 0, r0, c2, c0, 2		@ TTB control register
@@ -218,11 +217,11 @@ __v6_setup:
 	ALT_UP(orr	r8, r8, #TTB_FLAGS_UP)
 	mcr	p15, 0, r8, c2, c0, 1		@ load TTB1
 #endif /* CONFIG_MMU */
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer and
+						@ complete invalidations
 	adr	r5, v6_crval
 	ldmia	r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	orr	r6, r6, #1 << 25		@ big-endian page tables
-#endif
+ ARM_BE8(orr	r6, r6, #1 << 25)		@ big-endian page tables
 	mrc	p15, 0, r0, c1, c0, 0		@ read control register
 	bic	r0, r0, r5			@ clear bits them
 	orr	r0, r0, r6			@ set them
diff --git a/arch/arm/mm/proc-v7-2level.S b/arch/arm/mm/proc-v7-2level.S
index 6d98c13ab82..1f52915f2b2 100644
--- a/arch/arm/mm/proc-v7-2level.S
+++ b/arch/arm/mm/proc-v7-2level.S
@@ -40,7 +40,7 @@
 ENTRY(cpu_v7_switch_mm)
 #ifdef CONFIG_MMU
 	mov	r2, #0
-	ldr	r1, [r1, #MM_CONTEXT_ID]	@ get mm->context.id
+	mmid	r1, r1				@ get mm->context.id
 	ALT_SMP(orr	r0, r0, #TTB_FLAGS_SMP)
 	ALT_UP(orr	r0, r0, #TTB_FLAGS_UP)
 #ifdef CONFIG_ARM_ERRATA_430973
@@ -90,27 +90,21 @@ ENTRY(cpu_v7_set_pte_ext)
 
 	tst	r1, #L_PTE_USER
 	orrne	r3, r3, #PTE_EXT_AP1
-#ifdef CONFIG_CPU_USE_DOMAINS
-	@ allow kernel read/write access to read-only user pages
-	tstne	r3, #PTE_EXT_APX
-	bicne	r3, r3, #PTE_EXT_APX | PTE_EXT_AP0
-#endif
 
 	tst	r1, #L_PTE_XN
 	orrne	r3, r3, #PTE_EXT_XN
 
 	tst	r1, #L_PTE_YOUNG
 	tstne	r1, #L_PTE_VALID
-#ifndef CONFIG_CPU_USE_DOMAINS
 	eorne	r1, r1, #L_PTE_NONE
 	tstne	r1, #L_PTE_NONE
-#endif
 	moveq	r3, #0
 
  ARM(	str	r3, [r0, #2048]! )
  THUMB(	add	r0, r0, #2048 )
  THUMB(	str	r3, [r0] )
-	mcr	p15, 0, r0, c7, c10, 1		@ flush_pte
+	ALT_SMP(W(nop))
+	ALT_UP (mcr	p15, 0, r0, c7, c10, 1)		@ flush_pte
 #endif
 	mov	pc, lr
 ENDPROC(cpu_v7_set_pte_ext)
@@ -159,8 +153,6 @@ ENDPROC(cpu_v7_set_pte_ext)
 	mcr	p15, 0, \ttbr1, c2, c0, 1	@ load TTB1
 	.endm
 
-	__CPUINIT
-
 	/*   AT
 	 *  TFR   EV X F   I D LR    S
 	 * .EEE ..EE PUI. .T.T 4RVI ZWRS BLDP WCAM
@@ -171,5 +163,3 @@ ENDPROC(cpu_v7_set_pte_ext)
 	.type	v7_crval, #object
 v7_crval:
 	crval	clear=0x2120c302, mmuset=0x10c03c7d, ucset=0x00c01c7c
-
-	.previous
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index 7b56386f949..22e3ad63500 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -39,6 +39,14 @@
 #define TTB_FLAGS_SMP	(TTB_IRGN_WBWA|TTB_S|TTB_RGN_OC_WBWA)
 #define PMD_FLAGS_SMP	(PMD_SECT_WBWA|PMD_SECT_S)
 
+#ifndef __ARMEB__
+#  define rpgdl	r0
+#  define rpgdh	r1
+#else
+#  define rpgdl	r1
+#  define rpgdh	r0
+#endif
+
 /*
  * cpu_v7_switch_mm(pgd_phys, tsk)
  *
@@ -47,15 +55,23 @@
  */
 ENTRY(cpu_v7_switch_mm)
 #ifdef CONFIG_MMU
-	ldr	r1, [r1, #MM_CONTEXT_ID]	@ get mm->context.id
-	and	r3, r1, #0xff
-	mov	r3, r3, lsl #(48 - 32)		@ ASID
-	mcrr	p15, 0, r0, r3, c2		@ set TTB 0
+	mmid	r2, r2
+	asid	r2, r2
+	orr	rpgdh, rpgdh, r2, lsl #(48 - 32)	@ upper 32-bits of pgd
+	mcrr	p15, 0, rpgdl, rpgdh, c2		@ set TTB 0
 	isb
 #endif
 	mov	pc, lr
 ENDPROC(cpu_v7_switch_mm)
 
+#ifdef __ARMEB__
+#define rl r3
+#define rh r2
+#else
+#define rl r2
+#define rh r3
+#endif
+
 /*
  * cpu_v7_set_pte_ext(ptep, pte)
  *
@@ -65,15 +81,16 @@ ENDPROC(cpu_v7_switch_mm)
  */
 ENTRY(cpu_v7_set_pte_ext)
 #ifdef CONFIG_MMU
-	tst	r2, #L_PTE_VALID
+	tst	rl, #L_PTE_VALID
 	beq	1f
-	tst	r3, #1 << (57 - 32)		@ L_PTE_NONE
-	bicne	r2, #L_PTE_VALID
+	tst	rh, #1 << (57 - 32)		@ L_PTE_NONE
+	bicne	rl, #L_PTE_VALID
 	bne	1f
-	tst	r3, #1 << (55 - 32)		@ L_PTE_DIRTY
-	orreq	r2, #L_PTE_RDONLY
+	tst	rh, #1 << (55 - 32)		@ L_PTE_DIRTY
+	orreq	rl, #L_PTE_RDONLY
 1:	strd	r2, r3, [r0]
-	mcr	p15, 0, r0, c7, c10, 1		@ flush_pte
+	ALT_SMP(W(nop))
+	ALT_UP (mcr	p15, 0, r0, c7, c10, 1)		@ flush_pte
 #endif
 	mov	pc, lr
 ENDPROC(cpu_v7_set_pte_ext)
@@ -105,7 +122,8 @@ ENDPROC(cpu_v7_set_pte_ext)
 	 */
 	.macro	v7_ttb_setup, zero, ttbr0, ttbr1, tmp
 	ldr	\tmp, =swapper_pg_dir		@ swapper_pg_dir virtual address
-	cmp	\ttbr1, \tmp			@ PHYS_OFFSET > PAGE_OFFSET? (branch below)
+	mov	\tmp, \tmp, lsr #ARCH_PGD_SHIFT
+	cmp	\ttbr1, \tmp			@ PHYS_OFFSET > PAGE_OFFSET?
 	mrc	p15, 0, \tmp, c2, c0, 2		@ TTB control register
 	orr	\tmp, \tmp, #TTB_EAE
 	ALT_SMP(orr	\tmp, \tmp, #TTB_FLAGS_SMP)
@@ -113,31 +131,23 @@ ENDPROC(cpu_v7_set_pte_ext)
 	ALT_SMP(orr	\tmp, \tmp, #TTB_FLAGS_SMP << 16)
 	ALT_UP(orr	\tmp, \tmp, #TTB_FLAGS_UP << 16)
 	/*
-	 * TTBR0/TTBR1 split (PAGE_OFFSET):
-	 *   0x40000000: T0SZ = 2, T1SZ = 0 (not used)
-	 *   0x80000000: T0SZ = 0, T1SZ = 1
-	 *   0xc0000000: T0SZ = 0, T1SZ = 2
-	 *
-	 * Only use this feature if PHYS_OFFSET <= PAGE_OFFSET, otherwise
-	 * booting secondary CPUs would end up using TTBR1 for the identity
-	 * mapping set up in TTBR0.
+	 * Only use split TTBRs if PHYS_OFFSET <= PAGE_OFFSET (cmp above),
+	 * otherwise booting secondary CPUs would end up using TTBR1 for the
+	 * identity mapping set up in TTBR0.
 	 */
-	bhi	9001f				@ PHYS_OFFSET > PAGE_OFFSET?
-	orr	\tmp, \tmp, #(((PAGE_OFFSET >> 30) - 1) << 16) @ TTBCR.T1SZ
-#if defined CONFIG_VMSPLIT_2G
-	/* PAGE_OFFSET == 0x80000000, T1SZ == 1 */
-	add	\ttbr1, \ttbr1, #1 << 4		@ skip two L1 entries
-#elif defined CONFIG_VMSPLIT_3G
-	/* PAGE_OFFSET == 0xc0000000, T1SZ == 2 */
-	add	\ttbr1, \ttbr1, #4096 * (1 + 3)	@ only L2 used, skip pgd+3*pmd
-#endif
-	/* CONFIG_VMSPLIT_1G does not need TTBR1 adjustment */
-9001:	mcr	p15, 0, \tmp, c2, c0, 2		@ TTB control register
-	mcrr	p15, 1, \ttbr1, \zero, c2	@ load TTBR1
+	orrls	\tmp, \tmp, #TTBR1_SIZE				@ TTBCR.T1SZ
+	mcr	p15, 0, \tmp, c2, c0, 2				@ TTBCR
+	mov	\tmp, \ttbr1, lsr #(32 - ARCH_PGD_SHIFT)	@ upper bits
+	mov	\ttbr1, \ttbr1, lsl #ARCH_PGD_SHIFT		@ lower bits
+	addls	\ttbr1, \ttbr1, #TTBR1_OFFSET
+	mcrr	p15, 1, \ttbr1, \zero, c2			@ load TTBR1
+	mov	\tmp, \ttbr0, lsr #(32 - ARCH_PGD_SHIFT)	@ upper bits
+	mov	\ttbr0, \ttbr0, lsl #ARCH_PGD_SHIFT		@ lower bits
+	mcrr	p15, 0, \ttbr0, \zero, c2			@ load TTBR0
+	mcrr	p15, 1, \ttbr1, \zero, c2			@ load TTBR1
+	mcrr	p15, 0, \ttbr0, \zero, c2			@ load TTBR0
 	.endm
 
-	__CPUINIT
-
 	/*
 	 *   AT
 	 *  TFR   EV X F   IHD LR    S
@@ -149,5 +159,3 @@ ENDPROC(cpu_v7_set_pte_ext)
 	.type	v7_crval, #object
 v7_crval:
 	crval	clear=0x0120c302, mmuset=0x30c23c7d, ucset=0x00c01c7c
-
-	.previous
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 3a3c015f8d5..3db2c2f04a3 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -75,14 +75,15 @@ ENTRY(cpu_v7_do_idle)
 ENDPROC(cpu_v7_do_idle)
 
 ENTRY(cpu_v7_dcache_clean_area)
-#ifndef TLB_CAN_READ_FROM_L1_CACHE
-	dcache_line_size r2, r3
-1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
+	ALT_SMP(W(nop))			@ MP extensions imply L1 PTW
+	ALT_UP_B(1f)
+	mov	pc, lr
+1:	dcache_line_size r2, r3
+2:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
 	add	r0, r0, r2
 	subs	r1, r1, r2
-	bhi	1b
-	dsb
-#endif
+	bhi	2b
+	dsb	ishst
 	mov	pc, lr
 ENDPROC(cpu_v7_dcache_clean_area)
 
@@ -91,48 +92,59 @@ ENDPROC(cpu_v7_dcache_clean_area)
 
 /* Suspend/resume support: derived from arch/arm/mach-s5pv210/sleep.S */
 .globl	cpu_v7_suspend_size
-.equ	cpu_v7_suspend_size, 4 * 8
+.equ	cpu_v7_suspend_size, 4 * 9
 #ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_v7_do_suspend)
 	stmfd	sp!, {r4 - r10, lr}
 	mrc	p15, 0, r4, c13, c0, 0	@ FCSE/PID
 	mrc	p15, 0, r5, c13, c0, 3	@ User r/o thread ID
 	stmia	r0!, {r4 - r5}
+#ifdef CONFIG_MMU
 	mrc	p15, 0, r6, c3, c0, 0	@ Domain ID
+#ifdef CONFIG_ARM_LPAE
+	mrrc	p15, 1, r5, r7, c2	@ TTB 1
+#else
 	mrc	p15, 0, r7, c2, c0, 1	@ TTB 1
+#endif
 	mrc	p15, 0, r11, c2, c0, 2	@ TTB control register
+#endif
 	mrc	p15, 0, r8, c1, c0, 0	@ Control register
 	mrc	p15, 0, r9, c1, c0, 1	@ Auxiliary control register
 	mrc	p15, 0, r10, c1, c0, 2	@ Co-processor access control
-	stmia	r0, {r6 - r11}
+	stmia	r0, {r5 - r11}
 	ldmfd	sp!, {r4 - r10, pc}
 ENDPROC(cpu_v7_do_suspend)
 
 ENTRY(cpu_v7_do_resume)
 	mov	ip, #0
-	mcr	p15, 0, ip, c8, c7, 0	@ invalidate TLBs
 	mcr	p15, 0, ip, c7, c5, 0	@ invalidate I cache
 	mcr	p15, 0, ip, c13, c0, 1	@ set reserved context ID
 	ldmia	r0!, {r4 - r5}
 	mcr	p15, 0, r4, c13, c0, 0	@ FCSE/PID
 	mcr	p15, 0, r5, c13, c0, 3	@ User r/o thread ID
-	ldmia	r0, {r6 - r11}
+	ldmia	r0, {r5 - r11}
+#ifdef CONFIG_MMU
+	mcr	p15, 0, ip, c8, c7, 0	@ invalidate TLBs
 	mcr	p15, 0, r6, c3, c0, 0	@ Domain ID
-#ifndef CONFIG_ARM_LPAE
+#ifdef CONFIG_ARM_LPAE
+	mcrr	p15, 0, r1, ip, c2	@ TTB 0
+	mcrr	p15, 1, r5, r7, c2	@ TTB 1
+#else
 	ALT_SMP(orr	r1, r1, #TTB_FLAGS_SMP)
 	ALT_UP(orr	r1, r1, #TTB_FLAGS_UP)
-#endif
 	mcr	p15, 0, r1, c2, c0, 0	@ TTB 0
 	mcr	p15, 0, r7, c2, c0, 1	@ TTB 1
+#endif
 	mcr	p15, 0, r11, c2, c0, 2	@ TTB control register
-	mrc	p15, 0, r4, c1, c0, 1	@ Read Auxiliary control register
-	teq	r4, r9			@ Is it already set?
-	mcrne	p15, 0, r9, c1, c0, 1	@ No, so write it
-	mcr	p15, 0, r10, c1, c0, 2	@ Co-processor access control
 	ldr	r4, =PRRR		@ PRRR
 	ldr	r5, =NMRR		@ NMRR
 	mcr	p15, 0, r4, c10, c2, 0	@ write PRRR
 	mcr	p15, 0, r5, c10, c2, 1	@ write NMRR
+#endif	/* CONFIG_MMU */
+	mrc	p15, 0, r4, c1, c0, 1	@ Read Auxiliary control register
+	teq	r4, r9			@ Is it already set?
+	mcrne	p15, 0, r9, c1, c0, 1	@ No, so write it
+	mcr	p15, 0, r10, c1, c0, 2	@ Co-processor access control
 	isb
 	dsb
 	mov	r0, r8			@ control register
@@ -140,7 +152,50 @@ ENTRY(cpu_v7_do_resume)
 ENDPROC(cpu_v7_do_resume)
 #endif
 
-	__CPUINIT
+#ifdef CONFIG_CPU_PJ4B
+	globl_equ	cpu_pj4b_switch_mm,     cpu_v7_switch_mm
+	globl_equ	cpu_pj4b_set_pte_ext,	cpu_v7_set_pte_ext
+	globl_equ	cpu_pj4b_proc_init,	cpu_v7_proc_init
+	globl_equ	cpu_pj4b_proc_fin, 	cpu_v7_proc_fin
+	globl_equ	cpu_pj4b_reset,	   	cpu_v7_reset
+#ifdef CONFIG_PJ4B_ERRATA_4742
+ENTRY(cpu_pj4b_do_idle)
+	dsb					@ WFI may enter a low-power mode
+	wfi
+	dsb					@barrier
+	mov	pc, lr
+ENDPROC(cpu_pj4b_do_idle)
+#else
+	globl_equ	cpu_pj4b_do_idle,  	cpu_v7_do_idle
+#endif
+	globl_equ	cpu_pj4b_dcache_clean_area,	cpu_v7_dcache_clean_area
+#ifdef CONFIG_ARM_CPU_SUSPEND
+ENTRY(cpu_pj4b_do_suspend)
+	stmfd	sp!, {r6 - r10}
+	mrc	p15, 1, r6, c15, c1, 0  @ save CP15 - extra features
+	mrc	p15, 1, r7, c15, c2, 0	@ save CP15 - Aux Func Modes Ctrl 0
+	mrc	p15, 1, r8, c15, c1, 2	@ save CP15 - Aux Debug Modes Ctrl 2
+	mrc	p15, 1, r9, c15, c1, 1  @ save CP15 - Aux Debug Modes Ctrl 1
+	mrc	p15, 0, r10, c9, c14, 0  @ save CP15 - PMC
+	stmia	r0!, {r6 - r10}
+	ldmfd	sp!, {r6 - r10}
+	b cpu_v7_do_suspend
+ENDPROC(cpu_pj4b_do_suspend)
+
+ENTRY(cpu_pj4b_do_resume)
+	ldmia	r0!, {r6 - r10}
+	mcr	p15, 1, r6, c15, c1, 0  @ save CP15 - extra features
+	mcr	p15, 1, r7, c15, c2, 0	@ save CP15 - Aux Func Modes Ctrl 0
+	mcr	p15, 1, r8, c15, c1, 2	@ save CP15 - Aux Debug Modes Ctrl 2
+	mcr	p15, 1, r9, c15, c1, 1  @ save CP15 - Aux Debug Modes Ctrl 1
+	mcr	p15, 0, r10, c9, c14, 0  @ save CP15 - PMC
+	b cpu_v7_do_resume
+ENDPROC(cpu_pj4b_do_resume)
+#endif
+.globl	cpu_pj4b_suspend_size
+.equ	cpu_pj4b_suspend_size, 4 * 14
+
+#endif
 
 /*
  *	__v7_setup
@@ -155,10 +210,13 @@ ENDPROC(cpu_v7_do_resume)
  */
 __v7_ca5mp_setup:
 __v7_ca9mp_setup:
-	mov	r10, #(1 << 0)			@ TLB ops broadcasting
+__v7_cr7mp_setup:
+	mov	r10, #(1 << 0)			@ Cache/TLB ops broadcasting
 	b	1f
 __v7_ca7mp_setup:
+__v7_ca12mp_setup:
 __v7_ca15mp_setup:
+__v7_ca17mp_setup:
 	mov	r10, #0
 1:
 #ifdef CONFIG_SMP
@@ -303,9 +361,20 @@ __v7_setup:
 1:
 #endif
 
-3:	mov	r10, #0
+	/* Cortex-A15 Errata */
+3:	ldr	r10, =0x00000c0f		@ Cortex-A15 primary part number
+	teq	r0, r10
+	bne	4f
+
+#ifdef CONFIG_ARM_ERRATA_773022
+	cmp	r6, #0x4			@ only present up to r0p4
+	mrcle	p15, 0, r10, c1, c0, 1		@ read aux control register
+	orrle	r10, r10, #1 << 1		@ disable loop buffer
+	mcrle	p15, 0, r10, c1, c0, 1		@ write aux control register
+#endif
+
+4:	mov	r10, #0
 	mcr	p15, 0, r10, c7, c5, 0		@ I+BTB cache invalidate
-	dsb
 #ifdef CONFIG_MMU
 	mcr	p15, 0, r10, c8, c7, 0		@ invalidate I + D TLBs
 	v7_ttb_setup r10, r4, r8, r5		@ TTBCR, TTBRx setup
@@ -314,6 +383,7 @@ __v7_setup:
 	mcr	p15, 0, r5, c10, c2, 0		@ write PRRR
 	mcr	p15, 0, r6, c10, c2, 1		@ write NMRR
 #endif
+	dsb					@ Complete invalidations
 #ifndef CONFIG_ARM_THUMBEE
 	mrc	p15, 0, r0, c0, c1, 0		@ read ID_PFR0 for ThumbEE
 	and	r0, r0, #(0xf << 12)		@ ThumbEE enabled field
@@ -328,9 +398,7 @@ __v7_setup:
 #endif
 	adr	r5, v7_crval
 	ldmia	r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	orr	r6, r6, #1 << 25		@ big-endian page tables
-#endif
+ ARM_BE8(orr	r6, r6, #1 << 25)		@ big-endian page tables
 #ifdef CONFIG_SWP_EMULATE
 	orr     r5, r5, #(1 << 10)              @ set SW bit in "clear"
 	bic     r6, r6, #(1 << 10)              @ clear it in "mmuset"
@@ -350,6 +418,9 @@ __v7_setup_stack:
 
 	@ define struct processor (see <asm/proc-fns.h> and proc-macros.S)
 	define_processor_functions v7, dabort=v7_early_abort, pabort=v7_pabort, suspend=1
+#ifdef CONFIG_CPU_PJ4B
+	define_processor_functions pj4b, dabort=v7_early_abort, pabort=v7_pabort, suspend=1
+#endif
 
 	.section ".rodata"
 
@@ -362,7 +433,7 @@ __v7_setup_stack:
 	/*
 	 * Standard v7 proc info content
 	 */
-.macro __v7_proc initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0
+.macro __v7_proc initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0, proc_fns = v7_processor_functions
 	ALT_SMP(.long	PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
 			PMD_SECT_AF | PMD_FLAGS_SMP | \mm_mmuflags)
 	ALT_UP(.long	PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
@@ -375,7 +446,7 @@ __v7_setup_stack:
 	.long	HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_FAST_MULT | \
 		HWCAP_EDSP | HWCAP_TLS | \hwcaps
 	.long	cpu_v7_name
-	.long	v7_processor_functions
+	.long	\proc_fns
 	.long	v7wbi_tlb_fns
 	.long	v6_user_fns
 	.long	v7_cache_fns
@@ -402,16 +473,29 @@ __v7_ca9mp_proc_info:
 	__v7_proc __v7_ca9mp_setup
 	.size	__v7_ca9mp_proc_info, . - __v7_ca9mp_proc_info
 
+#endif	/* CONFIG_ARM_LPAE */
+
 	/*
 	 * Marvell PJ4B processor.
 	 */
+#ifdef CONFIG_CPU_PJ4B
 	.type   __v7_pj4b_proc_info, #object
 __v7_pj4b_proc_info:
-	.long	0x562f5840
-	.long	0xfffffff0
-	__v7_proc __v7_pj4b_setup
+	.long	0x560f5800
+	.long	0xff0fff00
+	__v7_proc __v7_pj4b_setup, proc_fns = pj4b_processor_functions
 	.size	__v7_pj4b_proc_info, . - __v7_pj4b_proc_info
-#endif	/* CONFIG_ARM_LPAE */
+#endif
+
+	/*
+	 * ARM Ltd. Cortex R7 processor.
+	 */
+	.type	__v7_cr7mp_proc_info, #object
+__v7_cr7mp_proc_info:
+	.long	0x410fc170
+	.long	0xff0ffff0
+	__v7_proc __v7_cr7mp_setup
+	.size	__v7_cr7mp_proc_info, . - __v7_cr7mp_proc_info
 
 	/*
 	 * ARM Ltd. Cortex A7 processor.
@@ -420,20 +504,55 @@ __v7_pj4b_proc_info:
 __v7_ca7mp_proc_info:
 	.long	0x410fc070
 	.long	0xff0ffff0
-	__v7_proc __v7_ca7mp_setup, hwcaps = HWCAP_IDIV
+	__v7_proc __v7_ca7mp_setup
 	.size	__v7_ca7mp_proc_info, . - __v7_ca7mp_proc_info
 
 	/*
+	 * ARM Ltd. Cortex A12 processor.
+	 */
+	.type	__v7_ca12mp_proc_info, #object
+__v7_ca12mp_proc_info:
+	.long	0x410fc0d0
+	.long	0xff0ffff0
+	__v7_proc __v7_ca12mp_setup
+	.size	__v7_ca12mp_proc_info, . - __v7_ca12mp_proc_info
+
+	/*
 	 * ARM Ltd. Cortex A15 processor.
 	 */
 	.type	__v7_ca15mp_proc_info, #object
 __v7_ca15mp_proc_info:
 	.long	0x410fc0f0
 	.long	0xff0ffff0
-	__v7_proc __v7_ca15mp_setup, hwcaps = HWCAP_IDIV
+	__v7_proc __v7_ca15mp_setup
 	.size	__v7_ca15mp_proc_info, . - __v7_ca15mp_proc_info
 
 	/*
+	 * ARM Ltd. Cortex A17 processor.
+	 */
+	.type	__v7_ca17mp_proc_info, #object
+__v7_ca17mp_proc_info:
+	.long	0x410fc0e0
+	.long	0xff0ffff0
+	__v7_proc __v7_ca17mp_setup
+	.size	__v7_ca17mp_proc_info, . - __v7_ca17mp_proc_info
+
+	/*
+	 * Qualcomm Inc. Krait processors.
+	 */
+	.type	__krait_proc_info, #object
+__krait_proc_info:
+	.long	0x510f0400		@ Required ID value
+	.long	0xff0ffc00		@ Mask for ID
+	/*
+	 * Some Krait processors don't indicate support for SDIV and UDIV
+	 * instructions in the ARM instruction set, even though they actually
+	 * do support them.
+	 */
+	__v7_proc __v7_setup, hwcaps = HWCAP_IDIV
+	.size	__krait_proc_info, . - __krait_proc_info
+
+	/*
 	 * Match any ARMv7 processor core.
 	 */
 	.type	__v7_proc_info, #object
diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S
new file mode 100644
index 00000000000..1ca37c72f12
--- /dev/null
+++ b/arch/arm/mm/proc-v7m.S
@@ -0,0 +1,159 @@
+/*
+ *  linux/arch/arm/mm/proc-v7m.S
+ *
+ *  Copyright (C) 2008 ARM Ltd.
+ *  Copyright (C) 2001 Deep Blue Solutions Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This is the "shell" of the ARMv7-M processor support.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/v7m.h>
+#include "proc-macros.S"
+
+ENTRY(cpu_v7m_proc_init)
+	mov	pc, lr
+ENDPROC(cpu_v7m_proc_init)
+
+ENTRY(cpu_v7m_proc_fin)
+	mov	pc, lr
+ENDPROC(cpu_v7m_proc_fin)
+
+/*
+ *	cpu_v7m_reset(loc)
+ *
+ *	Perform a soft reset of the system.  Put the CPU into the
+ *	same state as it would be if it had been reset, and branch
+ *	to what would be the reset vector.
+ *
+ *	- loc   - location to jump to for soft reset
+ */
+	.align	5
+ENTRY(cpu_v7m_reset)
+	mov	pc, r0
+ENDPROC(cpu_v7m_reset)
+
+/*
+ *	cpu_v7m_do_idle()
+ *
+ *	Idle the processor (eg, wait for interrupt).
+ *
+ *	IRQs are already disabled.
+ */
+ENTRY(cpu_v7m_do_idle)
+	wfi
+	mov	pc, lr
+ENDPROC(cpu_v7m_do_idle)
+
+ENTRY(cpu_v7m_dcache_clean_area)
+	mov	pc, lr
+ENDPROC(cpu_v7m_dcache_clean_area)
+
+/*
+ * There is no MMU, so here is nothing to do.
+ */
+ENTRY(cpu_v7m_switch_mm)
+	mov	pc, lr
+ENDPROC(cpu_v7m_switch_mm)
+
+.globl	cpu_v7m_suspend_size
+.equ	cpu_v7m_suspend_size, 0
+
+#ifdef CONFIG_ARM_CPU_SUSPEND
+ENTRY(cpu_v7m_do_suspend)
+	mov	pc, lr
+ENDPROC(cpu_v7m_do_suspend)
+
+ENTRY(cpu_v7m_do_resume)
+	mov	pc, lr
+ENDPROC(cpu_v7m_do_resume)
+#endif
+
+	.section ".text.init", #alloc, #execinstr
+
+/*
+ *	__v7m_setup
+ *
+ *	This should be able to cover all ARMv7-M cores.
+ */
+__v7m_setup:
+	@ Configure the vector table base address
+	ldr	r0, =BASEADDR_V7M_SCB
+	ldr	r12, =vector_table
+	str	r12, [r0, V7M_SCB_VTOR]
+
+	@ enable UsageFault, BusFault and MemManage fault.
+	ldr	r5, [r0, #V7M_SCB_SHCSR]
+	orr	r5, #(V7M_SCB_SHCSR_USGFAULTENA | V7M_SCB_SHCSR_BUSFAULTENA | V7M_SCB_SHCSR_MEMFAULTENA)
+	str	r5, [r0, #V7M_SCB_SHCSR]
+
+	@ Lower the priority of the SVC and PendSV exceptions
+	mov	r5, #0x80000000
+	str	r5, [r0, V7M_SCB_SHPR2]	@ set SVC priority
+	mov	r5, #0x00800000
+	str	r5, [r0, V7M_SCB_SHPR3]	@ set PendSV priority
+
+	@ SVC to run the kernel in this mode
+	adr	r1, BSYM(1f)
+	ldr	r5, [r12, #11 * 4]	@ read the SVC vector entry
+	str	r1, [r12, #11 * 4]	@ write the temporary SVC vector entry
+	mov	r6, lr			@ save LR
+	mov	r7, sp			@ save SP
+	ldr	sp, =__v7m_setup_stack_top
+	cpsie	i
+	svc	#0
+1:	cpsid	i
+	str	r5, [r12, #11 * 4]	@ restore the original SVC vector entry
+	mov	lr, r6			@ restore LR
+	mov	sp, r7			@ restore SP
+
+	@ Special-purpose control register
+	mov	r1, #1
+	msr	control, r1		@ Thread mode has unpriviledged access
+
+	@ Configure the System Control Register to ensure 8-byte stack alignment
+	@ Note the STKALIGN bit is either RW or RAO.
+	ldr	r12, [r0, V7M_SCB_CCR]	@ system control register
+	orr	r12, #V7M_SCB_CCR_STKALIGN
+	str	r12, [r0, V7M_SCB_CCR]
+	mov	pc, lr
+ENDPROC(__v7m_setup)
+
+	.align 2
+__v7m_setup_stack:
+	.space	4 * 8				@ 8 registers
+__v7m_setup_stack_top:
+
+	define_processor_functions v7m, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1
+
+	.section ".rodata"
+	string cpu_arch_name, "armv7m"
+	string cpu_elf_name "v7m"
+	string cpu_v7m_name "ARMv7-M"
+
+	.section ".proc.info.init", #alloc, #execinstr
+
+	/*
+	 * Match any ARMv7-M processor core.
+	 */
+	.type	__v7m_proc_info, #object
+__v7m_proc_info:
+	.long	0x000f0000		@ Required ID value
+	.long	0x000f0000		@ Mask for ID
+	.long   0			@ proc_info_list.__cpu_mm_mmu_flags
+	.long   0			@ proc_info_list.__cpu_io_mmu_flags
+	b	__v7m_setup		@ proc_info_list.__cpu_flush
+	.long	cpu_arch_name
+	.long	cpu_elf_name
+	.long	HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT
+	.long	cpu_v7m_name
+	.long	v7m_processor_functions	@ proc_info_list.proc
+	.long	0			@ proc_info_list.tlb
+	.long	0			@ proc_info_list.user
+	.long	nop_cache_fns		@ proc_info_list.cache
+	.size	__v7m_proc_info, . - __v7m_proc_info
+
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index eb93d6487f3..dc164589004 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -413,7 +413,7 @@ ENTRY(cpu_xsc3_set_pte_ext)
 
 .globl	cpu_xsc3_suspend_size
 .equ	cpu_xsc3_suspend_size, 4 * 6
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_xsc3_do_suspend)
 	stmfd	sp!, {r4 - r9, lr}
 	mrc	p14, 0, r4, c6, c0, 0	@ clock configuration, for turbo mode
@@ -446,8 +446,6 @@ ENTRY(cpu_xsc3_do_resume)
 ENDPROC(cpu_xsc3_do_resume)
 #endif
 
-	__CPUINIT
-
 	.type	__xsc3_setup, #function
 __xsc3_setup:
 	mov	r0, #PSR_F_BIT|PSR_I_BIT|SVC_MODE
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 25510361aa1..d19b1cfcad9 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -528,7 +528,7 @@ ENTRY(cpu_xscale_set_pte_ext)
 
 .globl	cpu_xscale_suspend_size
 .equ	cpu_xscale_suspend_size, 4 * 6
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_xscale_do_suspend)
 	stmfd	sp!, {r4 - r9, lr}
 	mrc	p14, 0, r4, c6, c0, 0	@ clock configuration, for turbo mode
@@ -558,8 +558,6 @@ ENTRY(cpu_xscale_do_resume)
 ENDPROC(cpu_xscale_do_resume)
 #endif
 
-	__CPUINIT
-
 	.type	__xscale_setup, #function
 __xscale_setup:
 	mcr	p15, 0, ip, c7, c7, 0		@ invalidate I, D caches & BTB
diff --git a/arch/arm/mm/tcm.h b/arch/arm/mm/tcm.h
new file mode 100644
index 00000000000..8015ad434a4
--- /dev/null
+++ b/arch/arm/mm/tcm.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2008-2009 ST-Ericsson AB
+ * License terms: GNU General Public License (GPL) version 2
+ * TCM memory handling for ARM systems
+ *
+ * Author: Linus Walleij <linus.walleij@stericsson.com>
+ * Author: Rickard Andersson <rickard.andersson@stericsson.com>
+ */
+
+#ifdef CONFIG_HAVE_TCM
+void __init tcm_init(void);
+#else
+/* No TCM support, just blank inlines to be optimized out */
+inline void tcm_init(void)
+{
+}
+#endif
diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S
index ea94765acf9..355308767ba 100644
--- a/arch/arm/mm/tlb-v7.S
+++ b/arch/arm/mm/tlb-v7.S
@@ -35,7 +35,7 @@
 ENTRY(v7wbi_flush_user_tlb_range)
 	vma_vm_mm r3, r2			@ get vma->vm_mm
 	mmid	r3, r3				@ get vm_mm->context.id
-	dsb
+	dsb	ish
 	mov	r0, r0, lsr #PAGE_SHIFT		@ align address
 	mov	r1, r1, lsr #PAGE_SHIFT
 	asid	r3, r3				@ mask ASID
@@ -56,7 +56,7 @@ ENTRY(v7wbi_flush_user_tlb_range)
 	add	r0, r0, #PAGE_SZ
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	ish
 	mov	pc, lr
 ENDPROC(v7wbi_flush_user_tlb_range)
 
@@ -69,7 +69,7 @@ ENDPROC(v7wbi_flush_user_tlb_range)
  *	- end   - end address (exclusive, may not be aligned)
  */
 ENTRY(v7wbi_flush_kern_tlb_range)
-	dsb
+	dsb	ish
 	mov	r0, r0, lsr #PAGE_SHIFT		@ align address
 	mov	r1, r1, lsr #PAGE_SHIFT
 	mov	r0, r0, lsl #PAGE_SHIFT
@@ -84,7 +84,7 @@ ENTRY(v7wbi_flush_kern_tlb_range)
 	add	r0, r0, #PAGE_SZ
 	cmp	r0, r1
 	blo	1b
-	dsb
+	dsb	ish
 	isb
 	mov	pc, lr
 ENDPROC(v7wbi_flush_kern_tlb_range)
diff --git a/arch/arm/mm/vmregion.c b/arch/arm/mm/vmregion.c
deleted file mode 100644
index a631016e1f8..00000000000
--- a/arch/arm/mm/vmregion.c
+++ /dev/null
@@ -1,205 +0,0 @@
-#include <linux/fs.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-
-#include "vmregion.h"
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vmregion	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vmregion_alloc with an appropriate
- * struct vmregion head (eg):
- *
- *  struct vmregion vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vmregion_alloc().
- */
-
-struct arm_vmregion *
-arm_vmregion_alloc(struct arm_vmregion_head *head, size_t align,
-		   size_t size, gfp_t gfp, const void *caller)
-{
-	unsigned long start = head->vm_start, addr = head->vm_end;
-	unsigned long flags;
-	struct arm_vmregion *c, *new;
-
-	if (head->vm_end - head->vm_start < size) {
-		printk(KERN_WARNING "%s: allocation too big (requested %#x)\n",
-			__func__, size);
-		goto out;
-	}
-
-	new = kmalloc(sizeof(struct arm_vmregion), gfp);
-	if (!new)
-		goto out;
-
-	new->caller = caller;
-
-	spin_lock_irqsave(&head->vm_lock, flags);
-
-	addr = rounddown(addr - size, align);
-	list_for_each_entry_reverse(c, &head->vm_list, vm_list) {
-		if (addr >= c->vm_end)
-			goto found;
-		addr = rounddown(c->vm_start - size, align);
-		if (addr < start)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry after the one we found.
-	 */
-	list_add(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-	new->vm_active = 1;
-
-	spin_unlock_irqrestore(&head->vm_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&head->vm_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct arm_vmregion *__arm_vmregion_find(struct arm_vmregion_head *head, unsigned long addr)
-{
-	struct arm_vmregion *c;
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_active && c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
-struct arm_vmregion *arm_vmregion_find(struct arm_vmregion_head *head, unsigned long addr)
-{
-	struct arm_vmregion *c;
-	unsigned long flags;
-
-	spin_lock_irqsave(&head->vm_lock, flags);
-	c = __arm_vmregion_find(head, addr);
-	spin_unlock_irqrestore(&head->vm_lock, flags);
-	return c;
-}
-
-struct arm_vmregion *arm_vmregion_find_remove(struct arm_vmregion_head *head, unsigned long addr)
-{
-	struct arm_vmregion *c;
-	unsigned long flags;
-
-	spin_lock_irqsave(&head->vm_lock, flags);
-	c = __arm_vmregion_find(head, addr);
-	if (c)
-		c->vm_active = 0;
-	spin_unlock_irqrestore(&head->vm_lock, flags);
-	return c;
-}
-
-void arm_vmregion_free(struct arm_vmregion_head *head, struct arm_vmregion *c)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&head->vm_lock, flags);
-	list_del(&c->vm_list);
-	spin_unlock_irqrestore(&head->vm_lock, flags);
-
-	kfree(c);
-}
-
-#ifdef CONFIG_PROC_FS
-static int arm_vmregion_show(struct seq_file *m, void *p)
-{
-	struct arm_vmregion *c = list_entry(p, struct arm_vmregion, vm_list);
-
-	seq_printf(m, "0x%08lx-0x%08lx %7lu", c->vm_start, c->vm_end,
-		c->vm_end - c->vm_start);
-	if (c->caller)
-		seq_printf(m, " %pS", (void *)c->caller);
-	seq_putc(m, '\n');
-	return 0;
-}
-
-static void *arm_vmregion_start(struct seq_file *m, loff_t *pos)
-{
-	struct arm_vmregion_head *h = m->private;
-	spin_lock_irq(&h->vm_lock);
-	return seq_list_start(&h->vm_list, *pos);
-}
-
-static void *arm_vmregion_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	struct arm_vmregion_head *h = m->private;
-	return seq_list_next(p, &h->vm_list, pos);
-}
-
-static void arm_vmregion_stop(struct seq_file *m, void *p)
-{
-	struct arm_vmregion_head *h = m->private;
-	spin_unlock_irq(&h->vm_lock);
-}
-
-static const struct seq_operations arm_vmregion_ops = {
-	.start	= arm_vmregion_start,
-	.stop	= arm_vmregion_stop,
-	.next	= arm_vmregion_next,
-	.show	= arm_vmregion_show,
-};
-
-static int arm_vmregion_open(struct inode *inode, struct file *file)
-{
-	struct arm_vmregion_head *h = PDE(inode)->data;
-	int ret = seq_open(file, &arm_vmregion_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = h;
-	}
-	return ret;
-}
-
-static const struct file_operations arm_vmregion_fops = {
-	.open	= arm_vmregion_open,
-	.read	= seq_read,
-	.llseek	= seq_lseek,
-	.release = seq_release,
-};
-
-int arm_vmregion_create_proc(const char *path, struct arm_vmregion_head *h)
-{
-	proc_create_data(path, S_IRUSR, NULL, &arm_vmregion_fops, h);
-	return 0;
-}
-#else
-int arm_vmregion_create_proc(const char *path, struct arm_vmregion_head *h)
-{
-	return 0;
-}
-#endif
diff --git a/arch/arm/mm/vmregion.h b/arch/arm/mm/vmregion.h
deleted file mode 100644
index 0f5a5f2a2c7..00000000000
--- a/arch/arm/mm/vmregion.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef VMREGION_H
-#define VMREGION_H
-
-#include <linux/spinlock.h>
-#include <linux/list.h>
-
-struct page;
-
-struct arm_vmregion_head {
-	spinlock_t		vm_lock;
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-};
-
-struct arm_vmregion {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-	int			vm_active;
-	const void		*caller;
-};
-
-struct arm_vmregion *arm_vmregion_alloc(struct arm_vmregion_head *, size_t, size_t, gfp_t, const void *);
-struct arm_vmregion *arm_vmregion_find(struct arm_vmregion_head *, unsigned long);
-struct arm_vmregion *arm_vmregion_find_remove(struct arm_vmregion_head *, unsigned long);
-void arm_vmregion_free(struct arm_vmregion_head *, struct arm_vmregion *);
-
-int arm_vmregion_create_proc(const char *, struct arm_vmregion_head *);
-
-#endif