9 files changed, 781 insertions, 691 deletions
diff --git a/arch/arc/mm/Makefile b/arch/arc/mm/Makefile
index 168dc146a8f..ac95cc239c1 100644
--- a/arch/arc/mm/Makefile
+++ b/arch/arc/mm/Makefile
@@ -7,4 +7,4 @@
 #
 
 obj-y	:= extable.o ioremap.o dma.o fault.o init.o
-obj-y	+= tlb.o tlbex.o cache_arc700.o
+obj-y	+= tlb.o tlbex.o cache_arc700.o mmap.o
diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c
index 88d617d8423..353b202c37c 100644
--- a/arch/arc/mm/cache_arc700.c
+++ b/arch/arc/mm/cache_arc700.c
@@ -68,24 +68,14 @@
 #include <linux/mmu_context.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/pagemap.h>
 #include <asm/cacheflush.h>
 #include <asm/cachectl.h>
 #include <asm/setup.h>
 
-
-#ifdef CONFIG_ARC_HAS_ICACHE
-static void __ic_line_inv_no_alias(unsigned long, int);
-static void __ic_line_inv_2_alias(unsigned long, int);
-static void __ic_line_inv_4_alias(unsigned long, int);
-
-/* Holds the ptr to flush routine, dependign on size due to aliasing issues */
-static void (*___flush_icache_rtn) (unsigned long, int);
-#endif
-
-char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len)
+char *arc_cache_mumbojumbo(int c, char *buf, int len)
 {
 	int n = 0;
-	unsigned int c = smp_processor_id();
 
 #define PR_CACHE(p, enb, str)						\
 {									\
@@ -98,8 +88,10 @@ char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len)
 			enb ?  "" : "DISABLED (kernel-build)");		\
 }
 
-	PR_CACHE(&cpuinfo_arc700[c].icache, __CONFIG_ARC_HAS_ICACHE, "I-Cache");
-	PR_CACHE(&cpuinfo_arc700[c].dcache, __CONFIG_ARC_HAS_DCACHE, "D-Cache");
+	PR_CACHE(&cpuinfo_arc700[c].icache, IS_ENABLED(CONFIG_ARC_HAS_ICACHE),
+			"I-Cache");
+	PR_CACHE(&cpuinfo_arc700[c].dcache, IS_ENABLED(CONFIG_ARC_HAS_DCACHE),
+			"D-Cache");
 
 	return buf;
 }
@@ -109,17 +101,23 @@ char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len)
  * the cpuinfo structure for later use.
  * No Validation done here, simply read/convert the BCRs
  */
-void __init read_decode_cache_bcr(void)
+void read_decode_cache_bcr(void)
 {
-	struct bcr_cache ibcr, dbcr;
 	struct cpuinfo_arc_cache *p_ic, *p_dc;
 	unsigned int cpu = smp_processor_id();
+	struct bcr_cache {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int pad:12, line_len:4, sz:4, config:4, ver:8;
+#else
+		unsigned int ver:8, config:4, sz:4, line_len:4, pad:12;
+#endif
+	} ibcr, dbcr;
 
 	p_ic = &cpuinfo_arc700[cpu].icache;
 	READ_BCR(ARC_REG_IC_BCR, ibcr);
 
-	if (ibcr.config == 0x3)
-		p_ic->assoc = 2;
+	BUG_ON(ibcr.config != 3);
+	p_ic->assoc = 2;		/* Fixed to 2w set assoc */
 	p_ic->line_len = 8 << ibcr.line_len;
 	p_ic->sz = 0x200 << ibcr.sz;
 	p_ic->ver = ibcr.ver;
@@ -127,8 +125,8 @@ void __init read_decode_cache_bcr(void)
 	p_dc = &cpuinfo_arc700[cpu].dcache;
 	READ_BCR(ARC_REG_DC_BCR, dbcr);
 
-	if (dbcr.config == 0x2)
-		p_dc->assoc = 4;
+	BUG_ON(dbcr.config != 2);
+	p_dc->assoc = 4;		/* Fixed to 4w set assoc */
 	p_dc->line_len = 16 << dbcr.line_len;
 	p_dc->sz = 0x200 << dbcr.sz;
 	p_dc->ver = dbcr.ver;
@@ -141,112 +139,116 @@ void __init read_decode_cache_bcr(void)
  * 3. Enable the Caches, setup default flush mode for D-Cache
  * 3. Calculate the SHMLBA used by user space
  */
-void __init arc_cache_init(void)
+void arc_cache_init(void)
 {
-	unsigned int temp;
-	unsigned int cpu = smp_processor_id();
-	struct cpuinfo_arc_cache *ic = &cpuinfo_arc700[cpu].icache;
-	struct cpuinfo_arc_cache *dc = &cpuinfo_arc700[cpu].dcache;
-	int way_pg_ratio = way_pg_ratio;
+	unsigned int __maybe_unused cpu = smp_processor_id();
+	struct cpuinfo_arc_cache __maybe_unused *ic, __maybe_unused *dc;
 	char str[256];
 
 	printk(arc_cache_mumbojumbo(0, str, sizeof(str)));
 
-	if (!ic->ver)
-		goto chk_dc;
-
 #ifdef CONFIG_ARC_HAS_ICACHE
-	/* 1. Confirm some of I-cache params which Linux assumes */
-	if ((ic->assoc != ARC_ICACHE_WAYS) ||
-	    (ic->line_len != ARC_ICACHE_LINE_LEN)) {
-		panic("Cache H/W doesn't match kernel Config");
-	}
-#if (CONFIG_ARC_MMU_VER > 2)
-	if (ic->ver != 3) {
-		if (running_on_hw)
-			panic("Cache ver doesn't match MMU ver\n");
-
-		/* For ISS - suggest the toggles to use */
-		pr_err("Use -prop=icache_version=3,-prop=dcache_version=3\n");
-
+	ic = &cpuinfo_arc700[cpu].icache;
+	if (ic->ver) {
+		if (ic->line_len != L1_CACHE_BYTES)
+			panic("ICache line [%d] != kernel Config [%d]",
+			      ic->line_len, L1_CACHE_BYTES);
+
+		if (ic->ver != CONFIG_ARC_MMU_VER)
+			panic("Cache ver [%d] doesn't match MMU ver [%d]\n",
+			      ic->ver, CONFIG_ARC_MMU_VER);
 	}
 #endif
 
-	/*
-	 * if Cache way size is <= page size then no aliasing exhibited
-	 * otherwise ratio determines num of aliases.
-	 * e.g. 32K I$, 2 way set assoc, 8k pg size
-	 *       way-sz = 32k/2 = 16k
-	 *       way-pg-ratio = 16k/8k = 2, so 2 aliases possible
-	 *       (meaning 1 line could be in 2 possible locations).
-	 */
-	way_pg_ratio = ic->sz / ARC_ICACHE_WAYS / PAGE_SIZE;
-	switch (way_pg_ratio) {
-	case 0:
-	case 1:
-		___flush_icache_rtn = __ic_line_inv_no_alias;
-		break;
-	case 2:
-		___flush_icache_rtn = __ic_line_inv_2_alias;
-		break;
-	case 4:
-		___flush_icache_rtn = __ic_line_inv_4_alias;
-		break;
-	default:
-		panic("Unsupported I-Cache Sz\n");
-	}
-#endif
+#ifdef CONFIG_ARC_HAS_DCACHE
+	dc = &cpuinfo_arc700[cpu].dcache;
+	if (dc->ver) {
+		unsigned int dcache_does_alias;
 
-	/* Enable/disable I-Cache */
-	temp = read_aux_reg(ARC_REG_IC_CTRL);
+		if (dc->line_len != L1_CACHE_BYTES)
+			panic("DCache line [%d] != kernel Config [%d]",
+			      dc->line_len, L1_CACHE_BYTES);
 
-#ifdef CONFIG_ARC_HAS_ICACHE
-	temp &= ~IC_CTRL_CACHE_DISABLE;
-#else
-	temp |= IC_CTRL_CACHE_DISABLE;
+		/* check for D-Cache aliasing */
+		dcache_does_alias = (dc->sz / dc->assoc) > PAGE_SIZE;
+
+		if (dcache_does_alias && !cache_is_vipt_aliasing())
+			panic("Enable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+		else if (!dcache_does_alias && cache_is_vipt_aliasing())
+			panic("Don't need CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+	}
 #endif
+}
 
-	write_aux_reg(ARC_REG_IC_CTRL, temp);
+#define OP_INV		0x1
+#define OP_FLUSH	0x2
+#define OP_FLUSH_N_INV	0x3
+#define OP_INV_IC	0x4
 
-chk_dc:
-	if (!dc->ver)
-		return;
+/*
+ * Common Helper for Line Operations on {I,D}-Cache
+ */
+static inline void __cache_line_loop(unsigned long paddr, unsigned long vaddr,
+				     unsigned long sz, const int cacheop)
+{
+	unsigned int aux_cmd, aux_tag;
+	int num_lines;
+	const int full_page_op = __builtin_constant_p(sz) && sz == PAGE_SIZE;
 
-#ifdef CONFIG_ARC_HAS_DCACHE
-	if ((dc->assoc != ARC_DCACHE_WAYS) ||
-	    (dc->line_len != ARC_DCACHE_LINE_LEN)) {
-		panic("Cache H/W doesn't match kernel Config");
+	if (cacheop == OP_INV_IC) {
+		aux_cmd = ARC_REG_IC_IVIL;
+#if (CONFIG_ARC_MMU_VER > 2)
+		aux_tag = ARC_REG_IC_PTAG;
+#endif
 	}
-
-	/* check for D-Cache aliasing */
-	if ((dc->sz / ARC_DCACHE_WAYS) > PAGE_SIZE)
-		panic("D$ aliasing not handled right now\n");
+	else {
+		/* d$ cmd: INV (discard or wback-n-discard) OR FLUSH (wback) */
+		aux_cmd = cacheop & OP_INV ? ARC_REG_DC_IVDL : ARC_REG_DC_FLDL;
+#if (CONFIG_ARC_MMU_VER > 2)
+		aux_tag = ARC_REG_DC_PTAG;
 #endif
+	}
 
-	/* Set the default Invalidate Mode to "simpy discard dirty lines"
-	 *  as this is more frequent then flush before invalidate
-	 * Ofcourse we toggle this default behviour when desired
+	/* Ensure we properly floor/ceil the non-line aligned/sized requests
+	 * and have @paddr - aligned to cache line and integral @num_lines.
+	 * This however can be avoided for page sized since:
+	 *  -@paddr will be cache-line aligned already (being page aligned)
+	 *  -@sz will be integral multiple of line size (being page sized).
 	 */
-	temp = read_aux_reg(ARC_REG_DC_CTRL);
-	temp &= ~DC_CTRL_INV_MODE_FLUSH;
+	if (!full_page_op) {
+		sz += paddr & ~CACHE_LINE_MASK;
+		paddr &= CACHE_LINE_MASK;
+		vaddr &= CACHE_LINE_MASK;
+	}
 
-#ifdef CONFIG_ARC_HAS_DCACHE
-	/* Enable D-Cache: Clear Bit 0 */
-	write_aux_reg(ARC_REG_DC_CTRL, temp & ~IC_CTRL_CACHE_DISABLE);
+	num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);
+
+#if (CONFIG_ARC_MMU_VER <= 2)
+	/* MMUv2 and before: paddr contains stuffed vaddrs bits */
+	paddr |= (vaddr >> PAGE_SHIFT) & 0x1F;
 #else
-	/* Flush D cache */
-	write_aux_reg(ARC_REG_DC_FLSH, 0x1);
-	/* Disable D cache */
-	write_aux_reg(ARC_REG_DC_CTRL, temp | IC_CTRL_CACHE_DISABLE);
+	/* if V-P const for loop, PTAG can be written once outside loop */
+	if (full_page_op)
+		write_aux_reg(aux_tag, paddr);
 #endif
 
-	return;
+	while (num_lines-- > 0) {
+#if (CONFIG_ARC_MMU_VER > 2)
+		/* MMUv3, cache ops require paddr seperately */
+		if (!full_page_op) {
+			write_aux_reg(aux_tag, paddr);
+			paddr += L1_CACHE_BYTES;
+		}
+
+		write_aux_reg(aux_cmd, vaddr);
+		vaddr += L1_CACHE_BYTES;
+#else
+		write_aux_reg(aux_cmd, paddr);
+		paddr += L1_CACHE_BYTES;
+#endif
+	}
 }
 
-#define OP_INV		0x1
-#define OP_FLUSH	0x2
-#define OP_FLUSH_N_INV	0x3
-
 #ifdef CONFIG_ARC_HAS_DCACHE
 
 /***************************************************************
@@ -267,11 +269,9 @@ static inline void wait_for_flush(void)
  */
 static inline void __dc_entire_op(const int cacheop)
 {
-	unsigned long flags, tmp = tmp;
+	unsigned int tmp = tmp;
 	int aux;
 
-	local_irq_save(flags);
-
 	if (cacheop == OP_FLUSH_N_INV) {
 		/* Dcache provides 2 cmd: FLUSH or INV
 		 * INV inturn has sub-modes: DISCARD or FLUSH-BEFORE
@@ -295,58 +295,18 @@ static inline void __dc_entire_op(const int cacheop)
 	/* Switch back the DISCARD ONLY Invalidate mode */
 	if (cacheop == OP_FLUSH_N_INV)
 		write_aux_reg(ARC_REG_DC_CTRL, tmp & ~DC_CTRL_INV_MODE_FLUSH);
-
-	local_irq_restore(flags);
 }
 
-/*
- * Per Line Operation on D-Cache
- * Doesn't deal with type-of-op/IRQ-disabling/waiting-for-flush-to-complete
- * It's sole purpose is to help gcc generate ZOL
- */
-static inline void __dc_line_loop(unsigned long start, unsigned long sz,
-					  int aux_reg)
-{
-	int num_lines, slack;
-
-	/* Ensure we properly floor/ceil the non-line aligned/sized requests
-	 * and have @start - aligned to cache line and integral @num_lines.
-	 * This however can be avoided for page sized since:
-	 *  -@start will be cache-line aligned already (being page aligned)
-	 *  -@sz will be integral multiple of line size (being page sized).
-	 */
-	if (!(__builtin_constant_p(sz) && sz == PAGE_SIZE)) {
-		slack = start & ~DCACHE_LINE_MASK;
-		sz += slack;
-		start -= slack;
-	}
-
-	num_lines = DIV_ROUND_UP(sz, ARC_DCACHE_LINE_LEN);
-
-	while (num_lines-- > 0) {
-#if (CONFIG_ARC_MMU_VER > 2)
-		/*
-		 * Just as for I$, in MMU v3, D$ ops also require
-		 * "tag" bits in DC_PTAG, "index" bits in FLDL,IVDL ops
-		 * But we pass phy addr for both. This works since Linux
-		 * doesn't support aliasing configs for D$, yet.
-		 * Thus paddr is enough to provide both tag and index.
-		 */
-		write_aux_reg(ARC_REG_DC_PTAG, start);
-#endif
-		write_aux_reg(aux_reg, start);
-		start += ARC_DCACHE_LINE_LEN;
-	}
-}
+/* For kernel mappings cache operation: index is same as paddr */
+#define __dc_line_op_k(p, sz, op)	__dc_line_op(p, p, sz, op)
 
 /*
  * D-Cache : Per Line INV (discard or wback+discard) or FLUSH (wback)
  */
-static inline void __dc_line_op(unsigned long start, unsigned long sz,
-					const int cacheop)
+static inline void __dc_line_op(unsigned long paddr, unsigned long vaddr,
+				unsigned long sz, const int cacheop)
 {
 	unsigned long flags, tmp = tmp;
-	int aux;
 
 	local_irq_save(flags);
 
@@ -361,12 +321,7 @@ static inline void __dc_line_op(unsigned long start, unsigned long sz,
 		write_aux_reg(ARC_REG_DC_CTRL, tmp | DC_CTRL_INV_MODE_FLUSH);
 	}
 
-	if (cacheop & OP_INV)	/* Inv / flush-n-inv use same cmd reg */
-		aux = ARC_REG_DC_IVDL;
-	else
-		aux = ARC_REG_DC_FLDL;
-
-	__dc_line_loop(start, sz, aux);
+	__cache_line_loop(paddr, vaddr, sz, cacheop);
 
 	if (cacheop & OP_FLUSH)	/* flush / flush-n-inv both wait */
 		wait_for_flush();
@@ -381,7 +336,8 @@ static inline void __dc_line_op(unsigned long start, unsigned long sz,
 #else
 
 #define __dc_entire_op(cacheop)
-#define __dc_line_op(start, sz, cacheop)
+#define __dc_line_op(paddr, vaddr, sz, cacheop)
+#define __dc_line_op_k(paddr, sz, cacheop)
 
 #endif /* CONFIG_ARC_HAS_DCACHE */
 
@@ -391,75 +347,38 @@ static inline void __dc_line_op(unsigned long start, unsigned long sz,
 /*
  *		I-Cache Aliasing in ARC700 VIPT caches
  *
- * For fetching code from I$, ARC700 uses vaddr (embedded in program code)
- * to "index" into SET of cache-line and paddr from MMU to match the TAG
- * in the WAYS of SET.
- *
- * However the CDU iterface (to flush/inv) lines from software, only takes
- * paddr (to have simpler hardware interface). For simpler cases, using paddr
- * alone suffices.
- * e.g. 2-way-set-assoc, 16K I$ (8k MMU pg sz, 32b cache line size):
- *      way_sz = cache_sz / num_ways = 16k/2 = 8k
- *      num_sets = way_sz / line_sz = 8k/32 = 256 => 8 bits
- *   Ignoring the bottom 5 bits corresp to the off within a 32b cacheline,
- *   bits req for calc set-index = bits 12:5 (0 based). Since this range fits
- *   inside the bottom 13 bits of paddr, which are same for vaddr and paddr
- *   (with 8k pg sz), paddr alone can be safely used by CDU to unambigously
- *   locate a cache-line.
+ * ARC VIPT I-cache uses vaddr to index into cache and paddr to match the tag.
+ * The orig Cache Management Module "CDU" only required paddr to invalidate a
+ * certain line since it sufficed as index in Non-Aliasing VIPT cache-geometry.
+ * Infact for distinct V1,V2,P: all of {V1-P},{V2-P},{P-P} would end up fetching
+ * the exact same line.
  *
- * However for a difft sized cache, say 32k I$, above math yields need
- * for 14 bits of vaddr to locate a cache line, which can't be provided by
- * paddr, since the bit 13 (0 based) might differ between the two.
- *
- * This lack of extra bits needed for correct line addressing, defines the
- * classical problem of Cache aliasing with VIPT architectures
- * num_aliases = 1 << extra_bits
- * e.g. 2-way-set-assoc, 32K I$ with 8k MMU pg sz => 2 aliases
- *      2-way-set-assoc, 64K I$ with 8k MMU pg sz => 4 aliases
- *      2-way-set-assoc, 16K I$ with 8k MMU pg sz => NO aliases
+ * However for larger Caches (way-size > page-size) - i.e. in Aliasing config,
+ * paddr alone could not be used to correctly index the cache.
  *
  * ------------------
  * MMU v1/v2 (Fixed Page Size 8k)
  * ------------------
  * The solution was to provide CDU with these additonal vaddr bits. These
- * would be bits [x:13], x would depend on cache-geom.
+ * would be bits [x:13], x would depend on cache-geometry, 13 comes from
+ * standard page size of 8k.
  * H/w folks chose [17:13] to be a future safe range, and moreso these 5 bits
  * of vaddr could easily be "stuffed" in the paddr as bits [4:0] since the
  * orig 5 bits of paddr were anyways ignored by CDU line ops, as they
  * represent the offset within cache-line. The adv of using this "clumsy"
- * interface for additional info was no new reg was needed in CDU.
+ * interface for additional info was no new reg was needed in CDU programming
+ * model.
  *
  * 17:13 represented the max num of bits passable, actual bits needed were
  * fewer, based on the num-of-aliases possible.
  * -for 2 alias possibility, only bit 13 needed (32K cache)
  * -for 4 alias possibility, bits 14:13 needed (64K cache)
  *
- * Since vaddr was not available for all instances of I$ flush req by core
- * kernel, the only safe way (non-optimal though) was to kill all possible
- * lines which could represent an alias (even if they didnt represent one
- * in execution).
- * e.g. for 64K I$, 4 aliases possible, so we did
- *      flush start
- *      flush start | 0x01
- *      flush start | 0x2
- *      flush start | 0x3
- *
- * The penalty was invoking the operation itself, since tag match is anyways
- * paddr based, a line which didn't represent an alias would not match the
- * paddr, hence wont be killed
- *
- * Note that aliasing concerns are independent of line-sz for a given cache
- * geometry (size + set_assoc) because the extra bits required by line-sz are
- * reduced from the set calc.
- * e.g. 2-way-set-assoc, 32K I$ with 8k MMU pg sz and using math above
- *  32b line-sz: 9 bits set-index-calc, 5 bits offset-in-line => 1 extra bit
- *  64b line-sz: 8 bits set-index-calc, 6 bits offset-in-line => 1 extra bit
- *
  * ------------------
  * MMU v3
  * ------------------
- * This ver of MMU supports var page sizes (1k-16k) - Linux will support
- * 8k (default), 16k and 4k.
+ * This ver of MMU supports variable page sizes (1k-16k): although Linux will
+ * only support 8k (default), 16k and 4k.
  * However from hardware perspective, smaller page sizes aggrevate aliasing
  * meaning more vaddr bits needed to disambiguate the cache-line-op ;
  * the existing scheme of piggybacking won't work for certain configurations.
@@ -468,144 +387,44 @@ static inline void __dc_line_op(unsigned long start, unsigned long sz,
  */
 
 /***********************************************************
- * Machine specific helpers for per line I-Cache invalidate.
- * 3 routines to accpunt for 1, 2, 4 aliases possible
+ * Machine specific helper for per line I-Cache invalidate.
  */
-
-static void __ic_line_inv_no_alias(unsigned long start, int num_lines)
-{
-	while (num_lines-- > 0) {
-#if (CONFIG_ARC_MMU_VER > 2)
-		write_aux_reg(ARC_REG_IC_PTAG, start);
-#endif
-		write_aux_reg(ARC_REG_IC_IVIL, start);
-		start += ARC_ICACHE_LINE_LEN;
-	}
-}
-
-static void __ic_line_inv_2_alias(unsigned long start, int num_lines)
-{
-	while (num_lines-- > 0) {
-
-#if (CONFIG_ARC_MMU_VER > 2)
-		/*
-		 *  MMU v3, CDU prog model (for line ops) now uses a new IC_PTAG
-		 * reg to pass the "tag" bits and existing IVIL reg only looks
-		 * at bits relevant for "index" (details above)
-		 * Programming Notes:
-		 * -when writing tag to PTAG reg, bit chopping can be avoided,
-		 *  CDU ignores non-tag bits.
-		 * -Ideally "index" must be computed from vaddr, but it is not
-		 *  avail in these rtns. So to be safe, we kill the lines in all
-		 *  possible indexes corresp to num of aliases possible for
-		 *  given cache config.
-		 */
-		write_aux_reg(ARC_REG_IC_PTAG, start);
-		write_aux_reg(ARC_REG_IC_IVIL,
-				  start & ~(0x1 << PAGE_SHIFT));
-		write_aux_reg(ARC_REG_IC_IVIL, start | (0x1 << PAGE_SHIFT));
-#else
-		write_aux_reg(ARC_REG_IC_IVIL, start);
-		write_aux_reg(ARC_REG_IC_IVIL, start | 0x01);
-#endif
-		start += ARC_ICACHE_LINE_LEN;
-	}
-}
-
-static void __ic_line_inv_4_alias(unsigned long start, int num_lines)
-{
-	while (num_lines-- > 0) {
-
-#if (CONFIG_ARC_MMU_VER > 2)
-		write_aux_reg(ARC_REG_IC_PTAG, start);
-
-		write_aux_reg(ARC_REG_IC_IVIL,
-				  start & ~(0x3 << PAGE_SHIFT));
-		write_aux_reg(ARC_REG_IC_IVIL,
-				  start & ~(0x2 << PAGE_SHIFT));
-		write_aux_reg(ARC_REG_IC_IVIL,
-				  start & ~(0x1 << PAGE_SHIFT));
-		write_aux_reg(ARC_REG_IC_IVIL, start | (0x3 << PAGE_SHIFT));
-#else
-		write_aux_reg(ARC_REG_IC_IVIL, start);
-		write_aux_reg(ARC_REG_IC_IVIL, start | 0x01);
-		write_aux_reg(ARC_REG_IC_IVIL, start | 0x02);
-		write_aux_reg(ARC_REG_IC_IVIL, start | 0x03);
-#endif
-		start += ARC_ICACHE_LINE_LEN;
-	}
-}
-
-static void __ic_line_inv(unsigned long start, unsigned long sz)
+static void __ic_line_inv_vaddr_local(unsigned long paddr, unsigned long vaddr,
+				unsigned long sz)
 {
 	unsigned long flags;
-	int num_lines, slack;
-
-	/*
-	 * Ensure we properly floor/ceil the non-line aligned/sized requests
-	 * and have @start - aligned to cache line, and integral @num_lines
-	 * However page sized flushes can be compile time optimised.
-	 *  -@start will be cache-line aligned already (being page aligned)
-	 *  -@sz will be integral multiple of line size (being page sized).
-	 */
-	if (!(__builtin_constant_p(sz) && sz == PAGE_SIZE)) {
-		slack = start & ~ICACHE_LINE_MASK;
-		sz += slack;
-		start -= slack;
-	}
-
-	num_lines = DIV_ROUND_UP(sz, ARC_ICACHE_LINE_LEN);
 
 	local_irq_save(flags);
-	(*___flush_icache_rtn) (start, num_lines);
+	__cache_line_loop(paddr, vaddr, sz, OP_INV_IC);
 	local_irq_restore(flags);
 }
 
-/* Unlike routines above, having vaddr for flush op (along with paddr),
- * prevents the need to speculatively kill the lines in multiple sets
- * based on ratio of way_sz : pg_sz
- */
-static void __ic_line_inv_vaddr(unsigned long phy_start,
-					 unsigned long vaddr, unsigned long sz)
+static inline void __ic_entire_inv(void)
 {
-	unsigned long flags;
-	int num_lines, slack;
-	unsigned int addr;
-
-	slack = phy_start & ~ICACHE_LINE_MASK;
-	sz += slack;
-	phy_start -= slack;
-	num_lines = DIV_ROUND_UP(sz, ARC_ICACHE_LINE_LEN);
-
-#if (CONFIG_ARC_MMU_VER > 2)
-	vaddr &= ~ICACHE_LINE_MASK;
-	addr = phy_start;
-#else
-	/* bits 17:13 of vaddr go as bits 4:0 of paddr */
-	addr = phy_start | ((vaddr >> 13) & 0x1F);
-#endif
+	write_aux_reg(ARC_REG_IC_IVIC, 1);
+	read_aux_reg(ARC_REG_IC_CTRL);	/* blocks */
+}
 
-	local_irq_save(flags);
-	while (num_lines-- > 0) {
-#if (CONFIG_ARC_MMU_VER > 2)
-		/* tag comes from phy addr */
-		write_aux_reg(ARC_REG_IC_PTAG, addr);
+struct ic_line_inv_vaddr_ipi {
+	unsigned long paddr, vaddr;
+	int sz;
+};
 
-		/* index bits come from vaddr */
-		write_aux_reg(ARC_REG_IC_IVIL, vaddr);
-		vaddr += ARC_ICACHE_LINE_LEN;
-#else
-		/* this paddr contains vaddrs bits as needed */
-		write_aux_reg(ARC_REG_IC_IVIL, addr);
-#endif
-		addr += ARC_ICACHE_LINE_LEN;
-	}
-	local_irq_restore(flags);
+static void __ic_line_inv_vaddr_helper(void *info)
+{
+        struct ic_line_inv_vaddr_ipi *ic_inv = (struct ic_line_inv_vaddr_ipi*) info;
+        __ic_line_inv_vaddr_local(ic_inv->paddr, ic_inv->vaddr, ic_inv->sz);
 }
 
+static void __ic_line_inv_vaddr(unsigned long paddr, unsigned long vaddr,
+				unsigned long sz)
+{
+	struct ic_line_inv_vaddr_ipi ic_inv = { paddr, vaddr , sz};
+	on_each_cpu(__ic_line_inv_vaddr_helper, &ic_inv, 1);
+}
 #else
 
-#define __ic_line_inv(start, sz)
+#define __ic_entire_inv()
 #define __ic_line_inv_vaddr(pstart, vstart, sz)
 
 #endif /* CONFIG_ARC_HAS_ICACHE */
@@ -615,35 +434,72 @@ static void __ic_line_inv_vaddr(unsigned long phy_start,
  * Exported APIs
  */
 
-/* TBD: use pg_arch_1 to optimize this */
+/*
+ * Handle cache congruency of kernel and userspace mappings of page when kernel
+ * writes-to/reads-from
+ *
+ * The idea is to defer flushing of kernel mapping after a WRITE, possible if:
+ *  -dcache is NOT aliasing, hence any U/K-mappings of page are congruent
+ *  -U-mapping doesn't exist yet for page (finalised in update_mmu_cache)
+ *  -In SMP, if hardware caches are coherent
+ *
+ * There's a corollary case, where kernel READs from a userspace mapped page.
+ * If the U-mapping is not congruent to to K-mapping, former needs flushing.
+ */
 void flush_dcache_page(struct page *page)
 {
-	__dc_line_op((unsigned long)page_address(page), PAGE_SIZE, OP_FLUSH);
+	struct address_space *mapping;
+
+	if (!cache_is_vipt_aliasing()) {
+		clear_bit(PG_dc_clean, &page->flags);
+		return;
+	}
+
+	/* don't handle anon pages here */
+	mapping = page_mapping(page);
+	if (!mapping)
+		return;
+
+	/*
+	 * pagecache page, file not yet mapped to userspace
+	 * Make a note that K-mapping is dirty
+	 */
+	if (!mapping_mapped(mapping)) {
+		clear_bit(PG_dc_clean, &page->flags);
+	} else if (page_mapped(page)) {
+
+		/* kernel reading from page with U-mapping */
+		void *paddr = page_address(page);
+		unsigned long vaddr = page->index << PAGE_CACHE_SHIFT;
+
+		if (addr_not_cache_congruent(paddr, vaddr))
+			__flush_dcache_page(paddr, vaddr);
+	}
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
 
 void dma_cache_wback_inv(unsigned long start, unsigned long sz)
 {
-	__dc_line_op(start, sz, OP_FLUSH_N_INV);
+	__dc_line_op_k(start, sz, OP_FLUSH_N_INV);
 }
 EXPORT_SYMBOL(dma_cache_wback_inv);
 
 void dma_cache_inv(unsigned long start, unsigned long sz)
 {
-	__dc_line_op(start, sz, OP_INV);
+	__dc_line_op_k(start, sz, OP_INV);
 }
 EXPORT_SYMBOL(dma_cache_inv);
 
 void dma_cache_wback(unsigned long start, unsigned long sz)
 {
-	__dc_line_op(start, sz, OP_FLUSH);
+	__dc_line_op_k(start, sz, OP_FLUSH);
 }
 EXPORT_SYMBOL(dma_cache_wback);
 
 /*
- * This is API for making I/D Caches consistent when modifying code
- * (loadable modules, kprobes,  etc)
+ * This is API for making I/D Caches consistent when modifying
+ * kernel code (loadable modules, kprobes, kgdb...)
  * This is called on insmod, with kernel virtual address for CODE of
  * the module. ARC cache maintenance ops require PHY address thus we
  * need to convert vmalloc addr to PHY addr
@@ -652,7 +508,6 @@ void flush_icache_range(unsigned long kstart, unsigned long kend)
 {
 	unsigned int tot_sz, off, sz;
 	unsigned long phy, pfn;
-	unsigned long flags;
 
 	/* printk("Kernel Cache Cohenercy: %lx to %lx\n",kstart, kend); */
 
@@ -673,8 +528,13 @@ void flush_icache_range(unsigned long kstart, unsigned long kend)
 
 	/* Case: Kernel Phy addr (0x8000_0000 onwards) */
 	if (likely(kstart > PAGE_OFFSET)) {
-		__ic_line_inv(kstart, kend - kstart);
-		__dc_line_op(kstart, kend - kstart, OP_FLUSH);
+		/*
+		 * The 2nd arg despite being paddr will be used to index icache
+		 * This is OK since no alternate virtual mappings will exist
+		 * given the callers for this case: kprobe/kgdb in built-in
+		 * kernel code only.
+		 */
+		__sync_icache_dcache(kstart, kstart, kend - kstart);
 		return;
 	}
 
@@ -692,70 +552,144 @@ void flush_icache_range(unsigned long kstart, unsigned long kend)
 		pfn = vmalloc_to_pfn((void *)kstart);
 		phy = (pfn << PAGE_SHIFT) + off;
 		sz = min_t(unsigned int, tot_sz, PAGE_SIZE - off);
-		local_irq_save(flags);
-		__dc_line_op(phy, sz, OP_FLUSH);
-		__ic_line_inv(phy, sz);
-		local_irq_restore(flags);
+		__sync_icache_dcache(phy, kstart, sz);
 		kstart += sz;
 		tot_sz -= sz;
 	}
 }
 
 /*
- * Optimised ver of flush_icache_range() with spec callers: ptrace/signals
- * where vaddr is also available. This allows passing both vaddr and paddr
- * bits to CDU for cache flush, short-circuting the current pessimistic algo
- * which kills all possible aliases.
- * An added adv of knowing that vaddr is user-vaddr avoids various checks
- * and handling for k-vaddr, k-paddr as done in orig ver above
+ * General purpose helper to make I and D cache lines consistent.
+ * @paddr is phy addr of region
+ * @vaddr is typically user vaddr (breakpoint) or kernel vaddr (vmalloc)
+ *    However in one instance, when called by kprobe (for a breakpt in
+ *    builtin kernel code) @vaddr will be paddr only, meaning CDU operation will
+ *    use a paddr to index the cache (despite VIPT). This is fine since since a
+ *    builtin kernel page will not have any virtual mappings.
+ *    kprobe on loadable module will be kernel vaddr.
  */
-void flush_icache_range_vaddr(unsigned long paddr, unsigned long u_vaddr,
-			      int len)
+void __sync_icache_dcache(unsigned long paddr, unsigned long vaddr, int len)
 {
-	__ic_line_inv_vaddr(paddr, u_vaddr, len);
-	__dc_line_op(paddr, len, OP_FLUSH);
+	__dc_line_op(paddr, vaddr, len, OP_FLUSH_N_INV);
+	__ic_line_inv_vaddr(paddr, vaddr, len);
+}
+
+/* wrapper to compile time eliminate alignment checks in flush loop */
+void __inv_icache_page(unsigned long paddr, unsigned long vaddr)
+{
+	__ic_line_inv_vaddr(paddr, vaddr, PAGE_SIZE);
 }
 
 /*
- * XXX: This also needs to be optim using pg_arch_1
- * This is called when a page-cache page is about to be mapped into a
- * user process' address space.  It offers an opportunity for a
- * port to ensure d-cache/i-cache coherency if necessary.
+ * wrapper to clearout kernel or userspace mappings of a page
+ * For kernel mappings @vaddr == @paddr
  */
-void flush_icache_page(struct vm_area_struct *vma, struct page *page)
+void ___flush_dcache_page(unsigned long paddr, unsigned long vaddr)
 {
-	if (!(vma->vm_flags & VM_EXEC))
-		return;
-
-	__ic_line_inv((unsigned long)page_address(page), PAGE_SIZE);
+	__dc_line_op(paddr, vaddr & PAGE_MASK, PAGE_SIZE, OP_FLUSH_N_INV);
 }
 
-void flush_icache_all(void)
+noinline void flush_cache_all(void)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 
-	write_aux_reg(ARC_REG_IC_IVIC, 1);
+	__ic_entire_inv();
+	__dc_entire_op(OP_FLUSH_N_INV);
 
-	/* lr will not complete till the icache inv operation is not over */
-	read_aux_reg(ARC_REG_IC_CTRL);
 	local_irq_restore(flags);
+
 }
 
-noinline void flush_cache_all(void)
+#ifdef CONFIG_ARC_CACHE_VIPT_ALIASING
+
+void flush_cache_mm(struct mm_struct *mm)
 {
-	unsigned long flags;
+	flush_cache_all();
+}
 
-	local_irq_save(flags);
+void flush_cache_page(struct vm_area_struct *vma, unsigned long u_vaddr,
+		      unsigned long pfn)
+{
+	unsigned int paddr = pfn << PAGE_SHIFT;
 
-	flush_icache_all();
-	__dc_entire_op(OP_FLUSH_N_INV);
+	u_vaddr &= PAGE_MASK;
 
-	local_irq_restore(flags);
+	___flush_dcache_page(paddr, u_vaddr);
+
+	if (vma->vm_flags & VM_EXEC)
+		__inv_icache_page(paddr, u_vaddr);
+}
+
+void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
+		       unsigned long end)
+{
+	flush_cache_all();
+}
+
+void flush_anon_page(struct vm_area_struct *vma, struct page *page,
+		     unsigned long u_vaddr)
+{
+	/* TBD: do we really need to clear the kernel mapping */
+	__flush_dcache_page(page_address(page), u_vaddr);
+	__flush_dcache_page(page_address(page), page_address(page));
 
 }
 
+#endif
+
+void copy_user_highpage(struct page *to, struct page *from,
+	unsigned long u_vaddr, struct vm_area_struct *vma)
+{
+	void *kfrom = page_address(from);
+	void *kto = page_address(to);
+	int clean_src_k_mappings = 0;
+
+	/*
+	 * If SRC page was already mapped in userspace AND it's U-mapping is
+	 * not congruent with K-mapping, sync former to physical page so that
+	 * K-mapping in memcpy below, sees the right data
+	 *
+	 * Note that while @u_vaddr refers to DST page's userspace vaddr, it is
+	 * equally valid for SRC page as well
+	 */
+	if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+		__flush_dcache_page(kfrom, u_vaddr);
+		clean_src_k_mappings = 1;
+	}
+
+	copy_page(kto, kfrom);
+
+	/*
+	 * Mark DST page K-mapping as dirty for a later finalization by
+	 * update_mmu_cache(). Although the finalization could have been done
+	 * here as well (given that both vaddr/paddr are available).
+	 * But update_mmu_cache() already has code to do that for other
+	 * non copied user pages (e.g. read faults which wire in pagecache page
+	 * directly).
+	 */
+	clear_bit(PG_dc_clean, &to->flags);
+
+	/*
+	 * if SRC was already usermapped and non-congruent to kernel mapping
+	 * sync the kernel mapping back to physical page
+	 */
+	if (clean_src_k_mappings) {
+		__flush_dcache_page(kfrom, kfrom);
+		set_bit(PG_dc_clean, &from->flags);
+	} else {
+		clear_bit(PG_dc_clean, &from->flags);
+	}
+}
+
+void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
+{
+	clear_page(to);
+	clear_bit(PG_dc_clean, &page->flags);
+}
+
+
 /**********************************************************************
  * Explicit Cache flush request from user space via syscall
  * Needed for JITs which generate code on the fly
diff --git a/arch/arc/mm/extable.c b/arch/arc/mm/extable.c
index 014172ba843..aa652e28132 100644
--- a/arch/arc/mm/extable.c
+++ b/arch/arc/mm/extable.c
@@ -27,7 +27,7 @@ int fixup_exception(struct pt_regs *regs)
 
 #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 
-long arc_copy_from_user_noinline(void *to, const void __user * from,
+long arc_copy_from_user_noinline(void *to, const void __user *from,
 		unsigned long n)
 {
 	return __arc_copy_from_user(to, from, n);
@@ -48,7 +48,7 @@ unsigned long arc_clear_user_noinline(void __user *to,
 }
 EXPORT_SYMBOL(arc_clear_user_noinline);
 
-long arc_strncpy_from_user_noinline (char *dst, const char __user *src,
+long arc_strncpy_from_user_noinline(char *dst, const char __user *src,
 		long count)
 {
 	return __arc_strncpy_from_user(dst, src, count);
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index af55aab803d..9c69552350c 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -12,12 +12,12 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
-#include <linux/version.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
 #include <asm/pgalloc.h>
+#include <asm/mmu.h>
 
-static int handle_vmalloc_fault(struct mm_struct *mm, unsigned long address)
+static int handle_vmalloc_fault(unsigned long address)
 {
 	/*
 	 * Synchronize this task's top level page-table
@@ -27,7 +27,7 @@ static int handle_vmalloc_fault(struct mm_struct *mm, unsigned long address)
 	pud_t *pud, *pud_k;
 	pmd_t *pmd, *pmd_k;
 
-	pgd = pgd_offset_fast(mm, address);
+	pgd = pgd_offset_fast(current->active_mm, address);
 	pgd_k = pgd_offset_k(address);
 
 	if (!pgd_present(*pgd_k))
@@ -52,16 +52,15 @@ bad_area:
 	return 1;
 }
 
-void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
-		   unsigned long cause_code)
+void do_page_fault(unsigned long address, struct pt_regs *regs)
 {
 	struct vm_area_struct *vma = NULL;
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	siginfo_t info;
 	int fault, ret;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	int write = regs->ecr_cause & ECR_C_PROTV_STORE;  /* ST/EX */
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
@@ -73,7 +72,7 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
 	 * nothing more.
 	 */
 	if (address >= VMALLOC_START && address <= VMALLOC_END) {
-		ret = handle_vmalloc_fault(mm, address);
+		ret = handle_vmalloc_fault(address);
 		if (unlikely(ret))
 			goto bad_area_nosemaphore;
 		else
@@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -110,18 +111,19 @@ good_area:
 
 	/* Handle protection violation, execute on heap or stack */
 
-	if (cause_code == ((ECR_V_PROTV << 16) | ECR_C_PROTV_INST_FETCH))
+	if ((regs->ecr_vec == ECR_V_PROTV) &&
+	    (regs->ecr_cause == ECR_C_PROTV_INST_FETCH))
 		goto bad_area;
 
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
 
-survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -177,7 +179,6 @@ bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 	if (user_mode(regs)) {
 		tsk->thread.fault_address = address;
-		tsk->thread.cause_code = cause_code;
 		info.si_signo = SIGSEGV;
 		info.si_errno = 0;
 		/* info.si_code has been set above */
@@ -198,17 +199,15 @@ no_context:
 	if (fixup_exception(regs))
 		return;
 
-	die("Oops", regs, address, cause_code);
+	die("Oops", regs, address);
 
 out_of_memory:
-	if (is_global_init(tsk)) {
-		yield();
-		goto survive;
-	}
 	up_read(&mm->mmap_sem);
 
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);	/* This will never return */
+	if (user_mode(regs)) {
+		pagefault_out_of_memory();
+		return;
+	}
 
 	goto no_context;
 
@@ -219,7 +218,6 @@ do_sigbus:
 		goto no_context;
 
 	tsk->thread.fault_address = address;
-	tsk->thread.cause_code = cause_code;
 	info.si_signo = SIGBUS;
 	info.si_errno = 0;
 	info.si_code = BUS_ADRERR;
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index caf797de23f..523412369f7 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -10,8 +10,8 @@
 #include <linux/mm.h>
 #include <linux/bootmem.h>
 #include <linux/memblock.h>
-#ifdef CONFIG_BLOCK_DEV_RAM
-#include <linux/blk.h>
+#ifdef CONFIG_BLK_DEV_INITRD
+#include <linux/initrd.h>
 #endif
 #include <linux/swap.h>
 #include <linux/module.h>
@@ -45,6 +45,24 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 	pr_info("Memory size set via devicetree %ldM\n", TO_MB(arc_mem_sz));
 }
 
+#ifdef CONFIG_BLK_DEV_INITRD
+static int __init early_initrd(char *p)
+{
+	unsigned long start, size;
+	char *endp;
+
+	start = memparse(p, &endp);
+	if (*endp == ',') {
+		size = memparse(endp + 1, NULL);
+
+		initrd_start = (unsigned long)__va(start);
+		initrd_end = (unsigned long)__va(start + size);
+	}
+	return 0;
+}
+early_param("initrd", early_initrd);
+#endif
+
 /*
  * First memory setup routine called from setup_arch()
  * 1. setup swapper's mm @init_mm
@@ -77,17 +95,23 @@ void __init setup_arch_memory(void)
 	/* Last usable page of low mem (no HIGHMEM yet for ARC port) */
 	max_low_pfn = max_pfn = PFN_DOWN(end_mem);
 
-	max_mapnr = num_physpages = max_low_pfn - min_low_pfn;
+	max_mapnr = max_low_pfn - min_low_pfn;
 
 	/*------------- reserve kernel image -----------------------*/
 	memblock_reserve(CONFIG_LINUX_LINK_BASE,
 			 __pa(_end) - CONFIG_LINUX_LINK_BASE);
 
+#ifdef CONFIG_BLK_DEV_INITRD
+	/*------------- reserve initrd image -----------------------*/
+	if (initrd_start)
+		memblock_reserve(__pa(initrd_start), initrd_end - initrd_start);
+#endif
+
 	memblock_dump_all();
 
 	/*-------------- node setup --------------------------------*/
 	memset(zones_size, 0, sizeof(zones_size));
-	zones_size[ZONE_NORMAL] = num_physpages;
+	zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn;
 
 	/*
 	 * We can't use the helper free_area_init(zones[]) because it uses
@@ -109,56 +133,9 @@ void __init setup_arch_memory(void)
  */
 void __init mem_init(void)
 {
-	int codesize, datasize, initsize, reserved_pages, free_pages;
-	int tmp;
-
 	high_memory = (void *)(CONFIG_LINUX_LINK_BASE + arc_mem_sz);
-
-	totalram_pages = free_all_bootmem();
-
-	/* count all reserved pages [kernel code/data/mem_map..] */
-	reserved_pages = 0;
-	for (tmp = 0; tmp < max_mapnr; tmp++)
-		if (PageReserved(mem_map + tmp))
-			reserved_pages++;
-
-	/* XXX: nr_free_pages() is equivalent */
-	free_pages = max_mapnr - reserved_pages;
-
-	/*
-	 * For the purpose of display below, split the "reserve mem"
-	 * kernel code/data is already shown explicitly,
-	 * Show any other reservations (mem_map[ ] et al)
-	 */
-	reserved_pages -= (((unsigned int)_end - CONFIG_LINUX_LINK_BASE) >>
-								PAGE_SHIFT);
-
-	codesize = _etext - _text;
-	datasize = _end - _etext;
-	initsize = __init_end - __init_begin;
-
-	pr_info("Memory Available: %dM / %ldM (%dK code, %dK data, %dK init, %dK reserv)\n",
-		PAGES_TO_MB(free_pages),
-		TO_MB(arc_mem_sz),
-		TO_KB(codesize), TO_KB(datasize), TO_KB(initsize),
-		PAGES_TO_KB(reserved_pages));
-}
-
-static void __init free_init_pages(const char *what, unsigned long begin,
-				   unsigned long end)
-{
-	unsigned long addr;
-
-	pr_info("Freeing %s: %ldk [%lx] to [%lx]\n",
-		what, TO_KB(end - begin), begin, end);
-
-	/* need to check that the page we free is not a partial page */
-	for (addr = begin; addr + PAGE_SIZE <= end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		free_page(addr);
-		totalram_pages++;
-	}
+	free_all_bootmem();
+	mem_init_print_info(NULL);
 }
 
 /*
@@ -166,22 +143,12 @@ static void __init free_init_pages(const char *what, unsigned long begin,
  */
 void __init_refok free_initmem(void)
 {
-	free_init_pages("unused kernel memory",
-			(unsigned long)__init_begin,
-			(unsigned long)__init_end);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
-
-#ifdef CONFIG_OF_FLATTREE
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-					    unsigned long end)
-{
-	pr_err("%s(%lx, %lx)\n", __func__, start, end);
-}
-#endif /* CONFIG_OF_FLATTREE */
diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c
index 3e5c92c7993..739e65f355d 100644
--- a/arch/arc/mm/ioremap.c
+++ b/arch/arc/mm/ioremap.c
@@ -12,7 +12,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/cache.h>
+#include <linux/cache.h>
 
 void __iomem *ioremap(unsigned long paddr, unsigned long size)
 {
diff --git a/arch/arc/mm/mmap.c b/arch/arc/mm/mmap.c
new file mode 100644
index 00000000000..2e06d56e987
--- /dev/null
+++ b/arch/arc/mm/mmap.c
@@ -0,0 +1,78 @@
+/*
+ * ARC700 mmap
+ *
+ * (started from arm version - for VIPT alias handling)
+ *
+ * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <asm/cacheflush.h>
+
+#define COLOUR_ALIGN(addr, pgoff)			\
+	((((addr) + SHMLBA - 1) & ~(SHMLBA - 1)) +	\
+	 (((pgoff) << PAGE_SHIFT) & (SHMLBA - 1)))
+
+/*
+ * Ensure that shared mappings are correctly aligned to
+ * avoid aliasing issues with VIPT caches.
+ * We need to ensure that
+ * a specific page of an object is always mapped at a multiple of
+ * SHMLBA bytes.
+ */
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int do_align = 0;
+	int aliasing = cache_is_vipt_aliasing();
+	struct vm_unmapped_area_info info;
+
+	/*
+	 * We only need to do colour alignment if D cache aliases.
+	 */
+	if (aliasing)
+		do_align = filp || (flags & MAP_SHARED);
+
+	/*
+	 * We enforce the MAP_FIXED case.
+	 */
+	if (flags & MAP_FIXED) {
+		if (aliasing && flags & MAP_SHARED &&
+		    (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (addr) {
+		if (do_align)
+			addr = COLOUR_ALIGN(addr, pgoff);
+		else
+			addr = PAGE_ALIGN(addr);
+
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = mm->mmap_base;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
+	info.align_offset = pgoff << PAGE_SHIFT;
+	return vm_unmapped_area(&info);
+}
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index 9b9ce23f4ec..e1acf0ce564 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -52,10 +52,11 @@
  */
 
 #include <linux/module.h>
+#include <linux/bug.h>
 #include <asm/arcregs.h>
 #include <asm/setup.h>
 #include <asm/mmu_context.h>
-#include <asm/tlb.h>
+#include <asm/mmu.h>
 
 /*			Need for ARC MMU v2
  *
@@ -97,49 +98,47 @@
  * J-TLB entry got evicted/replaced.
  */
 
-/* A copy of the ASID from the PID reg is kept in asid_cache */
-int asid_cache = FIRST_ASID;
 
-/* ASID to mm struct mapping. We have one extra entry corresponding to
- * NO_ASID to save us a compare when clearing the mm entry for old asid
- * see get_new_mmu_context (asm-arc/mmu_context.h)
- */
-struct mm_struct *asid_mm_map[NUM_ASID + 1];
+/* A copy of the ASID from the PID reg is kept in asid_cache */
+DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
 
 /*
  * Utility Routine to erase a J-TLB entry
- * The procedure is to look it up in the MMU. If found, ERASE it by
- *  issuing a TlbWrite CMD with PD0 = PD1 = 0
+ * Caller needs to setup Index Reg (manually or via getIndex)
  */
-
-static void __tlb_entry_erase(void)
+static inline void __tlb_entry_erase(void)
 {
 	write_aux_reg(ARC_REG_TLBPD1, 0);
 	write_aux_reg(ARC_REG_TLBPD0, 0);
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
 }
 
-static void tlb_entry_erase(unsigned int vaddr_n_asid)
+static inline unsigned int tlb_entry_lkup(unsigned long vaddr_n_asid)
 {
 	unsigned int idx;
 
-	/* Locate the TLB entry for this vaddr + ASID */
 	write_aux_reg(ARC_REG_TLBPD0, vaddr_n_asid);
+
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBProbe);
 	idx = read_aux_reg(ARC_REG_TLBINDEX);
 
+	return idx;
+}
+
+static void tlb_entry_erase(unsigned int vaddr_n_asid)
+{
+	unsigned int idx;
+
+	/* Locate the TLB entry for this vaddr + ASID */
+	idx = tlb_entry_lkup(vaddr_n_asid);
+
 	/* No error means entry found, zero it out */
 	if (likely(!(idx & TLB_LKUP_ERR))) {
 		__tlb_entry_erase();
-	} else {		/* Some sort of Error */
-
+	} else {
 		/* Duplicate entry error */
-		if (idx & 0x1) {
-			/* TODO we need to handle this case too */
-			pr_emerg("unhandled Duplicate flush for %x\n",
-			       vaddr_n_asid);
-		}
-		/* else entry not found so nothing to do */
+		WARN(idx == TLB_DUP_ERR, "Probe returned Dup PD for %x\n",
+					   vaddr_n_asid);
 	}
 }
 
@@ -158,7 +157,7 @@ static void utlb_invalidate(void)
 {
 #if (CONFIG_ARC_MMU_VER >= 2)
 
-#if (CONFIG_ARC_MMU_VER < 3)
+#if (CONFIG_ARC_MMU_VER == 2)
 	/* MMU v2 introduced the uTLB Flush command.
 	 * There was however an obscure hardware bug, where uTLB flush would
 	 * fail when a prior probe for J-TLB (both totally unrelated) would
@@ -181,6 +180,36 @@ static void utlb_invalidate(void)
 
 }
 
+static void tlb_entry_insert(unsigned int pd0, unsigned int pd1)
+{
+	unsigned int idx;
+
+	/*
+	 * First verify if entry for this vaddr+ASID already exists
+	 * This also sets up PD0 (vaddr, ASID..) for final commit
+	 */
+	idx = tlb_entry_lkup(pd0);
+
+	/*
+	 * If Not already present get a free slot from MMU.
+	 * Otherwise, Probe would have located the entry and set INDEX Reg
+	 * with existing location. This will cause Write CMD to over-write
+	 * existing entry with new PD0 and PD1
+	 */
+	if (likely(idx & TLB_LKUP_ERR))
+		write_aux_reg(ARC_REG_TLBCOMMAND, TLBGetIndex);
+
+	/* setup the other half of TLB entry (pfn, rwx..) */
+	write_aux_reg(ARC_REG_TLBPD1, pd1);
+
+	/*
+	 * Commit the Entry to MMU
+	 * It doesnt sound safe to use the TLBWriteNI cmd here
+	 * which doesn't flush uTLBs. I'd rather be safe than sorry.
+	 */
+	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
+}
+
 /*
  * Un-conditionally (without lookup) erase the entire MMU contents
  */
@@ -223,13 +252,14 @@ noinline void local_flush_tlb_mm(struct mm_struct *mm)
 		return;
 
 	/*
-	 * Workaround for Android weirdism:
-	 * A binder VMA could end up in a task such that vma->mm != tsk->mm
-	 * old code would cause h/w - s/w ASID to get out of sync
+	 * - Move to a new ASID, but only if the mm is still wired in
+	 *   (Android Binder ended up calling this for vma->mm != tsk->mm,
+	 *    causing h/w - s/w ASID to get out of sync)
+	 * - Also get_new_mmu_context() new implementation allocates a new
+	 *   ASID only if it is not allocated already - so unallocate first
 	 */
-	if (current->mm != mm)
-		destroy_context(mm);
-	else
+	destroy_context(mm);
+	if (current->mm == mm)
 		get_new_mmu_context(mm);
 }
 
@@ -244,8 +274,8 @@ noinline void local_flush_tlb_mm(struct mm_struct *mm)
 void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end)
 {
+	const unsigned int cpu = smp_processor_id();
 	unsigned long flags;
-	unsigned int asid;
 
 	/* If range @start to @end is more than 32 TLB entries deep,
 	 * its better to move to a new ASID rather than searching for
@@ -267,11 +297,10 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	start &= PAGE_MASK;
 
 	local_irq_save(flags);
-	asid = vma->vm_mm->context.asid;
 
-	if (asid != NO_ASID) {
+	if (asid_mm(vma->vm_mm, cpu) != MM_CTXT_NO_ASID) {
 		while (start < end) {
-			tlb_entry_erase(start | (asid & 0xff));
+			tlb_entry_erase(start | hw_pid(vma->vm_mm, cpu));
 			start += PAGE_SIZE;
 		}
 	}
@@ -318,6 +347,7 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 {
+	const unsigned int cpu = smp_processor_id();
 	unsigned long flags;
 
 	/* Note that it is critical that interrupts are DISABLED between
@@ -325,23 +355,95 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 	 */
 	local_irq_save(flags);
 
-	if (vma->vm_mm->context.asid != NO_ASID) {
-		tlb_entry_erase((page & PAGE_MASK) |
-				(vma->vm_mm->context.asid & 0xff));
+	if (asid_mm(vma->vm_mm, cpu) != MM_CTXT_NO_ASID) {
+		tlb_entry_erase((page & PAGE_MASK) | hw_pid(vma->vm_mm, cpu));
 		utlb_invalidate();
 	}
 
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_SMP
+
+struct tlb_args {
+	struct vm_area_struct *ta_vma;
+	unsigned long ta_start;
+	unsigned long ta_end;
+};
+
+static inline void ipi_flush_tlb_page(void *arg)
+{
+	struct tlb_args *ta = arg;
+
+	local_flush_tlb_page(ta->ta_vma, ta->ta_start);
+}
+
+static inline void ipi_flush_tlb_range(void *arg)
+{
+	struct tlb_args *ta = arg;
+
+	local_flush_tlb_range(ta->ta_vma, ta->ta_start, ta->ta_end);
+}
+
+static inline void ipi_flush_tlb_kernel_range(void *arg)
+{
+	struct tlb_args *ta = (struct tlb_args *)arg;
+
+	local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end);
+}
+
+void flush_tlb_all(void)
+{
+	on_each_cpu((smp_call_func_t)local_flush_tlb_all, NULL, 1);
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	on_each_cpu_mask(mm_cpumask(mm), (smp_call_func_t)local_flush_tlb_mm,
+			 mm, 1);
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
+{
+	struct tlb_args ta = {
+		.ta_vma = vma,
+		.ta_start = uaddr
+	};
+
+	on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, &ta, 1);
+}
+
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	struct tlb_args ta = {
+		.ta_vma = vma,
+		.ta_start = start,
+		.ta_end = end
+	};
+
+	on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, &ta, 1);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	struct tlb_args ta = {
+		.ta_start = start,
+		.ta_end = end
+	};
+
+	on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1);
+}
+#endif
+
 /*
  * Routine to create a TLB entry
  */
 void create_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
 {
 	unsigned long flags;
-	unsigned int idx, asid_or_sasid;
-	unsigned long pd0_flags;
+	unsigned int asid_or_sasid, rwx;
+	unsigned long pd0, pd1;
 
 	/*
 	 * create_tlb() assumes that current->mm == vma->mm, since
@@ -373,73 +475,112 @@ void create_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
 
 	local_irq_save(flags);
 
-	tlb_paranoid_check(vma->vm_mm->context.asid, address);
+	tlb_paranoid_check(asid_mm(vma->vm_mm, smp_processor_id()), address);
 
 	address &= PAGE_MASK;
 
 	/* update this PTE credentials */
 	pte_val(*ptep) |= (_PAGE_PRESENT | _PAGE_ACCESSED);
 
-	/* Create HW TLB entry Flags (in PD0) from PTE Flags */
-#if (CONFIG_ARC_MMU_VER <= 2)
-	pd0_flags = ((pte_val(*ptep) & PTE_BITS_IN_PD0) >> 1);
-#else
-	pd0_flags = ((pte_val(*ptep) & PTE_BITS_IN_PD0));
-#endif
+	/* Create HW TLB(PD0,PD1) from PTE  */
 
 	/* ASID for this task */
 	asid_or_sasid = read_aux_reg(ARC_REG_PID) & 0xff;
 
-	write_aux_reg(ARC_REG_TLBPD0, address | pd0_flags | asid_or_sasid);
-
-	/* Load remaining info in PD1 (Page Frame Addr and Kx/Kw/Kr Flags) */
-	write_aux_reg(ARC_REG_TLBPD1, (pte_val(*ptep) & PTE_BITS_IN_PD1));
-
-	/* First verify if entry for this vaddr+ASID already exists */
-	write_aux_reg(ARC_REG_TLBCOMMAND, TLBProbe);
-	idx = read_aux_reg(ARC_REG_TLBINDEX);
+	pd0 = address | asid_or_sasid | (pte_val(*ptep) & PTE_BITS_IN_PD0);
 
 	/*
-	 * If Not already present get a free slot from MMU.
-	 * Otherwise, Probe would have located the entry and set INDEX Reg
-	 * with existing location. This will cause Write CMD to over-write
-	 * existing entry with new PD0 and PD1
+	 * ARC MMU provides fully orthogonal access bits for K/U mode,
+	 * however Linux only saves 1 set to save PTE real-estate
+	 * Here we convert 3 PTE bits into 6 MMU bits:
+	 * -Kernel only entries have Kr Kw Kx 0 0 0
+	 * -User entries have mirrored K and U bits
 	 */
-	if (likely(idx & TLB_LKUP_ERR))
-		write_aux_reg(ARC_REG_TLBCOMMAND, TLBGetIndex);
+	rwx = pte_val(*ptep) & PTE_BITS_RWX;
 
-	/*
-	 * Commit the Entry to MMU
-	 * It doesnt sound safe to use the TLBWriteNI cmd here
-	 * which doesn't flush uTLBs. I'd rather be safe than sorry.
-	 */
-	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
+	if (pte_val(*ptep) & _PAGE_GLOBAL)
+		rwx <<= 3;		/* r w x => Kr Kw Kx 0 0 0 */
+	else
+		rwx |= (rwx << 3);	/* r w x => Kr Kw Kx Ur Uw Ux */
+
+	pd1 = rwx | (pte_val(*ptep) & PTE_BITS_NON_RWX_IN_PD1);
+
+	tlb_entry_insert(pd0, pd1);
 
 	local_irq_restore(flags);
 }
 
-/* arch hook called by core VM at the end of handle_mm_fault( ),
- * when a new PTE is entered in Page Tables or an existing one
- * is modified. We aggresively pre-install a TLB entry
+/*
+ * Called at the end of pagefault, for a userspace mapped page
+ *  -pre-install the corresponding TLB entry into MMU
+ *  -Finalize the delayed D-cache flush of kernel mapping of page due to
+ *  	flush_dcache_page(), copy_user_page()
+ *
+ * Note that flush (when done) involves both WBACK - so physical page is
+ * in sync as well as INV - so any non-congruent aliases don't remain
  */
-
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddress,
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned,
 		      pte_t *ptep)
 {
+	unsigned long vaddr = vaddr_unaligned & PAGE_MASK;
+	unsigned long paddr = pte_val(*ptep) & PAGE_MASK;
+	struct page *page = pfn_to_page(pte_pfn(*ptep));
+
+	create_tlb(vma, vaddr, ptep);
 
-	create_tlb(vma, vaddress, ptep);
+	if (page == ZERO_PAGE(0)) {
+		return;
+	}
+
+	/*
+	 * Exec page : Independent of aliasing/page-color considerations,
+	 *	       since icache doesn't snoop dcache on ARC, any dirty
+	 *	       K-mapping of a code page needs to be wback+inv so that
+	 *	       icache fetch by userspace sees code correctly.
+	 * !EXEC page: If K-mapping is NOT congruent to U-mapping, flush it
+	 *	       so userspace sees the right data.
+	 *  (Avoids the flush for Non-exec + congruent mapping case)
+	 */
+	if ((vma->vm_flags & VM_EXEC) ||
+	     addr_not_cache_congruent(paddr, vaddr)) {
+
+		int dirty = !test_and_set_bit(PG_dc_clean, &page->flags);
+		if (dirty) {
+			/* wback + inv dcache lines */
+			__flush_dcache_page(paddr, paddr);
+
+			/* invalidate any existing icache lines */
+			if (vma->vm_flags & VM_EXEC)
+				__inv_icache_page(paddr, vaddr);
+		}
+	}
 }
 
 /* Read the Cache Build Confuration Registers, Decode them and save into
  * the cpuinfo structure for later use.
  * No Validation is done here, simply read/convert the BCRs
  */
-void __init read_decode_mmu_bcr(void)
+void read_decode_mmu_bcr(void)
 {
-	unsigned int tmp;
-	struct bcr_mmu_1_2 *mmu2;	/* encoded MMU2 attr */
-	struct bcr_mmu_3 *mmu3;		/* encoded MMU3 attr */
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
+	unsigned int tmp;
+	struct bcr_mmu_1_2 {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int ver:8, ways:4, sets:4, u_itlb:8, u_dtlb:8;
+#else
+		unsigned int u_dtlb:8, u_itlb:8, sets:4, ways:4, ver:8;
+#endif
+	} *mmu2;
+
+	struct bcr_mmu_3 {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	unsigned int ver:8, ways:4, sets:4, osm:1, reserv:3, pg_sz:4,
+		     u_itlb:4, u_dtlb:4;
+#else
+	unsigned int u_dtlb:4, u_itlb:4, pg_sz:4, reserv:3, osm:1, sets:4,
+		     ways:4, ver:8;
+#endif
+	} *mmu3;
 
 	tmp = read_aux_reg(ARC_REG_MMU_BCR);
 	mmu->ver = (tmp >> 24);
@@ -466,7 +607,7 @@ void __init read_decode_mmu_bcr(void)
 char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
 {
 	int n = 0;
-	struct cpuinfo_arc_mmu *p_mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
+	struct cpuinfo_arc_mmu *p_mmu = &cpuinfo_arc700[cpu_id].mmu;
 
 	n += scnprintf(buf + n, len - n, "ARC700 MMU [v%x]\t: %dk PAGE, ",
 		       p_mmu->ver, TO_KB(p_mmu->pg_sz));
@@ -475,12 +616,12 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
 		       "J-TLB %d (%dx%d), uDTLB %d, uITLB %d, %s\n",
 		       p_mmu->num_tlb, p_mmu->sets, p_mmu->ways,
 		       p_mmu->u_dtlb, p_mmu->u_itlb,
-		       __CONFIG_ARC_MMU_SASID_VAL ? "SASID" : "");
+		       IS_ENABLED(CONFIG_ARC_MMU_SASID) ? "SASID" : "");
 
 	return buf;
 }
 
-void __init arc_mmu_init(void)
+void arc_mmu_init(void)
 {
 	char str[256];
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
@@ -503,13 +644,6 @@ void __init arc_mmu_init(void)
 	if (mmu->pg_sz != PAGE_SIZE)
 		panic("MMU pg size != PAGE_SIZE (%luk)\n", TO_KB(PAGE_SIZE));
 
-	/*
-	 * ASID mgmt data structures are compile time init
-	 *  asid_cache = FIRST_ASID and asid_mm_map[] all zeroes
-	 */
-
-	local_flush_tlb_all();
-
 	/* Enable the MMU */
 	write_aux_reg(ARC_REG_PID, MMU_ENABLE);
 
@@ -551,9 +685,9 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
 			  struct pt_regs *regs)
 {
 	int set, way, n;
-	unsigned int pd0[4], pd1[4];	/* assume max 4 ways */
 	unsigned long flags, is_valid;
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
+	unsigned int pd0[mmu->ways], pd1[mmu->ways];
 
 	local_irq_save(flags);
 
@@ -578,7 +712,7 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
 			continue;
 
 		/* Scan the set for duplicate ways: needs a nested loop */
-		for (way = 0; way < mmu->ways; way++) {
+		for (way = 0; way < mmu->ways - 1; way++) {
 			if (!pd0[way])
 				continue;
 
@@ -621,25 +755,28 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
  * Low Level ASM TLB handler calls this if it finds that HW and SW ASIDS
  * don't match
  */
-void print_asid_mismatch(int is_fast_path)
+void print_asid_mismatch(int mm_asid, int mmu_asid, int is_fast_path)
 {
-	int pid_sw, pid_hw;
-	pid_sw = current->active_mm->context.asid;
-	pid_hw = read_aux_reg(ARC_REG_PID) & 0xff;
-
 	pr_emerg("ASID Mismatch in %s Path Handler: sw-pid=0x%x hw-pid=0x%x\n",
-	       is_fast_path ? "Fast" : "Slow", pid_sw, pid_hw);
+	       is_fast_path ? "Fast" : "Slow", mm_asid, mmu_asid);
 
 	__asm__ __volatile__("flag 1");
 }
 
-void tlb_paranoid_check(unsigned int pid_sw, unsigned long addr)
+void tlb_paranoid_check(unsigned int mm_asid, unsigned long addr)
 {
-	unsigned int pid_hw;
+	unsigned int mmu_asid;
 
-	pid_hw = read_aux_reg(ARC_REG_PID) & 0xff;
+	mmu_asid = read_aux_reg(ARC_REG_PID) & 0xff;
 
-	if (addr < 0x70000000 && ((pid_hw != pid_sw) || (pid_sw == NO_ASID)))
-		print_asid_mismatch(0);
+	/*
+	 * At the time of a TLB miss/installation
+	 *   - HW version needs to match SW version
+	 *   - SW needs to have a valid ASID
+	 */
+	if (addr < 0x70000000 &&
+	    ((mm_asid == MM_CTXT_NO_ASID) ||
+	      (mmu_asid != (mm_asid & MM_CTXT_ASID_MASK))))
+		print_asid_mismatch(mm_asid, mmu_asid, 0);
 }
 #endif
diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S
index 9df765dc7c3..79bfc81358c 100644
--- a/arch/arc/mm/tlbex.S
+++ b/arch/arc/mm/tlbex.S
@@ -39,22 +39,41 @@
 
 #include <linux/linkage.h>
 #include <asm/entry.h>
-#include <asm/tlb.h>
+#include <asm/mmu.h>
 #include <asm/pgtable.h>
 #include <asm/arcregs.h>
 #include <asm/cache.h>
 #include <asm/processor.h>
-#if (CONFIG_ARC_MMU_VER == 1)
 #include <asm/tlb-mmu1.h>
-#endif
 
-;--------------------------------------------------------------------------
-; scratch memory to save the registers (r0-r3) used to code TLB refill Handler
-; For details refer to comments before TLBMISS_FREEUP_REGS below
+;-----------------------------------------------------------------
+; ARC700 Exception Handling doesn't auto-switch stack and it only provides
+; ONE scratch AUX reg "ARC_REG_SCRATCH_DATA0"
+;
+; For Non-SMP, the scratch AUX reg is repurposed to cache task PGD, so a
+; "global" is used to free-up FIRST core reg to be able to code the rest of
+; exception prologue (IRQ auto-disabled on Exceptions, so it's IRQ-safe).
+; Since the Fast Path TLB Miss handler is coded with 4 regs, the remaining 3
+; need to be saved as well by extending the "global" to be 4 words. Hence
+;	".size   ex_saved_reg1, 16"
+; [All of this dance is to avoid stack switching for each TLB Miss, since we
+; only need to save only a handful of regs, as opposed to complete reg file]
+;
+; For ARC700 SMP, the "global" obviously can't be used for free up the FIRST
+; core reg as it will not be SMP safe.
+; Thus scratch AUX reg is used (and no longer used to cache task PGD).
+; To save the rest of 3 regs - per cpu, the global is made "per-cpu".
+; Epilogue thus has to locate the "per-cpu" storage for regs.
+; To avoid cache line bouncing the per-cpu global is aligned/sized per
+; L1_CACHE_SHIFT, despite fundamentally needing to be 12 bytes only. Hence
+;	".size   ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)"
+
+; As simple as that....
 ;--------------------------------------------------------------------------
 
+; scratch memory to save [r0-r3] used to code TLB refill Handler
 ARCFP_DATA ex_saved_reg1
-	.align 1 << L1_CACHE_SHIFT	; IMP: Must be Cache Line aligned
+	.align 1 << L1_CACHE_SHIFT
 	.type   ex_saved_reg1, @object
 #ifdef CONFIG_SMP
 	.size   ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)
@@ -66,6 +85,44 @@ ex_saved_reg1:
 	.zero 16
 #endif
 
+.macro TLBMISS_FREEUP_REGS
+#ifdef CONFIG_SMP
+	sr  r0, [ARC_REG_SCRATCH_DATA0]	; freeup r0 to code with
+	GET_CPU_ID  r0			; get to per cpu scratch mem,
+	lsl r0, r0, L1_CACHE_SHIFT	; cache line wide per cpu
+	add r0, @ex_saved_reg1, r0
+#else
+	st    r0, [@ex_saved_reg1]
+	mov_s r0, @ex_saved_reg1
+#endif
+	st_s  r1, [r0, 4]
+	st_s  r2, [r0, 8]
+	st_s  r3, [r0, 12]
+
+	; VERIFY if the ASID in MMU-PID Reg is same as
+	; one in Linux data structures
+
+	tlb_paranoid_check_asm
+.endm
+
+.macro TLBMISS_RESTORE_REGS
+#ifdef CONFIG_SMP
+	GET_CPU_ID  r0			; get to per cpu scratch mem
+	lsl r0, r0, L1_CACHE_SHIFT	; each is cache line wide
+	add r0, @ex_saved_reg1, r0
+	ld_s  r3, [r0,12]
+	ld_s  r2, [r0, 8]
+	ld_s  r1, [r0, 4]
+	lr    r0, [ARC_REG_SCRATCH_DATA0]
+#else
+	mov_s r0, @ex_saved_reg1
+	ld_s  r3, [r0,12]
+	ld_s  r2, [r0, 8]
+	ld_s  r1, [r0, 4]
+	ld_s  r0, [r0]
+#endif
+.endm
+
 ;============================================================================
 ;  Troubleshooting Stuff
 ;============================================================================
@@ -76,34 +133,35 @@ ex_saved_reg1:
 ; In bizzare scenrios SW and HW ASID can get out-of-sync which is trouble.
 ; So we try to detect this in TLB Mis shandler
 
-
-.macro DBG_ASID_MISMATCH
+.macro tlb_paranoid_check_asm
 
 #ifdef CONFIG_ARC_DBG_TLB_PARANOIA
 
-	; make sure h/w ASID is same as s/w ASID
-
 	GET_CURR_TASK_ON_CPU  r3
 	ld r0, [r3, TASK_ACT_MM]
 	ld r0, [r0, MM_CTXT+MM_CTXT_ASID]
+	breq r0, 0, 55f	; Error if no ASID allocated
 
 	lr r1, [ARC_REG_PID]
 	and r1, r1, 0xFF
-	breq r1, r0, 5f
 
+	and r2, r0, 0xFF	; MMU PID bits only for comparison
+	breq r1, r2, 5f
+
+55:
 	; Error if H/w and S/w ASID don't match, but NOT if in kernel mode
-	lr  r0, [erstatus]
-	bbit0 r0, STATUS_U_BIT, 5f
+	lr  r2, [erstatus]
+	bbit0 r2, STATUS_U_BIT, 5f
 
 	; We sure are in troubled waters, Flag the error, but to do so
 	; need to switch to kernel mode stack to call error routine
 	GET_TSK_STACK_BASE   r3, sp
 
 	; Call printk to shoutout aloud
-	mov r0, 1
+	mov r2, 1
 	j print_asid_mismatch
 
-5:   ; ASIDs match so proceed normally
+5:	; ASIDs match so proceed normally
 	nop
 
 #endif
@@ -147,9 +205,9 @@ ex_saved_reg1:
 #ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
 	and.f 0, r0, _PAGE_PRESENT
 	bz   1f
-	ld   r2, [num_pte_not_present]
-	add  r2, r2, 1
-	st   r2, [num_pte_not_present]
+	ld   r3, [num_pte_not_present]
+	add  r3, r3, 1
+	st   r3, [num_pte_not_present]
 1:
 #endif
 
@@ -161,13 +219,17 @@ ex_saved_reg1:
 ; IN: r0 = PTE, r1 = ptr to PTE
 
 .macro CONV_PTE_TO_TLB
-	and r3, r0, PTE_BITS_IN_PD1 ; Extract permission flags+PFN from PTE
-	sr  r3, [ARC_REG_TLBPD1]    ; these go in PD1
+	and    r3, r0, PTE_BITS_RWX	;       r w x
+	lsl    r2, r3, 3		; r w x 0 0 0
+	and.f  0,  r0, _PAGE_GLOBAL
+	or.z   r2, r2, r3		; r w x r w x
+
+	and r3, r0, PTE_BITS_NON_RWX_IN_PD1 ; Extract PFN+cache bits from PTE
+	or  r3, r3, r2
+
+	sr  r3, [ARC_REG_TLBPD1]    	; these go in PD1
 
 	and r2, r0, PTE_BITS_IN_PD0 ; Extract other PTE flags: (V)alid, (G)lb
-#if (CONFIG_ARC_MMU_VER <= 2)   /* Neednot be done with v3 onwards */
-	lsr r2, r2                  ; shift PTE flags to match layout in PD0
-#endif
 
 	lr  r3,[ARC_REG_TLBPD0]     ; MMU prepares PD0 with vaddr and asid
 
@@ -191,68 +253,6 @@ ex_saved_reg1:
 #endif
 .endm
 
-;-----------------------------------------------------------------
-; ARC700 Exception Handling doesn't auto-switch stack and it only provides
-; ONE scratch AUX reg "ARC_REG_SCRATCH_DATA0"
-;
-; For Non-SMP, the scratch AUX reg is repurposed to cache task PGD, so a
-; "global" is used to free-up FIRST core reg to be able to code the rest of
-; exception prologue (IRQ auto-disabled on Exceptions, so it's IRQ-safe).
-; Since the Fast Path TLB Miss handler is coded with 4 regs, the remaining 3
-; need to be saved as well by extending the "global" to be 4 words. Hence
-;	".size   ex_saved_reg1, 16"
-; [All of this dance is to avoid stack switching for each TLB Miss, since we
-; only need to save only a handful of regs, as opposed to complete reg file]
-;
-; For ARC700 SMP, the "global" obviously can't be used for free up the FIRST
-; core reg as it will not be SMP safe.
-; Thus scratch AUX reg is used (and no longer used to cache task PGD).
-; To save the rest of 3 regs - per cpu, the global is made "per-cpu".
-; Epilogue thus has to locate the "per-cpu" storage for regs.
-; To avoid cache line bouncing the per-cpu global is aligned/sized per
-; L1_CACHE_SHIFT, despite fundamentally needing to be 12 bytes only. Hence
-;	".size   ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)"
-
-; As simple as that....
-
-.macro TLBMISS_FREEUP_REGS
-#ifdef CONFIG_SMP
-	sr  r0, [ARC_REG_SCRATCH_DATA0]	; freeup r0 to code with
-	GET_CPU_ID  r0			; get to per cpu scratch mem,
-	lsl r0, r0, L1_CACHE_SHIFT	; cache line wide per cpu
-	add r0, @ex_saved_reg1, r0
-#else
-	st    r0, [@ex_saved_reg1]
-	mov_s r0, @ex_saved_reg1
-#endif
-	st_s  r1, [r0, 4]
-	st_s  r2, [r0, 8]
-	st_s  r3, [r0, 12]
-
-	; VERIFY if the ASID in MMU-PID Reg is same as
-	; one in Linux data structures
-
-	DBG_ASID_MISMATCH
-.endm
-
-;-----------------------------------------------------------------
-.macro TLBMISS_RESTORE_REGS
-#ifdef CONFIG_SMP
-	GET_CPU_ID  r0			; get to per cpu scratch mem
-	lsl r0, r0, L1_CACHE_SHIFT	; each is cache line wide
-	add r0, @ex_saved_reg1, r0
-	ld_s  r3, [r0,12]
-	ld_s  r2, [r0, 8]
-	ld_s  r1, [r0, 4]
-	lr    r0, [ARC_REG_SCRATCH_DATA0]
-#else
-	mov_s r0, @ex_saved_reg1
-	ld_s  r3, [r0,12]
-	ld_s  r2, [r0, 8]
-	ld_s  r1, [r0, 4]
-	ld_s  r0, [r0]
-#endif
-.endm
 
 ARCFP_CODE	;Fast Path Code, candidate for ICCM
 
@@ -260,7 +260,7 @@ ARCFP_CODE	;Fast Path Code, candidate for ICCM
 ; I-TLB Miss Exception Handler
 ;-----------------------------------------------------------------------------
 
-ARC_ENTRY EV_TLBMissI
+ENTRY(EV_TLBMissI)
 
 	TLBMISS_FREEUP_REGS
 
@@ -271,35 +271,35 @@ ARC_ENTRY EV_TLBMissI
 #endif
 
 	;----------------------------------------------------------------
-	; Get the PTE corresponding to V-addr accessed
+	; Get the PTE corresponding to V-addr accessed, r2 is setup with EFA
 	LOAD_FAULT_PTE
 
 	;----------------------------------------------------------------
 	; VERIFY_PTE: Check if PTE permissions approp for executing code
 	cmp_s   r2, VMALLOC_START
-	mov.lo  r2, (_PAGE_PRESENT | _PAGE_READ | _PAGE_EXECUTE)
-	mov.hs  r2, (_PAGE_PRESENT | _PAGE_K_READ | _PAGE_K_EXECUTE)
+	mov_s   r2, (_PAGE_PRESENT | _PAGE_EXECUTE)
+	or.hs   r2, r2, _PAGE_GLOBAL
 
 	and     r3, r0, r2  ; Mask out NON Flag bits from PTE
 	xor.f   r3, r3, r2  ; check ( ( pte & flags_test ) == flags_test )
 	bnz     do_slow_path_pf
 
 	; Let Linux VM know that the page was accessed
-	or      r0, r0, (_PAGE_PRESENT | _PAGE_ACCESSED)  ; set Accessed Bit
-	st_s    r0, [r1]                                  ; Write back PTE
+	or      r0, r0, _PAGE_ACCESSED  ; set Accessed Bit
+	st_s    r0, [r1]                ; Write back PTE
 
 	CONV_PTE_TO_TLB
 	COMMIT_ENTRY_TO_MMU
 	TLBMISS_RESTORE_REGS
 	rtie
 
-ARC_EXIT EV_TLBMissI
+END(EV_TLBMissI)
 
 ;-----------------------------------------------------------------------------
 ; D-TLB Miss Exception Handler
 ;-----------------------------------------------------------------------------
 
-ARC_ENTRY EV_TLBMissD
+ENTRY(EV_TLBMissD)
 
 	TLBMISS_FREEUP_REGS
 
@@ -311,32 +311,27 @@ ARC_ENTRY EV_TLBMissD
 
 	;----------------------------------------------------------------
 	; Get the PTE corresponding to V-addr accessed
-	; If PTE exists, it will setup, r0 = PTE, r1 = Ptr to PTE
+	; If PTE exists, it will setup, r0 = PTE, r1 = Ptr to PTE, r2 = EFA
 	LOAD_FAULT_PTE
 
 	;----------------------------------------------------------------
 	; VERIFY_PTE: Chk if PTE permissions approp for data access (R/W/R+W)
 
-	mov_s   r2, 0
+	cmp_s	r2, VMALLOC_START
+	mov_s   r2, _PAGE_PRESENT	; common bit for K/U PTE
+	or.hs	r2, r2, _PAGE_GLOBAL	; kernel PTE only
+
+	; Linux PTE [RWX] bits are semantically overloaded:
+	; -If PAGE_GLOBAL set, they refer to kernel-only flags (vmalloc)
+	; -Otherwise they are user-mode permissions, and those are exactly
+	;  same for kernel mode as well (e.g. copy_(to|from)_user)
+
 	lr      r3, [ecr]
 	btst_s  r3, ECR_C_BIT_DTLB_LD_MISS	; Read Access
 	or.nz   r2, r2, _PAGE_READ      	; chk for Read flag in PTE
 	btst_s  r3, ECR_C_BIT_DTLB_ST_MISS	; Write Access
 	or.nz   r2, r2, _PAGE_WRITE     	; chk for Write flag in PTE
-	; Above laddering takes care of XCHG access
-	;   which is both Read and Write
-
-	; If kernel mode access, ; make _PAGE_xx flags as _PAGE_K_xx
-	; For copy_(to|from)_user, despite exception taken in kernel mode,
-	; this code is not hit, because EFA would still be the user mode
-	; address (EFA < 0x6000_0000).
-	; This code is for legit kernel mode faults, vmalloc specifically
-	; (EFA: 0x7000_0000 to 0x7FFF_FFFF)
-
-	lr      r3, [efa]
-	cmp     r3, VMALLOC_START - 1   ; If kernel mode access
-	asl.hi  r2, r2, 3               ; make _PAGE_xx flags as _PAGE_K_xx
-	or      r2, r2, _PAGE_PRESENT   ; Common flag for K/U mode
+	; Above laddering takes care of XCHG access (both R and W)
 
 	; By now, r2 setup with all the Flags we need to check in PTE
 	and     r3, r0, r2              ; Mask out NON Flag bits from PTE
@@ -345,7 +340,7 @@ ARC_ENTRY EV_TLBMissD
 	;----------------------------------------------------------------
 	; UPDATE_PTE: Let Linux VM know that page was accessed/dirty
 	lr      r3, [ecr]
-	or      r0, r0, (_PAGE_PRESENT | _PAGE_ACCESSED) ; Accessed bit always
+	or      r0, r0, _PAGE_ACCESSED        ; Accessed bit always
 	btst_s  r3,  ECR_C_BIT_DTLB_ST_MISS   ; See if it was a Write Access ?
 	or.nz   r0, r0, _PAGE_MODIFIED        ; if Write, set Dirty bit as well
 	st_s    r0, [r1]                      ; Write back PTE
@@ -371,28 +366,11 @@ do_slow_path_pf:
 
 	; Slow path TLB Miss handled as a regular ARC Exception
 	; (stack switching / save the complete reg-file).
-	; That requires freeing up r9
-	EXCPN_PROLOG_FREEUP_REG r9
-
-	lr  r9, [erstatus]
-
-	SWITCH_TO_KERNEL_STK
-	SAVE_ALL_SYS
+	EXCEPTION_PROLOGUE
 
 	; ------- setup args for Linux Page fault Hanlder ---------
-	mov_s r0, sp
-	lr  r2, [efa]
-	lr  r3, [ecr]
-
-	; Both st and ex imply WRITE access of some sort, hence do_page_fault( )
-	; invoked with write=1 for DTLB-st/ex Miss and write=0 for ITLB miss or
-	; DTLB-ld Miss
-	; DTLB Miss Cause code is ld = 0x01 , st = 0x02, ex = 0x03
-	; Following code uses that fact that st/ex have one bit in common
-
-	btst_s r3,  ECR_C_BIT_DTLB_ST_MISS
-	mov.z  r1, 0
-	mov.nz r1, 1
+	mov_s r1, sp
+	lr    r0, [efa]
 
 	; We don't want exceptions to be disabled while the fault is handled.
 	; Now that we have saved the context we return from exception hence
@@ -403,6 +381,4 @@ do_slow_path_pf:
 	bl  do_page_fault
 	b   ret_from_exception
 
-ARC_EXIT EV_TLBMissD
-
-ARC_ENTRY EV_TLBMissB   ; Bogus entry to measure sz of DTLBMiss hdlr
+END(EV_TLBMissD)