5 files changed, 509 insertions, 485 deletions
diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c
index aedce190544..353b202c37c 100644
--- a/arch/arc/mm/cache_arc700.c
+++ b/arch/arc/mm/cache_arc700.c
@@ -73,10 +73,9 @@
 #include <asm/cachectl.h>
 #include <asm/setup.h>
 
-char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len)
+char *arc_cache_mumbojumbo(int c, char *buf, int len)
 {
 	int n = 0;
-	unsigned int c = smp_processor_id();
 
 #define PR_CACHE(p, enb, str)						\
 {									\
@@ -89,8 +88,10 @@ char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len)
 			enb ?  "" : "DISABLED (kernel-build)");		\
 }
 
-	PR_CACHE(&cpuinfo_arc700[c].icache, __CONFIG_ARC_HAS_ICACHE, "I-Cache");
-	PR_CACHE(&cpuinfo_arc700[c].dcache, __CONFIG_ARC_HAS_DCACHE, "D-Cache");
+	PR_CACHE(&cpuinfo_arc700[c].icache, IS_ENABLED(CONFIG_ARC_HAS_ICACHE),
+			"I-Cache");
+	PR_CACHE(&cpuinfo_arc700[c].dcache, IS_ENABLED(CONFIG_ARC_HAS_DCACHE),
+			"D-Cache");
 
 	return buf;
 }
@@ -100,17 +101,23 @@ char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len)
  * the cpuinfo structure for later use.
  * No Validation done here, simply read/convert the BCRs
  */
-void __cpuinit read_decode_cache_bcr(void)
+void read_decode_cache_bcr(void)
 {
-	struct bcr_cache ibcr, dbcr;
 	struct cpuinfo_arc_cache *p_ic, *p_dc;
 	unsigned int cpu = smp_processor_id();
+	struct bcr_cache {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int pad:12, line_len:4, sz:4, config:4, ver:8;
+#else
+		unsigned int ver:8, config:4, sz:4, line_len:4, pad:12;
+#endif
+	} ibcr, dbcr;
 
 	p_ic = &cpuinfo_arc700[cpu].icache;
 	READ_BCR(ARC_REG_IC_BCR, ibcr);
 
-	if (ibcr.config == 0x3)
-		p_ic->assoc = 2;
+	BUG_ON(ibcr.config != 3);
+	p_ic->assoc = 2;		/* Fixed to 2w set assoc */
 	p_ic->line_len = 8 << ibcr.line_len;
 	p_ic->sz = 0x200 << ibcr.sz;
 	p_ic->ver = ibcr.ver;
@@ -118,8 +125,8 @@ void __cpuinit read_decode_cache_bcr(void)
 	p_dc = &cpuinfo_arc700[cpu].dcache;
 	READ_BCR(ARC_REG_DC_BCR, dbcr);
 
-	if (dbcr.config == 0x2)
-		p_dc->assoc = 4;
+	BUG_ON(dbcr.config != 2);
+	p_dc->assoc = 4;		/* Fixed to 4w set assoc */
 	p_dc->line_len = 16 << dbcr.line_len;
 	p_dc->sz = 0x200 << dbcr.sz;
 	p_dc->ver = dbcr.ver;
@@ -132,93 +139,116 @@ void __cpuinit read_decode_cache_bcr(void)
  * 3. Enable the Caches, setup default flush mode for D-Cache
  * 3. Calculate the SHMLBA used by user space
  */
-void __cpuinit arc_cache_init(void)
+void arc_cache_init(void)
 {
-	unsigned int temp;
-	unsigned int cpu = smp_processor_id();
-	struct cpuinfo_arc_cache *ic = &cpuinfo_arc700[cpu].icache;
-	struct cpuinfo_arc_cache *dc = &cpuinfo_arc700[cpu].dcache;
-	int way_pg_ratio = way_pg_ratio;
-	int dcache_does_alias;
+	unsigned int __maybe_unused cpu = smp_processor_id();
+	struct cpuinfo_arc_cache __maybe_unused *ic, __maybe_unused *dc;
 	char str[256];
 
 	printk(arc_cache_mumbojumbo(0, str, sizeof(str)));
 
-	if (!ic->ver)
-		goto chk_dc;
-
 #ifdef CONFIG_ARC_HAS_ICACHE
-	/* 1. Confirm some of I-cache params which Linux assumes */
-	if ((ic->assoc != ARC_ICACHE_WAYS) ||
-	    (ic->line_len != ARC_ICACHE_LINE_LEN)) {
-		panic("Cache H/W doesn't match kernel Config");
+	ic = &cpuinfo_arc700[cpu].icache;
+	if (ic->ver) {
+		if (ic->line_len != L1_CACHE_BYTES)
+			panic("ICache line [%d] != kernel Config [%d]",
+			      ic->line_len, L1_CACHE_BYTES);
+
+		if (ic->ver != CONFIG_ARC_MMU_VER)
+			panic("Cache ver [%d] doesn't match MMU ver [%d]\n",
+			      ic->ver, CONFIG_ARC_MMU_VER);
 	}
-#if (CONFIG_ARC_MMU_VER > 2)
-	if (ic->ver != 3) {
-		if (running_on_hw)
-			panic("Cache ver doesn't match MMU ver\n");
+#endif
 
-		/* For ISS - suggest the toggles to use */
-		pr_err("Use -prop=icache_version=3,-prop=dcache_version=3\n");
+#ifdef CONFIG_ARC_HAS_DCACHE
+	dc = &cpuinfo_arc700[cpu].dcache;
+	if (dc->ver) {
+		unsigned int dcache_does_alias;
 
-	}
-#endif
-#endif
+		if (dc->line_len != L1_CACHE_BYTES)
+			panic("DCache line [%d] != kernel Config [%d]",
+			      dc->line_len, L1_CACHE_BYTES);
 
-	/* Enable/disable I-Cache */
-	temp = read_aux_reg(ARC_REG_IC_CTRL);
+		/* check for D-Cache aliasing */
+		dcache_does_alias = (dc->sz / dc->assoc) > PAGE_SIZE;
 
-#ifdef CONFIG_ARC_HAS_ICACHE
-	temp &= ~IC_CTRL_CACHE_DISABLE;
-#else
-	temp |= IC_CTRL_CACHE_DISABLE;
+		if (dcache_does_alias && !cache_is_vipt_aliasing())
+			panic("Enable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+		else if (!dcache_does_alias && cache_is_vipt_aliasing())
+			panic("Don't need CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+	}
 #endif
+}
 
-	write_aux_reg(ARC_REG_IC_CTRL, temp);
+#define OP_INV		0x1
+#define OP_FLUSH	0x2
+#define OP_FLUSH_N_INV	0x3
+#define OP_INV_IC	0x4
 
-chk_dc:
-	if (!dc->ver)
-		return;
+/*
+ * Common Helper for Line Operations on {I,D}-Cache
+ */
+static inline void __cache_line_loop(unsigned long paddr, unsigned long vaddr,
+				     unsigned long sz, const int cacheop)
+{
+	unsigned int aux_cmd, aux_tag;
+	int num_lines;
+	const int full_page_op = __builtin_constant_p(sz) && sz == PAGE_SIZE;
 
-#ifdef CONFIG_ARC_HAS_DCACHE
-	if ((dc->assoc != ARC_DCACHE_WAYS) ||
-	    (dc->line_len != ARC_DCACHE_LINE_LEN)) {
-		panic("Cache H/W doesn't match kernel Config");
+	if (cacheop == OP_INV_IC) {
+		aux_cmd = ARC_REG_IC_IVIL;
+#if (CONFIG_ARC_MMU_VER > 2)
+		aux_tag = ARC_REG_IC_PTAG;
+#endif
 	}
-
-	dcache_does_alias = (dc->sz / ARC_DCACHE_WAYS) > PAGE_SIZE;
-
-	/* check for D-Cache aliasing */
-	if (dcache_does_alias && !cache_is_vipt_aliasing())
-		panic("Enable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
-	else if (!dcache_does_alias && cache_is_vipt_aliasing())
-		panic("Don't need CONFIG_ARC_CACHE_VIPT_ALIASING\n");
+	else {
+		/* d$ cmd: INV (discard or wback-n-discard) OR FLUSH (wback) */
+		aux_cmd = cacheop & OP_INV ? ARC_REG_DC_IVDL : ARC_REG_DC_FLDL;
+#if (CONFIG_ARC_MMU_VER > 2)
+		aux_tag = ARC_REG_DC_PTAG;
 #endif
+	}
 
-	/* Set the default Invalidate Mode to "simpy discard dirty lines"
-	 *  as this is more frequent then flush before invalidate
-	 * Ofcourse we toggle this default behviour when desired
+	/* Ensure we properly floor/ceil the non-line aligned/sized requests
+	 * and have @paddr - aligned to cache line and integral @num_lines.
+	 * This however can be avoided for page sized since:
+	 *  -@paddr will be cache-line aligned already (being page aligned)
+	 *  -@sz will be integral multiple of line size (being page sized).
 	 */
-	temp = read_aux_reg(ARC_REG_DC_CTRL);
-	temp &= ~DC_CTRL_INV_MODE_FLUSH;
+	if (!full_page_op) {
+		sz += paddr & ~CACHE_LINE_MASK;
+		paddr &= CACHE_LINE_MASK;
+		vaddr &= CACHE_LINE_MASK;
+	}
 
-#ifdef CONFIG_ARC_HAS_DCACHE
-	/* Enable D-Cache: Clear Bit 0 */
-	write_aux_reg(ARC_REG_DC_CTRL, temp & ~IC_CTRL_CACHE_DISABLE);
+	num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);
+
+#if (CONFIG_ARC_MMU_VER <= 2)
+	/* MMUv2 and before: paddr contains stuffed vaddrs bits */
+	paddr |= (vaddr >> PAGE_SHIFT) & 0x1F;
 #else
-	/* Flush D cache */
-	write_aux_reg(ARC_REG_DC_FLSH, 0x1);
-	/* Disable D cache */
-	write_aux_reg(ARC_REG_DC_CTRL, temp | IC_CTRL_CACHE_DISABLE);
+	/* if V-P const for loop, PTAG can be written once outside loop */
+	if (full_page_op)
+		write_aux_reg(aux_tag, paddr);
 #endif
 
-	return;
+	while (num_lines-- > 0) {
+#if (CONFIG_ARC_MMU_VER > 2)
+		/* MMUv3, cache ops require paddr seperately */
+		if (!full_page_op) {
+			write_aux_reg(aux_tag, paddr);
+			paddr += L1_CACHE_BYTES;
+		}
+
+		write_aux_reg(aux_cmd, vaddr);
+		vaddr += L1_CACHE_BYTES;
+#else
+		write_aux_reg(aux_cmd, paddr);
+		paddr += L1_CACHE_BYTES;
+#endif
+	}
 }
 
-#define OP_INV		0x1
-#define OP_FLUSH	0x2
-#define OP_FLUSH_N_INV	0x3
-
 #ifdef CONFIG_ARC_HAS_DCACHE
 
 /***************************************************************
@@ -239,11 +269,9 @@ static inline void wait_for_flush(void)
  */
 static inline void __dc_entire_op(const int cacheop)
 {
-	unsigned long flags, tmp = tmp;
+	unsigned int tmp = tmp;
 	int aux;
 
-	local_irq_save(flags);
-
 	if (cacheop == OP_FLUSH_N_INV) {
 		/* Dcache provides 2 cmd: FLUSH or INV
 		 * INV inturn has sub-modes: DISCARD or FLUSH-BEFORE
@@ -267,55 +295,6 @@ static inline void __dc_entire_op(const int cacheop)
 	/* Switch back the DISCARD ONLY Invalidate mode */
 	if (cacheop == OP_FLUSH_N_INV)
 		write_aux_reg(ARC_REG_DC_CTRL, tmp & ~DC_CTRL_INV_MODE_FLUSH);
-
-	local_irq_restore(flags);
-}
-
-/*
- * Per Line Operation on D-Cache
- * Doesn't deal with type-of-op/IRQ-disabling/waiting-for-flush-to-complete
- * It's sole purpose is to help gcc generate ZOL
- * (aliasing VIPT dcache flushing needs both vaddr and paddr)
- */
-static inline void __dc_line_loop(unsigned long paddr, unsigned long vaddr,
-				  unsigned long sz, const int aux_reg)
-{
-	int num_lines;
-
-	/* Ensure we properly floor/ceil the non-line aligned/sized requests
-	 * and have @paddr - aligned to cache line and integral @num_lines.
-	 * This however can be avoided for page sized since:
-	 *  -@paddr will be cache-line aligned already (being page aligned)
-	 *  -@sz will be integral multiple of line size (being page sized).
-	 */
-	if (!(__builtin_constant_p(sz) && sz == PAGE_SIZE)) {
-		sz += paddr & ~DCACHE_LINE_MASK;
-		paddr &= DCACHE_LINE_MASK;
-		vaddr &= DCACHE_LINE_MASK;
-	}
-
-	num_lines = DIV_ROUND_UP(sz, ARC_DCACHE_LINE_LEN);
-
-#if (CONFIG_ARC_MMU_VER <= 2)
-	paddr |= (vaddr >> PAGE_SHIFT) & 0x1F;
-#endif
-
-	while (num_lines-- > 0) {
-#if (CONFIG_ARC_MMU_VER > 2)
-		/*
-		 * Just as for I$, in MMU v3, D$ ops also require
-		 * "tag" bits in DC_PTAG, "index" bits in FLDL,IVDL ops
-		 */
-		write_aux_reg(ARC_REG_DC_PTAG, paddr);
-
-		write_aux_reg(aux_reg, vaddr);
-		vaddr += ARC_DCACHE_LINE_LEN;
-#else
-		/* paddr contains stuffed vaddrs bits */
-		write_aux_reg(aux_reg, paddr);
-#endif
-		paddr += ARC_DCACHE_LINE_LEN;
-	}
 }
 
 /* For kernel mappings cache operation: index is same as paddr */
@@ -328,7 +307,6 @@ static inline void __dc_line_op(unsigned long paddr, unsigned long vaddr,
 				unsigned long sz, const int cacheop)
 {
 	unsigned long flags, tmp = tmp;
-	int aux;
 
 	local_irq_save(flags);
 
@@ -343,12 +321,7 @@ static inline void __dc_line_op(unsigned long paddr, unsigned long vaddr,
 		write_aux_reg(ARC_REG_DC_CTRL, tmp | DC_CTRL_INV_MODE_FLUSH);
 	}
 
-	if (cacheop & OP_INV)	/* Inv / flush-n-inv use same cmd reg */
-		aux = ARC_REG_DC_IVDL;
-	else
-		aux = ARC_REG_DC_FLDL;
-
-	__dc_line_loop(paddr, vaddr, sz, aux);
+	__cache_line_loop(paddr, vaddr, sz, cacheop);
 
 	if (cacheop & OP_FLUSH)	/* flush / flush-n-inv both wait */
 		wait_for_flush();
@@ -416,51 +389,42 @@ static inline void __dc_line_op(unsigned long paddr, unsigned long vaddr,
 /***********************************************************
  * Machine specific helper for per line I-Cache invalidate.
  */
-static void __ic_line_inv_vaddr(unsigned long paddr, unsigned long vaddr,
+static void __ic_line_inv_vaddr_local(unsigned long paddr, unsigned long vaddr,
 				unsigned long sz)
 {
 	unsigned long flags;
-	int num_lines;
-
-	/*
-	 * Ensure we properly floor/ceil the non-line aligned/sized requests:
-	 * However page sized flushes can be compile time optimised.
-	 *  -@paddr will be cache-line aligned already (being page aligned)
-	 *  -@sz will be integral multiple of line size (being page sized).
-	 */
-	if (!(__builtin_constant_p(sz) && sz == PAGE_SIZE)) {
-		sz += paddr & ~ICACHE_LINE_MASK;
-		paddr &= ICACHE_LINE_MASK;
-		vaddr &= ICACHE_LINE_MASK;
-	}
 
-	num_lines = DIV_ROUND_UP(sz, ARC_ICACHE_LINE_LEN);
+	local_irq_save(flags);
+	__cache_line_loop(paddr, vaddr, sz, OP_INV_IC);
+	local_irq_restore(flags);
+}
 
-#if (CONFIG_ARC_MMU_VER <= 2)
-	/* bits 17:13 of vaddr go as bits 4:0 of paddr */
-	paddr |= (vaddr >> PAGE_SHIFT) & 0x1F;
-#endif
+static inline void __ic_entire_inv(void)
+{
+	write_aux_reg(ARC_REG_IC_IVIC, 1);
+	read_aux_reg(ARC_REG_IC_CTRL);	/* blocks */
+}
 
-	local_irq_save(flags);
-	while (num_lines-- > 0) {
-#if (CONFIG_ARC_MMU_VER > 2)
-		/* tag comes from phy addr */
-		write_aux_reg(ARC_REG_IC_PTAG, paddr);
+struct ic_line_inv_vaddr_ipi {
+	unsigned long paddr, vaddr;
+	int sz;
+};
 
-		/* index bits come from vaddr */
-		write_aux_reg(ARC_REG_IC_IVIL, vaddr);
-		vaddr += ARC_ICACHE_LINE_LEN;
-#else
-		/* paddr contains stuffed vaddrs bits */
-		write_aux_reg(ARC_REG_IC_IVIL, paddr);
-#endif
-		paddr += ARC_ICACHE_LINE_LEN;
-	}
-	local_irq_restore(flags);
+static void __ic_line_inv_vaddr_helper(void *info)
+{
+        struct ic_line_inv_vaddr_ipi *ic_inv = (struct ic_line_inv_vaddr_ipi*) info;
+        __ic_line_inv_vaddr_local(ic_inv->paddr, ic_inv->vaddr, ic_inv->sz);
 }
 
+static void __ic_line_inv_vaddr(unsigned long paddr, unsigned long vaddr,
+				unsigned long sz)
+{
+	struct ic_line_inv_vaddr_ipi ic_inv = { paddr, vaddr , sz};
+	on_each_cpu(__ic_line_inv_vaddr_helper, &ic_inv, 1);
+}
 #else
 
+#define __ic_entire_inv()
 #define __ic_line_inv_vaddr(pstart, vstart, sz)
 
 #endif /* CONFIG_ARC_HAS_ICACHE */
@@ -487,7 +451,7 @@ void flush_dcache_page(struct page *page)
 	struct address_space *mapping;
 
 	if (!cache_is_vipt_aliasing()) {
-		set_bit(PG_arch_1, &page->flags);
+		clear_bit(PG_dc_clean, &page->flags);
 		return;
 	}
 
@@ -501,7 +465,7 @@ void flush_dcache_page(struct page *page)
 	 * Make a note that K-mapping is dirty
 	 */
 	if (!mapping_mapped(mapping)) {
-		set_bit(PG_arch_1, &page->flags);
+		clear_bit(PG_dc_clean, &page->flags);
 	} else if (page_mapped(page)) {
 
 		/* kernel reading from page with U-mapping */
@@ -597,21 +561,17 @@ void flush_icache_range(unsigned long kstart, unsigned long kend)
 /*
  * General purpose helper to make I and D cache lines consistent.
  * @paddr is phy addr of region
- * @vaddr is typically user or kernel vaddr (vmalloc)
- *    Howver in one instance, flush_icache_range() by kprobe (for a breakpt in
+ * @vaddr is typically user vaddr (breakpoint) or kernel vaddr (vmalloc)
+ *    However in one instance, when called by kprobe (for a breakpt in
  *    builtin kernel code) @vaddr will be paddr only, meaning CDU operation will
  *    use a paddr to index the cache (despite VIPT). This is fine since since a
- *    built-in kernel page will not have any virtual mappings (not even kernel)
- *    kprobe on loadable module is different as it will have kvaddr.
+ *    builtin kernel page will not have any virtual mappings.
+ *    kprobe on loadable module will be kernel vaddr.
  */
 void __sync_icache_dcache(unsigned long paddr, unsigned long vaddr, int len)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__ic_line_inv_vaddr(paddr, vaddr, len);
 	__dc_line_op(paddr, vaddr, len, OP_FLUSH_N_INV);
-	local_irq_restore(flags);
+	__ic_line_inv_vaddr(paddr, vaddr, len);
 }
 
 /* wrapper to compile time eliminate alignment checks in flush loop */
@@ -629,26 +589,13 @@ void ___flush_dcache_page(unsigned long paddr, unsigned long vaddr)
 	__dc_line_op(paddr, vaddr & PAGE_MASK, PAGE_SIZE, OP_FLUSH_N_INV);
 }
 
-void flush_icache_all(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	write_aux_reg(ARC_REG_IC_IVIC, 1);
-
-	/* lr will not complete till the icache inv operation is not over */
-	read_aux_reg(ARC_REG_IC_CTRL);
-	local_irq_restore(flags);
-}
-
 noinline void flush_cache_all(void)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 
-	flush_icache_all();
+	__ic_entire_inv();
 	__dc_entire_op(OP_FLUSH_N_INV);
 
 	local_irq_restore(flags);
@@ -667,7 +614,12 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long u_vaddr,
 {
 	unsigned int paddr = pfn << PAGE_SHIFT;
 
-	__sync_icache_dcache(paddr, u_vaddr, PAGE_SIZE);
+	u_vaddr &= PAGE_MASK;
+
+	___flush_dcache_page(paddr, u_vaddr);
+
+	if (vma->vm_flags & VM_EXEC)
+		__inv_icache_page(paddr, u_vaddr);
 }
 
 void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
@@ -717,7 +669,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 	 * non copied user pages (e.g. read faults which wire in pagecache page
 	 * directly).
 	 */
-	set_bit(PG_arch_1, &to->flags);
+	clear_bit(PG_dc_clean, &to->flags);
 
 	/*
 	 * if SRC was already usermapped and non-congruent to kernel mapping
@@ -725,15 +677,16 @@ void copy_user_highpage(struct page *to, struct page *from,
 	 */
 	if (clean_src_k_mappings) {
 		__flush_dcache_page(kfrom, kfrom);
+		set_bit(PG_dc_clean, &from->flags);
 	} else {
-		set_bit(PG_arch_1, &from->flags);
+		clear_bit(PG_dc_clean, &from->flags);
 	}
 }
 
 void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
 {
 	clear_page(to);
-	set_bit(PG_arch_1, &page->flags);
+	clear_bit(PG_dc_clean, &page->flags);
 }
 
 
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 689ffd86d5e..9c69552350c 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -15,8 +15,9 @@
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
 #include <asm/pgalloc.h>
+#include <asm/mmu.h>
 
-static int handle_vmalloc_fault(struct mm_struct *mm, unsigned long address)
+static int handle_vmalloc_fault(unsigned long address)
 {
 	/*
 	 * Synchronize this task's top level page-table
@@ -26,7 +27,7 @@ static int handle_vmalloc_fault(struct mm_struct *mm, unsigned long address)
 	pud_t *pud, *pud_k;
 	pmd_t *pmd, *pmd_k;
 
-	pgd = pgd_offset_fast(mm, address);
+	pgd = pgd_offset_fast(current->active_mm, address);
 	pgd_k = pgd_offset_k(address);
 
 	if (!pgd_present(*pgd_k))
@@ -51,16 +52,15 @@ bad_area:
 	return 1;
 }
 
-void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
-		   unsigned long cause_code)
+void do_page_fault(unsigned long address, struct pt_regs *regs)
 {
 	struct vm_area_struct *vma = NULL;
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	siginfo_t info;
 	int fault, ret;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	int write = regs->ecr_cause & ECR_C_PROTV_STORE;  /* ST/EX */
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
@@ -72,7 +72,7 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
 	 * nothing more.
 	 */
 	if (address >= VMALLOC_START && address <= VMALLOC_END) {
-		ret = handle_vmalloc_fault(mm, address);
+		ret = handle_vmalloc_fault(address);
 		if (unlikely(ret))
 			goto bad_area_nosemaphore;
 		else
@@ -88,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -109,18 +111,19 @@ good_area:
 
 	/* Handle protection violation, execute on heap or stack */
 
-	if (cause_code == ((ECR_V_PROTV << 16) | ECR_C_PROTV_INST_FETCH))
+	if ((regs->ecr_vec == ECR_V_PROTV) &&
+	    (regs->ecr_cause == ECR_C_PROTV_INST_FETCH))
 		goto bad_area;
 
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
 
-survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -176,7 +179,6 @@ bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 	if (user_mode(regs)) {
 		tsk->thread.fault_address = address;
-		tsk->thread.cause_code = cause_code;
 		info.si_signo = SIGSEGV;
 		info.si_errno = 0;
 		/* info.si_code has been set above */
@@ -197,17 +199,15 @@ no_context:
 	if (fixup_exception(regs))
 		return;
 
-	die("Oops", regs, address, cause_code);
+	die("Oops", regs, address);
 
 out_of_memory:
-	if (is_global_init(tsk)) {
-		yield();
-		goto survive;
-	}
 	up_read(&mm->mmap_sem);
 
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);	/* This will never return */
+	if (user_mode(regs)) {
+		pagefault_out_of_memory();
+		return;
+	}
 
 	goto no_context;
 
@@ -218,7 +218,6 @@ do_sigbus:
 		goto no_context;
 
 	tsk->thread.fault_address = address;
-	tsk->thread.cause_code = cause_code;
 	info.si_signo = SIGBUS;
 	info.si_errno = 0;
 	info.si_code = BUS_ADRERR;
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 4a177365b2c..523412369f7 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -10,6 +10,9 @@
 #include <linux/mm.h>
 #include <linux/bootmem.h>
 #include <linux/memblock.h>
+#ifdef CONFIG_BLK_DEV_INITRD
+#include <linux/initrd.h>
+#endif
 #include <linux/swap.h>
 #include <linux/module.h>
 #include <asm/page.h>
@@ -42,6 +45,24 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 	pr_info("Memory size set via devicetree %ldM\n", TO_MB(arc_mem_sz));
 }
 
+#ifdef CONFIG_BLK_DEV_INITRD
+static int __init early_initrd(char *p)
+{
+	unsigned long start, size;
+	char *endp;
+
+	start = memparse(p, &endp);
+	if (*endp == ',') {
+		size = memparse(endp + 1, NULL);
+
+		initrd_start = (unsigned long)__va(start);
+		initrd_end = (unsigned long)__va(start + size);
+	}
+	return 0;
+}
+early_param("initrd", early_initrd);
+#endif
+
 /*
  * First memory setup routine called from setup_arch()
  * 1. setup swapper's mm @init_mm
@@ -74,17 +95,23 @@ void __init setup_arch_memory(void)
 	/* Last usable page of low mem (no HIGHMEM yet for ARC port) */
 	max_low_pfn = max_pfn = PFN_DOWN(end_mem);
 
-	max_mapnr = num_physpages = max_low_pfn - min_low_pfn;
+	max_mapnr = max_low_pfn - min_low_pfn;
 
 	/*------------- reserve kernel image -----------------------*/
 	memblock_reserve(CONFIG_LINUX_LINK_BASE,
 			 __pa(_end) - CONFIG_LINUX_LINK_BASE);
 
+#ifdef CONFIG_BLK_DEV_INITRD
+	/*------------- reserve initrd image -----------------------*/
+	if (initrd_start)
+		memblock_reserve(__pa(initrd_start), initrd_end - initrd_start);
+#endif
+
 	memblock_dump_all();
 
 	/*-------------- node setup --------------------------------*/
 	memset(zones_size, 0, sizeof(zones_size));
-	zones_size[ZONE_NORMAL] = num_physpages;
+	zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn;
 
 	/*
 	 * We can't use the helper free_area_init(zones[]) because it uses
@@ -106,39 +133,9 @@ void __init setup_arch_memory(void)
  */
 void __init mem_init(void)
 {
-	int codesize, datasize, initsize, reserved_pages, free_pages;
-	int tmp;
-
 	high_memory = (void *)(CONFIG_LINUX_LINK_BASE + arc_mem_sz);
-
-	totalram_pages = free_all_bootmem();
-
-	/* count all reserved pages [kernel code/data/mem_map..] */
-	reserved_pages = 0;
-	for (tmp = 0; tmp < max_mapnr; tmp++)
-		if (PageReserved(mem_map + tmp))
-			reserved_pages++;
-
-	/* XXX: nr_free_pages() is equivalent */
-	free_pages = max_mapnr - reserved_pages;
-
-	/*
-	 * For the purpose of display below, split the "reserve mem"
-	 * kernel code/data is already shown explicitly,
-	 * Show any other reservations (mem_map[ ] et al)
-	 */
-	reserved_pages -= (((unsigned int)_end - CONFIG_LINUX_LINK_BASE) >>
-								PAGE_SHIFT);
-
-	codesize = _etext - _text;
-	datasize = _end - _etext;
-	initsize = __init_end - __init_begin;
-
-	pr_info("Memory Available: %dM / %ldM (%dK code, %dK data, %dK init, %dK reserv)\n",
-		PAGES_TO_MB(free_pages),
-		TO_MB(arc_mem_sz),
-		TO_KB(codesize), TO_KB(datasize), TO_KB(initsize),
-		PAGES_TO_KB(reserved_pages));
+	free_all_bootmem();
+	mem_init_print_info(NULL);
 }
 
 /*
@@ -146,20 +143,12 @@ void __init mem_init(void)
  */
 void __init_refok free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
-
-#ifdef CONFIG_OF_FLATTREE
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-					    unsigned long end)
-{
-	pr_err("%s(%lx, %lx)\n", __func__, start, end);
-}
-#endif /* CONFIG_OF_FLATTREE */
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index fe1c5a073af..e1acf0ce564 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -52,10 +52,11 @@
  */
 
 #include <linux/module.h>
+#include <linux/bug.h>
 #include <asm/arcregs.h>
 #include <asm/setup.h>
 #include <asm/mmu_context.h>
-#include <asm/tlb.h>
+#include <asm/mmu.h>
 
 /*			Need for ARC MMU v2
  *
@@ -97,49 +98,47 @@
  * J-TLB entry got evicted/replaced.
  */
 
-/* A copy of the ASID from the PID reg is kept in asid_cache */
-int asid_cache = FIRST_ASID;
 
-/* ASID to mm struct mapping. We have one extra entry corresponding to
- * NO_ASID to save us a compare when clearing the mm entry for old asid
- * see get_new_mmu_context (asm-arc/mmu_context.h)
- */
-struct mm_struct *asid_mm_map[NUM_ASID + 1];
+/* A copy of the ASID from the PID reg is kept in asid_cache */
+DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
 
 /*
  * Utility Routine to erase a J-TLB entry
- * The procedure is to look it up in the MMU. If found, ERASE it by
- *  issuing a TlbWrite CMD with PD0 = PD1 = 0
+ * Caller needs to setup Index Reg (manually or via getIndex)
  */
-
-static void __tlb_entry_erase(void)
+static inline void __tlb_entry_erase(void)
 {
 	write_aux_reg(ARC_REG_TLBPD1, 0);
 	write_aux_reg(ARC_REG_TLBPD0, 0);
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
 }
 
-static void tlb_entry_erase(unsigned int vaddr_n_asid)
+static inline unsigned int tlb_entry_lkup(unsigned long vaddr_n_asid)
 {
 	unsigned int idx;
 
-	/* Locate the TLB entry for this vaddr + ASID */
 	write_aux_reg(ARC_REG_TLBPD0, vaddr_n_asid);
+
 	write_aux_reg(ARC_REG_TLBCOMMAND, TLBProbe);
 	idx = read_aux_reg(ARC_REG_TLBINDEX);
 
+	return idx;
+}
+
+static void tlb_entry_erase(unsigned int vaddr_n_asid)
+{
+	unsigned int idx;
+
+	/* Locate the TLB entry for this vaddr + ASID */
+	idx = tlb_entry_lkup(vaddr_n_asid);
+
 	/* No error means entry found, zero it out */
 	if (likely(!(idx & TLB_LKUP_ERR))) {
 		__tlb_entry_erase();
-	} else {		/* Some sort of Error */
-
+	} else {
 		/* Duplicate entry error */
-		if (idx & 0x1) {
-			/* TODO we need to handle this case too */
-			pr_emerg("unhandled Duplicate flush for %x\n",
-			       vaddr_n_asid);
-		}
-		/* else entry not found so nothing to do */
+		WARN(idx == TLB_DUP_ERR, "Probe returned Dup PD for %x\n",
+					   vaddr_n_asid);
 	}
 }
 
@@ -158,7 +157,7 @@ static void utlb_invalidate(void)
 {
 #if (CONFIG_ARC_MMU_VER >= 2)
 
-#if (CONFIG_ARC_MMU_VER < 3)
+#if (CONFIG_ARC_MMU_VER == 2)
 	/* MMU v2 introduced the uTLB Flush command.
 	 * There was however an obscure hardware bug, where uTLB flush would
 	 * fail when a prior probe for J-TLB (both totally unrelated) would
@@ -181,6 +180,36 @@ static void utlb_invalidate(void)
 
 }
 
+static void tlb_entry_insert(unsigned int pd0, unsigned int pd1)
+{
+	unsigned int idx;
+
+	/*
+	 * First verify if entry for this vaddr+ASID already exists
+	 * This also sets up PD0 (vaddr, ASID..) for final commit
+	 */
+	idx = tlb_entry_lkup(pd0);
+
+	/*
+	 * If Not already present get a free slot from MMU.
+	 * Otherwise, Probe would have located the entry and set INDEX Reg
+	 * with existing location. This will cause Write CMD to over-write
+	 * existing entry with new PD0 and PD1
+	 */
+	if (likely(idx & TLB_LKUP_ERR))
+		write_aux_reg(ARC_REG_TLBCOMMAND, TLBGetIndex);
+
+	/* setup the other half of TLB entry (pfn, rwx..) */
+	write_aux_reg(ARC_REG_TLBPD1, pd1);
+
+	/*
+	 * Commit the Entry to MMU
+	 * It doesnt sound safe to use the TLBWriteNI cmd here
+	 * which doesn't flush uTLBs. I'd rather be safe than sorry.
+	 */
+	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
+}
+
 /*
  * Un-conditionally (without lookup) erase the entire MMU contents
  */
@@ -223,13 +252,14 @@ noinline void local_flush_tlb_mm(struct mm_struct *mm)
 		return;
 
 	/*
-	 * Workaround for Android weirdism:
-	 * A binder VMA could end up in a task such that vma->mm != tsk->mm
-	 * old code would cause h/w - s/w ASID to get out of sync
+	 * - Move to a new ASID, but only if the mm is still wired in
+	 *   (Android Binder ended up calling this for vma->mm != tsk->mm,
+	 *    causing h/w - s/w ASID to get out of sync)
+	 * - Also get_new_mmu_context() new implementation allocates a new
+	 *   ASID only if it is not allocated already - so unallocate first
 	 */
-	if (current->mm != mm)
-		destroy_context(mm);
-	else
+	destroy_context(mm);
+	if (current->mm == mm)
 		get_new_mmu_context(mm);
 }
 
@@ -244,8 +274,8 @@ noinline void local_flush_tlb_mm(struct mm_struct *mm)
 void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end)
 {
+	const unsigned int cpu = smp_processor_id();
 	unsigned long flags;
-	unsigned int asid;
 
 	/* If range @start to @end is more than 32 TLB entries deep,
 	 * its better to move to a new ASID rather than searching for
@@ -267,11 +297,10 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	start &= PAGE_MASK;
 
 	local_irq_save(flags);
-	asid = vma->vm_mm->context.asid;
 
-	if (asid != NO_ASID) {
+	if (asid_mm(vma->vm_mm, cpu) != MM_CTXT_NO_ASID) {
 		while (start < end) {
-			tlb_entry_erase(start | (asid & 0xff));
+			tlb_entry_erase(start | hw_pid(vma->vm_mm, cpu));
 			start += PAGE_SIZE;
 		}
 	}
@@ -318,6 +347,7 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 {
+	const unsigned int cpu = smp_processor_id();
 	unsigned long flags;
 
 	/* Note that it is critical that interrupts are DISABLED between
@@ -325,23 +355,95 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 	 */
 	local_irq_save(flags);
 
-	if (vma->vm_mm->context.asid != NO_ASID) {
-		tlb_entry_erase((page & PAGE_MASK) |
-				(vma->vm_mm->context.asid & 0xff));
+	if (asid_mm(vma->vm_mm, cpu) != MM_CTXT_NO_ASID) {
+		tlb_entry_erase((page & PAGE_MASK) | hw_pid(vma->vm_mm, cpu));
 		utlb_invalidate();
 	}
 
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_SMP
+
+struct tlb_args {
+	struct vm_area_struct *ta_vma;
+	unsigned long ta_start;
+	unsigned long ta_end;
+};
+
+static inline void ipi_flush_tlb_page(void *arg)
+{
+	struct tlb_args *ta = arg;
+
+	local_flush_tlb_page(ta->ta_vma, ta->ta_start);
+}
+
+static inline void ipi_flush_tlb_range(void *arg)
+{
+	struct tlb_args *ta = arg;
+
+	local_flush_tlb_range(ta->ta_vma, ta->ta_start, ta->ta_end);
+}
+
+static inline void ipi_flush_tlb_kernel_range(void *arg)
+{
+	struct tlb_args *ta = (struct tlb_args *)arg;
+
+	local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end);
+}
+
+void flush_tlb_all(void)
+{
+	on_each_cpu((smp_call_func_t)local_flush_tlb_all, NULL, 1);
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	on_each_cpu_mask(mm_cpumask(mm), (smp_call_func_t)local_flush_tlb_mm,
+			 mm, 1);
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
+{
+	struct tlb_args ta = {
+		.ta_vma = vma,
+		.ta_start = uaddr
+	};
+
+	on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, &ta, 1);
+}
+
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	struct tlb_args ta = {
+		.ta_vma = vma,
+		.ta_start = start,
+		.ta_end = end
+	};
+
+	on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, &ta, 1);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	struct tlb_args ta = {
+		.ta_start = start,
+		.ta_end = end
+	};
+
+	on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1);
+}
+#endif
+
 /*
  * Routine to create a TLB entry
  */
 void create_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
 {
 	unsigned long flags;
-	unsigned int idx, asid_or_sasid;
-	unsigned long pd0_flags;
+	unsigned int asid_or_sasid, rwx;
+	unsigned long pd0, pd1;
 
 	/*
 	 * create_tlb() assumes that current->mm == vma->mm, since
@@ -373,47 +475,37 @@ void create_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
 
 	local_irq_save(flags);
 
-	tlb_paranoid_check(vma->vm_mm->context.asid, address);
+	tlb_paranoid_check(asid_mm(vma->vm_mm, smp_processor_id()), address);
 
 	address &= PAGE_MASK;
 
 	/* update this PTE credentials */
 	pte_val(*ptep) |= (_PAGE_PRESENT | _PAGE_ACCESSED);
 
-	/* Create HW TLB entry Flags (in PD0) from PTE Flags */
-#if (CONFIG_ARC_MMU_VER <= 2)
-	pd0_flags = ((pte_val(*ptep) & PTE_BITS_IN_PD0) >> 1);
-#else
-	pd0_flags = ((pte_val(*ptep) & PTE_BITS_IN_PD0));
-#endif
+	/* Create HW TLB(PD0,PD1) from PTE  */
 
 	/* ASID for this task */
 	asid_or_sasid = read_aux_reg(ARC_REG_PID) & 0xff;
 
-	write_aux_reg(ARC_REG_TLBPD0, address | pd0_flags | asid_or_sasid);
-
-	/* Load remaining info in PD1 (Page Frame Addr and Kx/Kw/Kr Flags) */
-	write_aux_reg(ARC_REG_TLBPD1, (pte_val(*ptep) & PTE_BITS_IN_PD1));
-
-	/* First verify if entry for this vaddr+ASID already exists */
-	write_aux_reg(ARC_REG_TLBCOMMAND, TLBProbe);
-	idx = read_aux_reg(ARC_REG_TLBINDEX);
+	pd0 = address | asid_or_sasid | (pte_val(*ptep) & PTE_BITS_IN_PD0);
 
 	/*
-	 * If Not already present get a free slot from MMU.
-	 * Otherwise, Probe would have located the entry and set INDEX Reg
-	 * with existing location. This will cause Write CMD to over-write
-	 * existing entry with new PD0 and PD1
+	 * ARC MMU provides fully orthogonal access bits for K/U mode,
+	 * however Linux only saves 1 set to save PTE real-estate
+	 * Here we convert 3 PTE bits into 6 MMU bits:
+	 * -Kernel only entries have Kr Kw Kx 0 0 0
+	 * -User entries have mirrored K and U bits
 	 */
-	if (likely(idx & TLB_LKUP_ERR))
-		write_aux_reg(ARC_REG_TLBCOMMAND, TLBGetIndex);
+	rwx = pte_val(*ptep) & PTE_BITS_RWX;
 
-	/*
-	 * Commit the Entry to MMU
-	 * It doesnt sound safe to use the TLBWriteNI cmd here
-	 * which doesn't flush uTLBs. I'd rather be safe than sorry.
-	 */
-	write_aux_reg(ARC_REG_TLBCOMMAND, TLBWrite);
+	if (pte_val(*ptep) & _PAGE_GLOBAL)
+		rwx <<= 3;		/* r w x => Kr Kw Kx 0 0 0 */
+	else
+		rwx |= (rwx << 3);	/* r w x => Kr Kw Kx Ur Uw Ux */
+
+	pd1 = rwx | (pte_val(*ptep) & PTE_BITS_NON_RWX_IN_PD1);
+
+	tlb_entry_insert(pd0, pd1);
 
 	local_irq_restore(flags);
 }
@@ -432,9 +524,14 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned,
 {
 	unsigned long vaddr = vaddr_unaligned & PAGE_MASK;
 	unsigned long paddr = pte_val(*ptep) & PAGE_MASK;
+	struct page *page = pfn_to_page(pte_pfn(*ptep));
 
 	create_tlb(vma, vaddr, ptep);
 
+	if (page == ZERO_PAGE(0)) {
+		return;
+	}
+
 	/*
 	 * Exec page : Independent of aliasing/page-color considerations,
 	 *	       since icache doesn't snoop dcache on ARC, any dirty
@@ -446,9 +543,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned,
 	 */
 	if ((vma->vm_flags & VM_EXEC) ||
 	     addr_not_cache_congruent(paddr, vaddr)) {
-		struct page *page = pfn_to_page(pte_pfn(*ptep));
 
-		int dirty = test_and_clear_bit(PG_arch_1, &page->flags);
+		int dirty = !test_and_set_bit(PG_dc_clean, &page->flags);
 		if (dirty) {
 			/* wback + inv dcache lines */
 			__flush_dcache_page(paddr, paddr);
@@ -464,12 +560,27 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned,
  * the cpuinfo structure for later use.
  * No Validation is done here, simply read/convert the BCRs
  */
-void __cpuinit read_decode_mmu_bcr(void)
+void read_decode_mmu_bcr(void)
 {
-	unsigned int tmp;
-	struct bcr_mmu_1_2 *mmu2;	/* encoded MMU2 attr */
-	struct bcr_mmu_3 *mmu3;		/* encoded MMU3 attr */
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
+	unsigned int tmp;
+	struct bcr_mmu_1_2 {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int ver:8, ways:4, sets:4, u_itlb:8, u_dtlb:8;
+#else
+		unsigned int u_dtlb:8, u_itlb:8, sets:4, ways:4, ver:8;
+#endif
+	} *mmu2;
+
+	struct bcr_mmu_3 {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	unsigned int ver:8, ways:4, sets:4, osm:1, reserv:3, pg_sz:4,
+		     u_itlb:4, u_dtlb:4;
+#else
+	unsigned int u_dtlb:4, u_itlb:4, pg_sz:4, reserv:3, osm:1, sets:4,
+		     ways:4, ver:8;
+#endif
+	} *mmu3;
 
 	tmp = read_aux_reg(ARC_REG_MMU_BCR);
 	mmu->ver = (tmp >> 24);
@@ -505,12 +616,12 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
 		       "J-TLB %d (%dx%d), uDTLB %d, uITLB %d, %s\n",
 		       p_mmu->num_tlb, p_mmu->sets, p_mmu->ways,
 		       p_mmu->u_dtlb, p_mmu->u_itlb,
-		       __CONFIG_ARC_MMU_SASID_VAL ? "SASID" : "");
+		       IS_ENABLED(CONFIG_ARC_MMU_SASID) ? "SASID" : "");
 
 	return buf;
 }
 
-void __cpuinit arc_mmu_init(void)
+void arc_mmu_init(void)
 {
 	char str[256];
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
@@ -533,13 +644,6 @@ void __cpuinit arc_mmu_init(void)
 	if (mmu->pg_sz != PAGE_SIZE)
 		panic("MMU pg size != PAGE_SIZE (%luk)\n", TO_KB(PAGE_SIZE));
 
-	/*
-	 * ASID mgmt data structures are compile time init
-	 *  asid_cache = FIRST_ASID and asid_mm_map[] all zeroes
-	 */
-
-	local_flush_tlb_all();
-
 	/* Enable the MMU */
 	write_aux_reg(ARC_REG_PID, MMU_ENABLE);
 
@@ -581,9 +685,9 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
 			  struct pt_regs *regs)
 {
 	int set, way, n;
-	unsigned int pd0[4], pd1[4];	/* assume max 4 ways */
 	unsigned long flags, is_valid;
 	struct cpuinfo_arc_mmu *mmu = &cpuinfo_arc700[smp_processor_id()].mmu;
+	unsigned int pd0[mmu->ways], pd1[mmu->ways];
 
 	local_irq_save(flags);
 
@@ -608,7 +712,7 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
 			continue;
 
 		/* Scan the set for duplicate ways: needs a nested loop */
-		for (way = 0; way < mmu->ways; way++) {
+		for (way = 0; way < mmu->ways - 1; way++) {
 			if (!pd0[way])
 				continue;
 
@@ -651,25 +755,28 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
  * Low Level ASM TLB handler calls this if it finds that HW and SW ASIDS
  * don't match
  */
-void print_asid_mismatch(int is_fast_path)
+void print_asid_mismatch(int mm_asid, int mmu_asid, int is_fast_path)
 {
-	int pid_sw, pid_hw;
-	pid_sw = current->active_mm->context.asid;
-	pid_hw = read_aux_reg(ARC_REG_PID) & 0xff;
-
 	pr_emerg("ASID Mismatch in %s Path Handler: sw-pid=0x%x hw-pid=0x%x\n",
-	       is_fast_path ? "Fast" : "Slow", pid_sw, pid_hw);
+	       is_fast_path ? "Fast" : "Slow", mm_asid, mmu_asid);
 
 	__asm__ __volatile__("flag 1");
 }
 
-void tlb_paranoid_check(unsigned int pid_sw, unsigned long addr)
+void tlb_paranoid_check(unsigned int mm_asid, unsigned long addr)
 {
-	unsigned int pid_hw;
+	unsigned int mmu_asid;
 
-	pid_hw = read_aux_reg(ARC_REG_PID) & 0xff;
+	mmu_asid = read_aux_reg(ARC_REG_PID) & 0xff;
 
-	if (addr < 0x70000000 && ((pid_hw != pid_sw) || (pid_sw == NO_ASID)))
-		print_asid_mismatch(0);
+	/*
+	 * At the time of a TLB miss/installation
+	 *   - HW version needs to match SW version
+	 *   - SW needs to have a valid ASID
+	 */
+	if (addr < 0x70000000 &&
+	    ((mm_asid == MM_CTXT_NO_ASID) ||
+	      (mmu_asid != (mm_asid & MM_CTXT_ASID_MASK))))
+		print_asid_mismatch(mm_asid, mmu_asid, 0);
 }
 #endif
diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S
index 3357d26ffe5..79bfc81358c 100644
--- a/arch/arc/mm/tlbex.S
+++ b/arch/arc/mm/tlbex.S
@@ -39,22 +39,41 @@
 
 #include <linux/linkage.h>
 #include <asm/entry.h>
-#include <asm/tlb.h>
+#include <asm/mmu.h>
 #include <asm/pgtable.h>
 #include <asm/arcregs.h>
 #include <asm/cache.h>
 #include <asm/processor.h>
-#if (CONFIG_ARC_MMU_VER == 1)
 #include <asm/tlb-mmu1.h>
-#endif
 
-;--------------------------------------------------------------------------
-; scratch memory to save the registers (r0-r3) used to code TLB refill Handler
-; For details refer to comments before TLBMISS_FREEUP_REGS below
+;-----------------------------------------------------------------
+; ARC700 Exception Handling doesn't auto-switch stack and it only provides
+; ONE scratch AUX reg "ARC_REG_SCRATCH_DATA0"
+;
+; For Non-SMP, the scratch AUX reg is repurposed to cache task PGD, so a
+; "global" is used to free-up FIRST core reg to be able to code the rest of
+; exception prologue (IRQ auto-disabled on Exceptions, so it's IRQ-safe).
+; Since the Fast Path TLB Miss handler is coded with 4 regs, the remaining 3
+; need to be saved as well by extending the "global" to be 4 words. Hence
+;	".size   ex_saved_reg1, 16"
+; [All of this dance is to avoid stack switching for each TLB Miss, since we
+; only need to save only a handful of regs, as opposed to complete reg file]
+;
+; For ARC700 SMP, the "global" obviously can't be used for free up the FIRST
+; core reg as it will not be SMP safe.
+; Thus scratch AUX reg is used (and no longer used to cache task PGD).
+; To save the rest of 3 regs - per cpu, the global is made "per-cpu".
+; Epilogue thus has to locate the "per-cpu" storage for regs.
+; To avoid cache line bouncing the per-cpu global is aligned/sized per
+; L1_CACHE_SHIFT, despite fundamentally needing to be 12 bytes only. Hence
+;	".size   ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)"
+
+; As simple as that....
 ;--------------------------------------------------------------------------
 
+; scratch memory to save [r0-r3] used to code TLB refill Handler
 ARCFP_DATA ex_saved_reg1
-	.align 1 << L1_CACHE_SHIFT	; IMP: Must be Cache Line aligned
+	.align 1 << L1_CACHE_SHIFT
 	.type   ex_saved_reg1, @object
 #ifdef CONFIG_SMP
 	.size   ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)
@@ -66,6 +85,44 @@ ex_saved_reg1:
 	.zero 16
 #endif
 
+.macro TLBMISS_FREEUP_REGS
+#ifdef CONFIG_SMP
+	sr  r0, [ARC_REG_SCRATCH_DATA0]	; freeup r0 to code with
+	GET_CPU_ID  r0			; get to per cpu scratch mem,
+	lsl r0, r0, L1_CACHE_SHIFT	; cache line wide per cpu
+	add r0, @ex_saved_reg1, r0
+#else
+	st    r0, [@ex_saved_reg1]
+	mov_s r0, @ex_saved_reg1
+#endif
+	st_s  r1, [r0, 4]
+	st_s  r2, [r0, 8]
+	st_s  r3, [r0, 12]
+
+	; VERIFY if the ASID in MMU-PID Reg is same as
+	; one in Linux data structures
+
+	tlb_paranoid_check_asm
+.endm
+
+.macro TLBMISS_RESTORE_REGS
+#ifdef CONFIG_SMP
+	GET_CPU_ID  r0			; get to per cpu scratch mem
+	lsl r0, r0, L1_CACHE_SHIFT	; each is cache line wide
+	add r0, @ex_saved_reg1, r0
+	ld_s  r3, [r0,12]
+	ld_s  r2, [r0, 8]
+	ld_s  r1, [r0, 4]
+	lr    r0, [ARC_REG_SCRATCH_DATA0]
+#else
+	mov_s r0, @ex_saved_reg1
+	ld_s  r3, [r0,12]
+	ld_s  r2, [r0, 8]
+	ld_s  r1, [r0, 4]
+	ld_s  r0, [r0]
+#endif
+.endm
+
 ;============================================================================
 ;  Troubleshooting Stuff
 ;============================================================================
@@ -76,34 +133,35 @@ ex_saved_reg1:
 ; In bizzare scenrios SW and HW ASID can get out-of-sync which is trouble.
 ; So we try to detect this in TLB Mis shandler
 
-
-.macro DBG_ASID_MISMATCH
+.macro tlb_paranoid_check_asm
 
 #ifdef CONFIG_ARC_DBG_TLB_PARANOIA
 
-	; make sure h/w ASID is same as s/w ASID
-
 	GET_CURR_TASK_ON_CPU  r3
 	ld r0, [r3, TASK_ACT_MM]
 	ld r0, [r0, MM_CTXT+MM_CTXT_ASID]
+	breq r0, 0, 55f	; Error if no ASID allocated
 
 	lr r1, [ARC_REG_PID]
 	and r1, r1, 0xFF
-	breq r1, r0, 5f
 
+	and r2, r0, 0xFF	; MMU PID bits only for comparison
+	breq r1, r2, 5f
+
+55:
 	; Error if H/w and S/w ASID don't match, but NOT if in kernel mode
-	lr  r0, [erstatus]
-	bbit0 r0, STATUS_U_BIT, 5f
+	lr  r2, [erstatus]
+	bbit0 r2, STATUS_U_BIT, 5f
 
 	; We sure are in troubled waters, Flag the error, but to do so
 	; need to switch to kernel mode stack to call error routine
 	GET_TSK_STACK_BASE   r3, sp
 
 	; Call printk to shoutout aloud
-	mov r0, 1
+	mov r2, 1
 	j print_asid_mismatch
 
-5:   ; ASIDs match so proceed normally
+5:	; ASIDs match so proceed normally
 	nop
 
 #endif
@@ -147,9 +205,9 @@ ex_saved_reg1:
 #ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
 	and.f 0, r0, _PAGE_PRESENT
 	bz   1f
-	ld   r2, [num_pte_not_present]
-	add  r2, r2, 1
-	st   r2, [num_pte_not_present]
+	ld   r3, [num_pte_not_present]
+	add  r3, r3, 1
+	st   r3, [num_pte_not_present]
 1:
 #endif
 
@@ -161,13 +219,17 @@ ex_saved_reg1:
 ; IN: r0 = PTE, r1 = ptr to PTE
 
 .macro CONV_PTE_TO_TLB
-	and r3, r0, PTE_BITS_IN_PD1 ; Extract permission flags+PFN from PTE
-	sr  r3, [ARC_REG_TLBPD1]    ; these go in PD1
+	and    r3, r0, PTE_BITS_RWX	;       r w x
+	lsl    r2, r3, 3		; r w x 0 0 0
+	and.f  0,  r0, _PAGE_GLOBAL
+	or.z   r2, r2, r3		; r w x r w x
+
+	and r3, r0, PTE_BITS_NON_RWX_IN_PD1 ; Extract PFN+cache bits from PTE
+	or  r3, r3, r2
+
+	sr  r3, [ARC_REG_TLBPD1]    	; these go in PD1
 
 	and r2, r0, PTE_BITS_IN_PD0 ; Extract other PTE flags: (V)alid, (G)lb
-#if (CONFIG_ARC_MMU_VER <= 2)   /* Neednot be done with v3 onwards */
-	lsr r2, r2                  ; shift PTE flags to match layout in PD0
-#endif
 
 	lr  r3,[ARC_REG_TLBPD0]     ; MMU prepares PD0 with vaddr and asid
 
@@ -191,68 +253,6 @@ ex_saved_reg1:
 #endif
 .endm
 
-;-----------------------------------------------------------------
-; ARC700 Exception Handling doesn't auto-switch stack and it only provides
-; ONE scratch AUX reg "ARC_REG_SCRATCH_DATA0"
-;
-; For Non-SMP, the scratch AUX reg is repurposed to cache task PGD, so a
-; "global" is used to free-up FIRST core reg to be able to code the rest of
-; exception prologue (IRQ auto-disabled on Exceptions, so it's IRQ-safe).
-; Since the Fast Path TLB Miss handler is coded with 4 regs, the remaining 3
-; need to be saved as well by extending the "global" to be 4 words. Hence
-;	".size   ex_saved_reg1, 16"
-; [All of this dance is to avoid stack switching for each TLB Miss, since we
-; only need to save only a handful of regs, as opposed to complete reg file]
-;
-; For ARC700 SMP, the "global" obviously can't be used for free up the FIRST
-; core reg as it will not be SMP safe.
-; Thus scratch AUX reg is used (and no longer used to cache task PGD).
-; To save the rest of 3 regs - per cpu, the global is made "per-cpu".
-; Epilogue thus has to locate the "per-cpu" storage for regs.
-; To avoid cache line bouncing the per-cpu global is aligned/sized per
-; L1_CACHE_SHIFT, despite fundamentally needing to be 12 bytes only. Hence
-;	".size   ex_saved_reg1, (CONFIG_NR_CPUS << L1_CACHE_SHIFT)"
-
-; As simple as that....
-
-.macro TLBMISS_FREEUP_REGS
-#ifdef CONFIG_SMP
-	sr  r0, [ARC_REG_SCRATCH_DATA0]	; freeup r0 to code with
-	GET_CPU_ID  r0			; get to per cpu scratch mem,
-	lsl r0, r0, L1_CACHE_SHIFT	; cache line wide per cpu
-	add r0, @ex_saved_reg1, r0
-#else
-	st    r0, [@ex_saved_reg1]
-	mov_s r0, @ex_saved_reg1
-#endif
-	st_s  r1, [r0, 4]
-	st_s  r2, [r0, 8]
-	st_s  r3, [r0, 12]
-
-	; VERIFY if the ASID in MMU-PID Reg is same as
-	; one in Linux data structures
-
-	DBG_ASID_MISMATCH
-.endm
-
-;-----------------------------------------------------------------
-.macro TLBMISS_RESTORE_REGS
-#ifdef CONFIG_SMP
-	GET_CPU_ID  r0			; get to per cpu scratch mem
-	lsl r0, r0, L1_CACHE_SHIFT	; each is cache line wide
-	add r0, @ex_saved_reg1, r0
-	ld_s  r3, [r0,12]
-	ld_s  r2, [r0, 8]
-	ld_s  r1, [r0, 4]
-	lr    r0, [ARC_REG_SCRATCH_DATA0]
-#else
-	mov_s r0, @ex_saved_reg1
-	ld_s  r3, [r0,12]
-	ld_s  r2, [r0, 8]
-	ld_s  r1, [r0, 4]
-	ld_s  r0, [r0]
-#endif
-.endm
 
 ARCFP_CODE	;Fast Path Code, candidate for ICCM
 
@@ -260,7 +260,7 @@ ARCFP_CODE	;Fast Path Code, candidate for ICCM
 ; I-TLB Miss Exception Handler
 ;-----------------------------------------------------------------------------
 
-ARC_ENTRY EV_TLBMissI
+ENTRY(EV_TLBMissI)
 
 	TLBMISS_FREEUP_REGS
 
@@ -271,35 +271,35 @@ ARC_ENTRY EV_TLBMissI
 #endif
 
 	;----------------------------------------------------------------
-	; Get the PTE corresponding to V-addr accessed
+	; Get the PTE corresponding to V-addr accessed, r2 is setup with EFA
 	LOAD_FAULT_PTE
 
 	;----------------------------------------------------------------
 	; VERIFY_PTE: Check if PTE permissions approp for executing code
 	cmp_s   r2, VMALLOC_START
-	mov.lo  r2, (_PAGE_PRESENT | _PAGE_U_READ | _PAGE_U_EXECUTE)
-	mov.hs  r2, (_PAGE_PRESENT | _PAGE_K_READ | _PAGE_K_EXECUTE)
+	mov_s   r2, (_PAGE_PRESENT | _PAGE_EXECUTE)
+	or.hs   r2, r2, _PAGE_GLOBAL
 
 	and     r3, r0, r2  ; Mask out NON Flag bits from PTE
 	xor.f   r3, r3, r2  ; check ( ( pte & flags_test ) == flags_test )
 	bnz     do_slow_path_pf
 
 	; Let Linux VM know that the page was accessed
-	or      r0, r0, (_PAGE_PRESENT | _PAGE_ACCESSED)  ; set Accessed Bit
-	st_s    r0, [r1]                                  ; Write back PTE
+	or      r0, r0, _PAGE_ACCESSED  ; set Accessed Bit
+	st_s    r0, [r1]                ; Write back PTE
 
 	CONV_PTE_TO_TLB
 	COMMIT_ENTRY_TO_MMU
 	TLBMISS_RESTORE_REGS
 	rtie
 
-ARC_EXIT EV_TLBMissI
+END(EV_TLBMissI)
 
 ;-----------------------------------------------------------------------------
 ; D-TLB Miss Exception Handler
 ;-----------------------------------------------------------------------------
 
-ARC_ENTRY EV_TLBMissD
+ENTRY(EV_TLBMissD)
 
 	TLBMISS_FREEUP_REGS
 
@@ -311,32 +311,27 @@ ARC_ENTRY EV_TLBMissD
 
 	;----------------------------------------------------------------
 	; Get the PTE corresponding to V-addr accessed
-	; If PTE exists, it will setup, r0 = PTE, r1 = Ptr to PTE
+	; If PTE exists, it will setup, r0 = PTE, r1 = Ptr to PTE, r2 = EFA
 	LOAD_FAULT_PTE
 
 	;----------------------------------------------------------------
 	; VERIFY_PTE: Chk if PTE permissions approp for data access (R/W/R+W)
 
-	mov_s   r2, 0
+	cmp_s	r2, VMALLOC_START
+	mov_s   r2, _PAGE_PRESENT	; common bit for K/U PTE
+	or.hs	r2, r2, _PAGE_GLOBAL	; kernel PTE only
+
+	; Linux PTE [RWX] bits are semantically overloaded:
+	; -If PAGE_GLOBAL set, they refer to kernel-only flags (vmalloc)
+	; -Otherwise they are user-mode permissions, and those are exactly
+	;  same for kernel mode as well (e.g. copy_(to|from)_user)
+
 	lr      r3, [ecr]
 	btst_s  r3, ECR_C_BIT_DTLB_LD_MISS	; Read Access
-	or.nz   r2, r2, _PAGE_U_READ      	; chk for Read flag in PTE
+	or.nz   r2, r2, _PAGE_READ      	; chk for Read flag in PTE
 	btst_s  r3, ECR_C_BIT_DTLB_ST_MISS	; Write Access
-	or.nz   r2, r2, _PAGE_U_WRITE     	; chk for Write flag in PTE
-	; Above laddering takes care of XCHG access
-	;   which is both Read and Write
-
-	; If kernel mode access, ; make _PAGE_xx flags as _PAGE_K_xx
-	; For copy_(to|from)_user, despite exception taken in kernel mode,
-	; this code is not hit, because EFA would still be the user mode
-	; address (EFA < 0x6000_0000).
-	; This code is for legit kernel mode faults, vmalloc specifically
-	; (EFA: 0x7000_0000 to 0x7FFF_FFFF)
-
-	lr      r3, [efa]
-	cmp     r3, VMALLOC_START - 1   ; If kernel mode access
-	asl.hi  r2, r2, 3               ; make _PAGE_xx flags as _PAGE_K_xx
-	or      r2, r2, _PAGE_PRESENT   ; Common flag for K/U mode
+	or.nz   r2, r2, _PAGE_WRITE     	; chk for Write flag in PTE
+	; Above laddering takes care of XCHG access (both R and W)
 
 	; By now, r2 setup with all the Flags we need to check in PTE
 	and     r3, r0, r2              ; Mask out NON Flag bits from PTE
@@ -345,7 +340,7 @@ ARC_ENTRY EV_TLBMissD
 	;----------------------------------------------------------------
 	; UPDATE_PTE: Let Linux VM know that page was accessed/dirty
 	lr      r3, [ecr]
-	or      r0, r0, (_PAGE_PRESENT | _PAGE_ACCESSED) ; Accessed bit always
+	or      r0, r0, _PAGE_ACCESSED        ; Accessed bit always
 	btst_s  r3,  ECR_C_BIT_DTLB_ST_MISS   ; See if it was a Write Access ?
 	or.nz   r0, r0, _PAGE_MODIFIED        ; if Write, set Dirty bit as well
 	st_s    r0, [r1]                      ; Write back PTE
@@ -371,28 +366,11 @@ do_slow_path_pf:
 
 	; Slow path TLB Miss handled as a regular ARC Exception
 	; (stack switching / save the complete reg-file).
-	; That requires freeing up r9
-	EXCPN_PROLOG_FREEUP_REG r9
-
-	lr  r9, [erstatus]
-
-	SWITCH_TO_KERNEL_STK
-	SAVE_ALL_SYS
+	EXCEPTION_PROLOGUE
 
 	; ------- setup args for Linux Page fault Hanlder ---------
-	mov_s r0, sp
-	lr  r2, [efa]
-	lr  r3, [ecr]
-
-	; Both st and ex imply WRITE access of some sort, hence do_page_fault( )
-	; invoked with write=1 for DTLB-st/ex Miss and write=0 for ITLB miss or
-	; DTLB-ld Miss
-	; DTLB Miss Cause code is ld = 0x01 , st = 0x02, ex = 0x03
-	; Following code uses that fact that st/ex have one bit in common
-
-	btst_s r3,  ECR_C_BIT_DTLB_ST_MISS
-	mov.z  r1, 0
-	mov.nz r1, 1
+	mov_s r1, sp
+	lr    r0, [efa]
 
 	; We don't want exceptions to be disabled while the fault is handled.
 	; Now that we have saved the context we return from exception hence
@@ -403,6 +381,4 @@ do_slow_path_pf:
 	bl  do_page_fault
 	b   ret_from_exception
 
-ARC_EXIT EV_TLBMissD
-
-ARC_ENTRY EV_TLBMissB   ; Bogus entry to measure sz of DTLBMiss hdlr
+END(EV_TLBMissD)