Diffstat (limited to 'arch/powerpc/mm')
27 files changed, 895 insertions, 300 deletions
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 6747eece84a..7b6c1075017 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -287,9 +287,7 @@ void __dma_free_coherent(size_t size, void *vaddr)  			pte_clear(&init_mm, addr, ptep);  			if (pfn_valid(pfn)) {  				struct page *page = pfn_to_page(pfn); - -				ClearPageReserved(page); -				__free_page(page); +				__free_reserved_page(page);  			}  		}  		addr += PAGE_SIZE; diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 07ba45b0f07..94cd728166d 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -52,6 +52,7 @@  #include <asm/smp.h>  #include <asm/machdep.h>  #include <asm/setup.h> +#include <asm/paca.h>  #include "mmu_decl.h" @@ -171,11 +172,10 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,  	return 1UL << camsize;  } -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, +					unsigned long ram, int max_cam_idx)  {  	int i; -	unsigned long virt = PAGE_OFFSET; -	phys_addr_t phys = memstart_addr;  	unsigned long amount_mapped = 0;  	/* Calculate CAM values */ @@ -192,9 +192,23 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)  	}  	tlbcam_index = i; +#ifdef CONFIG_PPC64 +	get_paca()->tcd.esel_next = i; +	get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; +	get_paca()->tcd.esel_first = i; +#endif +  	return amount_mapped;  } +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) +{ +	unsigned long virt = PAGE_OFFSET; +	phys_addr_t phys = memstart_addr; + +	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx); +} +  #ifdef CONFIG_PPC32  #if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) @@ -222,7 +236,9 @@ void __init adjust_total_lowmem(void)  	/* adjust lowmem size to __max_low_memory */  	ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); +	i = switch_to_as1();  	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); +	restore_to_as0(i, 0, 0, 1);  	pr_info("Memory CAM mapping: ");  	for (i = 0; i < tlbcam_index - 1; i++) @@ -241,4 +257,62 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,  	/* 64M mapped initially according to head_fsl_booke.S */  	memblock_set_current_limit(min_t(u64, limit, 0x04000000));  } + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ +	unsigned long base = KERNELBASE; + +	kernstart_addr = start; +	if (is_second_reloc) { +		virt_phys_offset = PAGE_OFFSET - memstart_addr; +		return; +	} + +	/* +	 * Relocatable kernel support based on processing of dynamic +	 * relocation entries. 
Before we get the real memstart_addr, +	 * We will compute the virt_phys_offset like this: +	 * virt_phys_offset = stext.run - kernstart_addr +	 * +	 * stext.run = (KERNELBASE & ~0x3ffffff) + +	 *				(kernstart_addr & 0x3ffffff) +	 * When we relocate, we have : +	 * +	 *	(kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) +	 * +	 * hence: +	 *  virt_phys_offset = (KERNELBASE & ~0x3ffffff) - +	 *                              (kernstart_addr & ~0x3ffffff) +	 * +	 */ +	start &= ~0x3ffffff; +	base &= ~0x3ffffff; +	virt_phys_offset = base - start; +	early_get_first_memblock_info(__va(dt_ptr), NULL); +	/* +	 * We now get the memstart_addr, then we should check if this +	 * address is the same as what the PAGE_OFFSET map to now. If +	 * not we have to change the map of PAGE_OFFSET to memstart_addr +	 * and do a second relocation. +	 */ +	if (start != memstart_addr) { +		int n; +		long offset = start - memstart_addr; + +		is_second_reloc = 1; +		n = switch_to_as1(); +		/* map a 64M area for the second relocation */ +		if (memstart_addr > start) +			map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM); +		else +			map_mem_in_cams_addr(start, PAGE_OFFSET + offset, +					0x4000000, CONFIG_LOWMEM_CAM_NUM); +		restore_to_as0(n, offset, __va(dt_ptr), 1); +		/* We should never reach here */ +		panic("Relocation error"); +	} +} +#endif  #endif diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index 6936547018b..d8746684f60 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -36,6 +36,11 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,  	do {  		pte_t pte = ACCESS_ONCE(*ptep);  		struct page *page; +		/* +		 * Similar to the PMD case, NUMA hinting must take slow path +		 */ +		if (pte_numa(pte)) +			return 0;  		if ((pte_val(pte) & mask) != result)  			return 0; @@ -75,6 +80,14 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,  		if (pmd_none(pmd) || pmd_trans_splitting(pmd))  			return 0;  		if (pmd_huge(pmd) || pmd_large(pmd)) { +			/* +			 * NUMA hinting faults need to be handled in the GUP +			 * slowpath for accounting purposes and so that they +			 * can be serialised against THP migration. +			 */ +			if (pmd_numa(pmd)) +				return 0; +  			if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,  					 write, pages, nr))  				return 0; @@ -123,6 +136,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,  	struct mm_struct *mm = current->mm;  	unsigned long addr, len, end;  	unsigned long next; +	unsigned long flags;  	pgd_t *pgdp;  	int nr = 0; @@ -156,7 +170,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,  	 * So long as we atomically load page table pointers versus teardown,  	 * we can follow the address down to the the page and take a ref on it.  	 
*/ -	local_irq_disable(); +	local_irq_save(flags);  	pgdp = pgd_offset(mm, addr);  	do { @@ -179,7 +193,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,  			break;  	} while (pgdp++, addr = next, addr != end); -	local_irq_enable(); +	local_irq_restore(flags);  	return nr;  } diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index d3cbda62857..057cbbb4c57 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)  	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/  	andc	r0,r30,r0		/* r0 = pte & ~r0 */  	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */ -	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */ +	/* +	 * Always add "C" bit for perf. Memory coherence is always enabled +	 */ +	ori	r3,r3,HPTE_R_C | HPTE_R_M  	/* We eventually do the icache sync here (maybe inline that  	 * code rather than call a C function...)  @@ -156,7 +159,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)  BEGIN_FTR_SECTION  	mr	r4,r30  	mr	r5,r7 -	bl	.hash_page_do_lazy_icache +	bl	hash_page_do_lazy_icache  END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)  	/* At this point, r3 contains new PP bits, save them in @@ -198,7 +201,8 @@ htab_insert_pte:  	li	r8,MMU_PAGE_4K		/* page size */  	li	r9,MMU_PAGE_4K		/* actual page size */  	ld	r10,STK_PARAM(R9)(r1)	/* segment size */ -_GLOBAL(htab_call_hpte_insert1) +.globl htab_call_hpte_insert1 +htab_call_hpte_insert1:  	bl	.			/* Patched by htab_finish_init() */  	cmpdi	0,r3,0  	bge	htab_pte_insert_ok	/* Insertion successful */ @@ -222,7 +226,8 @@ _GLOBAL(htab_call_hpte_insert1)  	li	r8,MMU_PAGE_4K		/* page size */  	li	r9,MMU_PAGE_4K		/* actual page size */  	ld	r10,STK_PARAM(R9)(r1)	/* segment size */ -_GLOBAL(htab_call_hpte_insert2) +.globl htab_call_hpte_insert2 +htab_call_hpte_insert2:  	bl	.			/* Patched by htab_finish_init() */  	cmpdi	0,r3,0  	bge+	htab_pte_insert_ok	/* Insertion successful */ @@ -239,7 +244,8 @@ _GLOBAL(htab_call_hpte_insert2)  2:	and	r0,r5,r27  	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	  	/* Call ppc_md.hpte_remove */ -_GLOBAL(htab_call_hpte_remove) +.globl htab_call_hpte_remove +htab_call_hpte_remove:  	bl	.			/* Patched by htab_finish_init() */  	/* Try all again */ @@ -293,7 +299,8 @@ htab_modify_pte:  	li	r7,MMU_PAGE_4K		/* actual page size */  	ld	r8,STK_PARAM(R9)(r1)	/* segment size */  	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */ -_GLOBAL(htab_call_hpte_updatepp) +.globl htab_call_hpte_updatepp +htab_call_hpte_updatepp:  	bl	.			/* Patched by htab_finish_init() */  	/* if we failed because typically the HPTE wasn't really here @@ -457,7 +464,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)  	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/  	andc	r0,r3,r0		/* r0 = pte & ~r0 */  	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */ -	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */ +	/* +	 * Always add "C" bit for perf. Memory coherence is always enabled +	 */ +	ori	r3,r3,HPTE_R_C | HPTE_R_M  	/* We eventually do the icache sync here (maybe inline that  	 * code rather than call a C function...) 
@@ -465,7 +475,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)  BEGIN_FTR_SECTION  	mr	r4,r30  	mr	r5,r7 -	bl	.hash_page_do_lazy_icache +	bl	hash_page_do_lazy_icache  END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)  	/* At this point, r3 contains new PP bits, save them in @@ -520,7 +530,8 @@ htab_special_pfn:  	li	r8,MMU_PAGE_4K		/* page size */  	li	r9,MMU_PAGE_4K		/* actual page size */  	ld	r10,STK_PARAM(R9)(r1)	/* segment size */ -_GLOBAL(htab_call_hpte_insert1) +.globl htab_call_hpte_insert1 +htab_call_hpte_insert1:  	bl	.			/* patched by htab_finish_init() */  	cmpdi	0,r3,0  	bge	htab_pte_insert_ok	/* Insertion successful */ @@ -548,7 +559,8 @@ _GLOBAL(htab_call_hpte_insert1)  	li	r8,MMU_PAGE_4K		/* page size */  	li	r9,MMU_PAGE_4K		/* actual page size */  	ld	r10,STK_PARAM(R9)(r1)	/* segment size */ -_GLOBAL(htab_call_hpte_insert2) +.globl htab_call_hpte_insert2 +htab_call_hpte_insert2:  	bl	.			/* patched by htab_finish_init() */  	cmpdi	0,r3,0  	bge+	htab_pte_insert_ok	/* Insertion successful */ @@ -565,7 +577,8 @@ _GLOBAL(htab_call_hpte_insert2)  2:	and	r0,r5,r27  	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */  	/* Call ppc_md.hpte_remove */ -_GLOBAL(htab_call_hpte_remove) +.globl htab_call_hpte_remove +htab_call_hpte_remove:  	bl	.			/* patched by htab_finish_init() */  	/* Try all again */ @@ -582,7 +595,7 @@ htab_inval_old_hpte:  	li	r6,MMU_PAGE_64K		/* psize */  	ld	r7,STK_PARAM(R9)(r1)	/* ssize */  	ld	r8,STK_PARAM(R8)(r1)	/* local */ -	bl	.flush_hash_page +	bl	flush_hash_page  	/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */  	lis	r0,_PAGE_HPTE_SUB@h  	ori	r0,r0,_PAGE_HPTE_SUB@l @@ -654,7 +667,8 @@ htab_modify_pte:  	li	r7,MMU_PAGE_4K		/* actual page size */  	ld	r8,STK_PARAM(R9)(r1)	/* segment size */  	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */ -_GLOBAL(htab_call_hpte_updatepp) +.globl htab_call_hpte_updatepp +htab_call_hpte_updatepp:  	bl	.			/* patched by htab_finish_init() */  	/* if we failed because typically the HPTE wasn't really here @@ -795,7 +809,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)  	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/  	andc	r0,r30,r0		/* r0 = pte & ~r0 */  	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */ -	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */ +	/* +	 * Always add "C" bit for perf. Memory coherence is always enabled +	 */ +	ori	r3,r3,HPTE_R_C | HPTE_R_M  	/* We eventually do the icache sync here (maybe inline that  	 * code rather than call a C function...) @@ -803,7 +820,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)  BEGIN_FTR_SECTION  	mr	r4,r30  	mr	r5,r7 -	bl	.hash_page_do_lazy_icache +	bl	hash_page_do_lazy_icache  END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)  	/* At this point, r3 contains new PP bits, save them in @@ -848,7 +865,8 @@ ht64_insert_pte:  	li	r8,MMU_PAGE_64K  	li	r9,MMU_PAGE_64K		/* actual page size */  	ld	r10,STK_PARAM(R9)(r1)	/* segment size */ -_GLOBAL(ht64_call_hpte_insert1) +.globl ht64_call_hpte_insert1 +ht64_call_hpte_insert1:  	bl	.			/* patched by htab_finish_init() */  	cmpdi	0,r3,0  	bge	ht64_pte_insert_ok	/* Insertion successful */ @@ -872,7 +890,8 @@ _GLOBAL(ht64_call_hpte_insert1)  	li	r8,MMU_PAGE_64K  	li	r9,MMU_PAGE_64K		/* actual page size */  	ld	r10,STK_PARAM(R9)(r1)	/* segment size */ -_GLOBAL(ht64_call_hpte_insert2) +.globl ht64_call_hpte_insert2 +ht64_call_hpte_insert2:  	bl	.			
/* patched by htab_finish_init() */  	cmpdi	0,r3,0  	bge+	ht64_pte_insert_ok	/* Insertion successful */ @@ -889,7 +908,8 @@ _GLOBAL(ht64_call_hpte_insert2)  2:	and	r0,r5,r27  	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */  	/* Call ppc_md.hpte_remove */ -_GLOBAL(ht64_call_hpte_remove) +.globl ht64_call_hpte_remove +ht64_call_hpte_remove:  	bl	.			/* patched by htab_finish_init() */  	/* Try all again */ @@ -943,7 +963,8 @@ ht64_modify_pte:  	li	r7,MMU_PAGE_64K		/* actual page size */  	ld	r8,STK_PARAM(R9)(r1)	/* segment size */  	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */ -_GLOBAL(ht64_call_hpte_updatepp) +.globl ht64_call_hpte_updatepp +ht64_call_hpte_updatepp:  	bl	.			/* patched by htab_finish_init() */  	/* if we failed because typically the HPTE wasn't really here diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index c33d939120c..cf1d325eae8 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -35,7 +35,11 @@  #define DBG_LOW(fmt...)  #endif +#ifdef __BIG_ENDIAN__  #define HPTE_LOCK_BIT 3 +#else +#define HPTE_LOCK_BIT (56+3) +#endif  DEFINE_RAW_SPINLOCK(native_tlbie_lock); @@ -78,17 +82,14 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)  		va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);  		va |= penc << 12;  		va |= ssize << 8; -		/* Add AVAL part */ -		if (psize != apsize) { -			/* -			 * MPSS, 64K base page size and 16MB parge page size -			 * We don't need all the bits, but rest of the bits -			 * must be ignored by the processor. -			 * vpn cover upto 65 bits of va. (0...65) and we need -			 * 58..64 bits of va. -			 */ -			va |= (vpn & 0xfe); -		} +		/* +		 * AVAL bits: +		 * We don't need all the bits, but rest of the bits +		 * must be ignored by the processor. +		 * vpn cover upto 65 bits of va. (0...65) and we need +		 * 58..64 bits of va. +		 */ +		va |= (vpn & 0xfe); /* AVAL */  		va |= 1; /* L */  		asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)  			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) @@ -129,17 +130,14 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)  		va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);  		va |= penc << 12;  		va |= ssize << 8; -		/* Add AVAL part */ -		if (psize != apsize) { -			/* -			 * MPSS, 64K base page size and 16MB parge page size -			 * We don't need all the bits, but rest of the bits -			 * must be ignored by the processor. -			 * vpn cover upto 65 bits of va. (0...65) and we need -			 * 58..64 bits of va. -			 */ -			va |= (vpn & 0xfe); -		} +		/* +		 * AVAL bits: +		 * We don't need all the bits, but rest of the bits +		 * must be ignored by the processor. +		 * vpn cover upto 65 bits of va. (0...65) and we need +		 * 58..64 bits of va. 
+		 */ +		va |= (vpn & 0xfe);  		va |= 1; /* L */  		asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"  			     : : "r"(va) : "memory"); @@ -172,7 +170,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,  static inline void native_lock_hpte(struct hash_pte *hptep)  { -	unsigned long *word = &hptep->v; +	unsigned long *word = (unsigned long *)&hptep->v;  	while (1) {  		if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) @@ -184,7 +182,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)  static inline void native_unlock_hpte(struct hash_pte *hptep)  { -	unsigned long *word = &hptep->v; +	unsigned long *word = (unsigned long *)&hptep->v;  	clear_bit_unlock(HPTE_LOCK_BIT, word);  } @@ -204,10 +202,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,  	}  	for (i = 0; i < HPTES_PER_GROUP; i++) { -		if (! (hptep->v & HPTE_V_VALID)) { +		if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {  			/* retry with lock held */  			native_lock_hpte(hptep); -			if (! (hptep->v & HPTE_V_VALID)) +			if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))  				break;  			native_unlock_hpte(hptep);  		} @@ -226,14 +224,14 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,  			i, hpte_v, hpte_r);  	} -	hptep->r = hpte_r; +	hptep->r = cpu_to_be64(hpte_r);  	/* Guarantee the second dword is visible before the valid bit */  	eieio();  	/*  	 * Now set the first dword including the valid bit  	 * NOTE: this also unlocks the hpte  	 */ -	hptep->v = hpte_v; +	hptep->v = cpu_to_be64(hpte_v);  	__asm__ __volatile__ ("ptesync" : : : "memory"); @@ -254,12 +252,12 @@ static long native_hpte_remove(unsigned long hpte_group)  	for (i = 0; i < HPTES_PER_GROUP; i++) {  		hptep = htab_address + hpte_group + slot_offset; -		hpte_v = hptep->v; +		hpte_v = be64_to_cpu(hptep->v);  		if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {  			/* retry with lock held */  			native_lock_hpte(hptep); -			hpte_v = hptep->v; +			hpte_v = be64_to_cpu(hptep->v);  			if ((hpte_v & HPTE_V_VALID)  			    && !(hpte_v & HPTE_V_BOLTED))  				break; @@ -294,7 +292,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,  	native_lock_hpte(hptep); -	hpte_v = hptep->v; +	hpte_v = be64_to_cpu(hptep->v);  	/*  	 * We need to invalidate the TLB always because hpte_remove doesn't do  	 * a tlb invalidate. 
If a hash bucket gets full, we "evict" a more/less @@ -308,8 +306,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,  	} else {  		DBG_LOW(" -> hit\n");  		/* Update the HPTE */ -		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | -			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); +		hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) | +			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)));  	}  	native_unlock_hpte(hptep); @@ -334,7 +332,7 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)  	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;  	for (i = 0; i < HPTES_PER_GROUP; i++) {  		hptep = htab_address + slot; -		hpte_v = hptep->v; +		hpte_v = be64_to_cpu(hptep->v);  		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))  			/* HPTE matches */ @@ -369,8 +367,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,  	hptep = htab_address + slot;  	/* Update the HPTE */ -	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | -		(newpp & (HPTE_R_PP | HPTE_R_N)); +	hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & +			~(HPTE_R_PP | HPTE_R_N)) | +		(newpp & (HPTE_R_PP | HPTE_R_N)));  	/*  	 * Ensure it is out of the tlb too. Bolted entries base and  	 * actual page size will be same. @@ -392,7 +391,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,  	want_v = hpte_encode_avpn(vpn, bpsize, ssize);  	native_lock_hpte(hptep); -	hpte_v = hptep->v; +	hpte_v = be64_to_cpu(hptep->v);  	/*  	 * We need to invalidate the TLB always because hpte_remove doesn't do @@ -458,7 +457,7 @@ static void native_hugepage_invalidate(struct mm_struct *mm,  		hptep = htab_address + slot;  		want_v = hpte_encode_avpn(vpn, psize, ssize);  		native_lock_hpte(hptep); -		hpte_v = hptep->v; +		hpte_v = be64_to_cpu(hptep->v);  		/* Even if we miss, we need to invalidate the TLB */  		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) @@ -519,11 +518,12 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,  			int *psize, int *apsize, int *ssize, unsigned long *vpn)  {  	unsigned long avpn, pteg, vpi; -	unsigned long hpte_v = hpte->v; +	unsigned long hpte_v = be64_to_cpu(hpte->v); +	unsigned long hpte_r = be64_to_cpu(hpte->r);  	unsigned long vsid, seg_off;  	int size, a_size, shift;  	/* Look at the 8 bit LP value */ -	unsigned int lp = (hpte->r >> LP_SHIFT) & ((1 << LP_BITS) - 1); +	unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);  	if (!(hpte_v & HPTE_V_LARGE)) {  		size   = MMU_PAGE_4K; @@ -612,7 +612,7 @@ static void native_hpte_clear(void)  		 * running,  right?  and for crash dump, we probably  		 * don't want to wait for a maybe bad cpu.  		 
*/ -		hpte_v = hptep->v; +		hpte_v = be64_to_cpu(hptep->v);  		/*  		 * Call __tlbie() here rather than tlbie() since we @@ -664,7 +664,7 @@ static void native_flush_hash_range(unsigned long number, int local)  			hptep = htab_address + slot;  			want_v = hpte_encode_avpn(vpn, psize, ssize);  			native_lock_hpte(hptep); -			hpte_v = hptep->v; +			hpte_v = be64_to_cpu(hptep->v);  			if (!HPTE_V_COMPARE(hpte_v, want_v) ||  			    !(hpte_v & HPTE_V_VALID))  				native_unlock_hpte(hptep); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index bde8b558975..88fdd9d2507 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags)  	if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&  					 (pteflags & _PAGE_DIRTY)))  		rflags |= 1; - -	/* Always add C */ -	return rflags | HPTE_R_C; +	/* +	 * Always add "C" bit for perf. Memory coherence is always enabled +	 */ +	return rflags | HPTE_R_C | HPTE_R_M;  }  int htab_bolt_mapping(unsigned long vstart, unsigned long vend, @@ -206,6 +207,24 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,  		if (overlaps_kernel_text(vaddr, vaddr + step))  			tprot &= ~HPTE_R_N; +		/* Make kvm guest trampolines executable */ +		if (overlaps_kvm_tmp(vaddr, vaddr + step)) +			tprot &= ~HPTE_R_N; + +		/* +		 * If relocatable, check if it overlaps interrupt vectors that +		 * are copied down to real 0. For relocatable kernel +		 * (e.g. kdump case) we copy interrupt vectors down to real +		 * address 0. Mark that region as executable. This is +		 * because on p8 system with relocation on exception feature +		 * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence +		 * in order to execute the interrupt handlers in virtual +		 * mode the vector region need to be marked as executable. 
+		 */ +		if ((PHYSICAL_START > MEMORY_START) && +			overlaps_interrupt_vector_text(vaddr, vaddr + step)) +				tprot &= ~HPTE_R_N; +  		hash = hpt_hash(vpn, shift, ssize);  		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); @@ -250,20 +269,19 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node,  					 const char *uname, int depth,  					 void *data)  { -	char *type = of_get_flat_dt_prop(node, "device_type", NULL); -	u32 *prop; -	unsigned long size = 0; +	const char *type = of_get_flat_dt_prop(node, "device_type", NULL); +	const __be32 *prop; +	int size = 0;  	/* We are scanning "cpu" nodes only */  	if (type == NULL || strcmp(type, "cpu") != 0)  		return 0; -	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", -					  &size); +	prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);  	if (prop == NULL)  		return 0;  	for (; size >= 4; size -= 4, ++prop) { -		if (prop[0] == 40) { +		if (be32_to_cpu(prop[0]) == 40) {  			DBG("1T segment support detected\n");  			cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;  			return 1; @@ -306,24 +324,23 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,  					  const char *uname, int depth,  					  void *data)  { -	char *type = of_get_flat_dt_prop(node, "device_type", NULL); -	u32 *prop; -	unsigned long size = 0; +	const char *type = of_get_flat_dt_prop(node, "device_type", NULL); +	const __be32 *prop; +	int size = 0;  	/* We are scanning "cpu" nodes only */  	if (type == NULL || strcmp(type, "cpu") != 0)  		return 0; -	prop = (u32 *)of_get_flat_dt_prop(node, -					  "ibm,segment-page-sizes", &size); +	prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);  	if (prop != NULL) {  		pr_info("Page sizes from device-tree:\n");  		size /= 4;  		cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);  		while(size > 0) { -			unsigned int base_shift = prop[0]; -			unsigned int slbenc = prop[1]; -			unsigned int lpnum = prop[2]; +			unsigned int base_shift = be32_to_cpu(prop[0]); +			unsigned int slbenc = be32_to_cpu(prop[1]); +			unsigned int lpnum = be32_to_cpu(prop[2]);  			struct mmu_psize_def *def;  			int idx, base_idx; @@ -356,8 +373,8 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,  				def->tlbiel = 0;  			while (size > 0 && lpnum) { -				unsigned int shift = prop[0]; -				int penc  = prop[1]; +				unsigned int shift = be32_to_cpu(prop[0]); +				int penc  = be32_to_cpu(prop[1]);  				prop += 2; size -= 2;  				lpnum--; @@ -389,9 +406,9 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,  static int __init htab_dt_scan_hugepage_blocks(unsigned long node,  					const char *uname, int depth,  					void *data) { -	char *type = of_get_flat_dt_prop(node, "device_type", NULL); -	unsigned long *addr_prop; -	u32 *page_count_prop; +	const char *type = of_get_flat_dt_prop(node, "device_type", NULL); +	const __be64 *addr_prop; +	const __be32 *page_count_prop;  	unsigned int expected_pages;  	long unsigned int phys_addr;  	long unsigned int block_size; @@ -405,12 +422,12 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,  	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);  	if (page_count_prop == NULL)  		return 0; -	expected_pages = (1 << page_count_prop[0]); +	expected_pages = (1 << be32_to_cpu(page_count_prop[0]));  	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);  	if (addr_prop == NULL)  		return 0; -	phys_addr = addr_prop[0]; -	block_size = addr_prop[1]; +	phys_addr = be64_to_cpu(addr_prop[0]); +	block_size = 
be64_to_cpu(addr_prop[1]);  	if (block_size != (16 * GB))  		return 0;  	printk(KERN_INFO "Huge page(16GB) memory: " @@ -432,6 +449,24 @@ static void mmu_psize_set_default_penc(void)  			mmu_psize_defs[bpsize].penc[apsize] = -1;  } +#ifdef CONFIG_PPC_64K_PAGES + +static bool might_have_hea(void) +{ +	/* +	 * The HEA ethernet adapter requires awareness of the +	 * GX bus. Without that awareness we can easily assume +	 * we will never see an HEA ethernet device. +	 */ +#ifdef CONFIG_IBMEBUS +	return !cpu_has_feature(CPU_FTR_ARCH_207S); +#else +	return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ +  static void __init htab_init_page_sizes(void)  {  	int rc; @@ -486,10 +521,11 @@ static void __init htab_init_page_sizes(void)  			mmu_linear_psize = MMU_PAGE_64K;  		if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {  			/* -			 * Don't use 64k pages for ioremap on pSeries, since -			 * that would stop us accessing the HEA ethernet. +			 * When running on pSeries using 64k pages for ioremap +			 * would stop us accessing the HEA ethernet. So if we +			 * have the chance of ever seeing one, stay at 4k.  			 */ -			if (!machine_is(pseries)) +			if (!might_have_hea() || !machine_is(pseries))  				mmu_io_psize = MMU_PAGE_64K;  		} else  			mmu_ci_restrictions = 1; @@ -533,17 +569,17 @@ static int __init htab_dt_scan_pftsize(unsigned long node,  				       const char *uname, int depth,  				       void *data)  { -	char *type = of_get_flat_dt_prop(node, "device_type", NULL); -	u32 *prop; +	const char *type = of_get_flat_dt_prop(node, "device_type", NULL); +	const __be32 *prop;  	/* We are scanning "cpu" nodes only */  	if (type == NULL || strcmp(type, "cpu") != 0)  		return 0; -	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL); +	prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);  	if (prop != NULL) {  		/* pft_size[0] is the NUMA CEC cookie */ -		ppc64_pft_size = prop[1]; +		ppc64_pft_size = be32_to_cpu(prop[1]);  		return 1;  	}  	return 0; @@ -590,47 +626,43 @@ int remove_section_mapping(unsigned long start, unsigned long end)  }  #endif /* CONFIG_MEMORY_HOTPLUG */ -#define FUNCTION_TEXT(A)	((*(unsigned long *)(A))) +extern u32 htab_call_hpte_insert1[]; +extern u32 htab_call_hpte_insert2[]; +extern u32 htab_call_hpte_remove[]; +extern u32 htab_call_hpte_updatepp[]; +extern u32 ht64_call_hpte_insert1[]; +extern u32 ht64_call_hpte_insert2[]; +extern u32 ht64_call_hpte_remove[]; +extern u32 ht64_call_hpte_updatepp[];  static void __init htab_finish_init(void)  { -	extern unsigned int *htab_call_hpte_insert1; -	extern unsigned int *htab_call_hpte_insert2; -	extern unsigned int *htab_call_hpte_remove; -	extern unsigned int *htab_call_hpte_updatepp; -  #ifdef CONFIG_PPC_HAS_HASH_64K -	extern unsigned int *ht64_call_hpte_insert1; -	extern unsigned int *ht64_call_hpte_insert2; -	extern unsigned int *ht64_call_hpte_remove; -	extern unsigned int *ht64_call_hpte_updatepp; -  	patch_branch(ht64_call_hpte_insert1, -		FUNCTION_TEXT(ppc_md.hpte_insert), +		ppc_function_entry(ppc_md.hpte_insert),  		BRANCH_SET_LINK);  	patch_branch(ht64_call_hpte_insert2, -		FUNCTION_TEXT(ppc_md.hpte_insert), +		ppc_function_entry(ppc_md.hpte_insert),  		BRANCH_SET_LINK);  	patch_branch(ht64_call_hpte_remove, -		FUNCTION_TEXT(ppc_md.hpte_remove), +		ppc_function_entry(ppc_md.hpte_remove),  		BRANCH_SET_LINK);  	patch_branch(ht64_call_hpte_updatepp, -		FUNCTION_TEXT(ppc_md.hpte_updatepp), +		ppc_function_entry(ppc_md.hpte_updatepp),  		BRANCH_SET_LINK); -  #endif /* CONFIG_PPC_HAS_HASH_64K */  	
patch_branch(htab_call_hpte_insert1, -		FUNCTION_TEXT(ppc_md.hpte_insert), +		ppc_function_entry(ppc_md.hpte_insert),  		BRANCH_SET_LINK);  	patch_branch(htab_call_hpte_insert2, -		FUNCTION_TEXT(ppc_md.hpte_insert), +		ppc_function_entry(ppc_md.hpte_insert),  		BRANCH_SET_LINK);  	patch_branch(htab_call_hpte_remove, -		FUNCTION_TEXT(ppc_md.hpte_remove), +		ppc_function_entry(ppc_md.hpte_remove),  		BRANCH_SET_LINK);  	patch_branch(htab_call_hpte_updatepp, -		FUNCTION_TEXT(ppc_md.hpte_updatepp), +		ppc_function_entry(ppc_md.hpte_updatepp),  		BRANCH_SET_LINK);  } @@ -947,6 +979,22 @@ void hash_failure_debug(unsigned long ea, unsigned long access,  		trap, vsid, ssize, psize, lpsize, pte);  } +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, +			     int psize, bool user_region) +{ +	if (user_region) { +		if (psize != get_paca_psize(ea)) { +			get_paca()->context = mm->context; +			slb_flush_and_rebolt(); +		} +	} else if (get_paca()->vmalloc_sllp != +		   mmu_psize_defs[mmu_vmalloc_psize].sllp) { +		get_paca()->vmalloc_sllp = +			mmu_psize_defs[mmu_vmalloc_psize].sllp; +		slb_vmalloc_update(); +	} +} +  /* Result code is:   *  0 - handled   *  1 - normal page fault @@ -1068,6 +1116,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)  			WARN_ON(1);  		}  #endif +		check_paca_psize(ea, mm, psize, user_region); +  		goto bail;  	} @@ -1108,17 +1158,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)  #endif  		}  	} -	if (user_region) { -		if (psize != get_paca_psize(ea)) { -			get_paca()->context = mm->context; -			slb_flush_and_rebolt(); -		} -	} else if (get_paca()->vmalloc_sllp != -		   mmu_psize_defs[mmu_vmalloc_psize].sllp) { -		get_paca()->vmalloc_sllp = -			mmu_psize_defs[mmu_vmalloc_psize].sllp; -		slb_vmalloc_update(); -	} + +	check_paca_psize(ea, mm, psize, user_region);  #endif /* CONFIG_PPC_64K_PAGES */  #ifdef CONFIG_PPC_HAS_HASH_64K diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 34de9e0cdc3..826893fcb3a 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -127,7 +127,11 @@ repeat:  		/* Add in WIMG bits */  		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | -				      _PAGE_COHERENT | _PAGE_GUARDED)); +				      _PAGE_GUARDED)); +		/* +		 * enable the memory coherence always +		 */ +		rflags |= HPTE_R_M;  		/* Insert into the hash table, primary slot */  		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c index 3bc700655fc..5e4ee257390 100644 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -8,6 +8,44 @@  #include <linux/mm.h>  #include <linux/hugetlb.h> +#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC64 +static inline int tlb1_next(void) +{ +	struct paca_struct *paca = get_paca(); +	struct tlb_core_data *tcd; +	int this, next; + +	tcd = paca->tcd_ptr; +	this = tcd->esel_next; + +	next = this + 1; +	if (next >= tcd->esel_max) +		next = tcd->esel_first; + +	tcd->esel_next = next; +	return this; +} +#else +static inline int tlb1_next(void) +{ +	int index, ncams; + +	ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + +	index = __get_cpu_var(next_tlbcam_idx); + +	/* Just round-robin the entries and wrap when we hit the end */ +	if (unlikely(index == ncams - 1)) +		__get_cpu_var(next_tlbcam_idx) = tlbcam_index; +	else +		__get_cpu_var(next_tlbcam_idx)++; + +	return index; +} +#endif /* !PPC64 
*/ +#endif /* FSL */ +  static inline int mmu_get_tsize(int psize)  {  	return mmu_psize_defs[psize].enc; @@ -47,7 +85,7 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,  	struct mm_struct *mm;  #ifdef CONFIG_PPC_FSL_BOOK3E -	int index, ncams; +	int index;  #endif  	if (unlikely(is_kernel_addr(ea))) @@ -77,18 +115,11 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,  	}  #ifdef CONFIG_PPC_FSL_BOOK3E -	ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; -  	/* We have to use the CAM(TLB1) on FSL parts for hugepages */ -	index = __get_cpu_var(next_tlbcam_idx); +	index = tlb1_next();  	mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); - -	/* Just round-robin the entries and wrap when we hit the end */ -	if (unlikely(index == ncams - 1)) -		__get_cpu_var(next_tlbcam_idx) = tlbcam_index; -	else -		__get_cpu_var(next_tlbcam_idx)++;  #endif +  	mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);  	mas2 = ea & ~((1UL << shift) - 1);  	mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; @@ -103,7 +134,8 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,  	if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {  		mtspr(SPRN_MAS7_MAS3, mas7_3);  	} else { -		mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); +		if (mmu_has_feature(MMU_FTR_BIG_PHYS)) +			mtspr(SPRN_MAS7, upper_32_bits(mas7_3));  		mtspr(SPRN_MAS3, lower_32_bits(mas7_3));  	} @@ -117,6 +149,5 @@ void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)  	struct hstate *hstate = hstate_file(vma->vm_file);  	unsigned long tsize = huge_page_shift(hstate) - 10; -	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0); - +	__flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);  } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 0b7fb676101..a5bcf930119 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -99,6 +99,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,  		/* Add in WIMG bits */  		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |  				      _PAGE_COHERENT | _PAGE_GUARDED)); +		/* +		 * enable the memory coherence always +		 */ +		rflags |= HPTE_R_M;  		slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,  					     mmu_psize, ssize); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d67db4bd672..7e70ae968e5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -86,11 +86,6 @@ int pgd_huge(pgd_t pgd)  	 */  	return ((pgd_val(pgd) & 0x3) != 0x0);  } - -int pmd_huge_support(void) -{ -	return 1; -}  #else  int pmd_huge(pmd_t pmd)  { @@ -106,11 +101,6 @@ int pgd_huge(pgd_t pgd)  {  	return 0;  } - -int pmd_huge_support(void) -{ -	return 0; -}  #endif  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -472,12 +462,13 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)  {  	struct hugepd_freelist **batchp; -	batchp = &__get_cpu_var(hugepd_freelist_cur); +	batchp = &get_cpu_var(hugepd_freelist_cur);  	if (atomic_read(&tlb->mm->mm_users) < 2 ||  	    cpumask_equal(mm_cpumask(tlb->mm),  			  cpumask_of(smp_processor_id()))) {  		kmem_cache_free(hugepte_cache, hugepte); +        put_cpu_var(hugepd_freelist_cur);  		return;  	} @@ -491,6 +482,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)  		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);  		*batchp = NULL;  	} +	
put_cpu_var(hugepd_freelist_cur);  }  #endif @@ -633,8 +625,6 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,  /*   * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held.   */  void hugetlb_free_pgd_range(struct mmu_gather *tlb,  			    unsigned long addr, unsigned long end, diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index d47d3dab487..cff59f1bec2 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -213,7 +213,12 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,  	 */  	BUG_ON(first_memblock_base != 0); +#ifdef CONFIG_PIN_TLB +	/* 8xx can only access 24MB at the moment */ +	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); +#else  	/* 8xx can only access 8MB at the moment */  	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); +#endif  }  #endif /* CONFIG_8xx */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index d0cd9e4c683..e3734edffa6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -300,5 +300,58 @@ void vmemmap_free(unsigned long start, unsigned long end)  {  } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ +void register_page_bootmem_memmap(unsigned long section_nr, +				  struct page *start_page, unsigned long size) +{ +} + +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * realmode_pfn_to_page functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. 
+ */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ +	struct vmemmap_backing *vmem_back; +	struct page *page; +	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; +	unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + +	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { +		if (pg_va < vmem_back->virt_addr) +			continue; + +		/* Check that page struct is not split between real pages */ +		if ((pg_va + sizeof(struct page)) > +				(vmem_back->virt_addr + page_size)) +			return NULL; + +		page = (struct page *) (vmem_back->phys + pg_va - +				vmem_back->virt_addr); +		return page; +	} + +	return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ +	struct page *page = pfn_to_page(pfn); +	return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 1cf9c5b67f2..2c8e90f5789 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -139,9 +139,14 @@ int arch_remove_memory(u64 start, u64 size)  	unsigned long start_pfn = start >> PAGE_SHIFT;  	unsigned long nr_pages = size >> PAGE_SHIFT;  	struct zone *zone; +	int ret;  	zone = page_zone(pfn_to_page(start_pfn)); -	return __remove_pages(zone, start_pfn, nr_pages); +	ret = __remove_pages(zone, start_pfn, nr_pages); +	if (!ret && (ppc_md.remove_memory)) +		ret = ppc_md.remove_memory(start, size); + +	return ret;  }  #endif  #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -209,7 +214,7 @@ void __init do_init_bootmem(void)  	/* Place all memblock_regions in the same node and merge contiguous  	 * memblock_regions  	 */ -	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); +	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);  	/* Add all physical memory to the bootmem map, mark each area  	 * present. @@ -297,12 +302,27 @@ void __init paging_init(void)  }  #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ +static void __init register_page_bootmem_info(void) +{ +	int i; + +	for_each_online_node(i) +		register_page_bootmem_info_node(NODE_DATA(i)); +} +  void __init mem_init(void)  { +	/* +	 * book3s is limited to 16 page sizes due to encoding this in +	 * a 4-bit field for slices. +	 */ +	BUILD_BUG_ON(MMU_PAGE_COUNT > 16); +  #ifdef CONFIG_SWIOTLB  	swiotlb_init(0);  #endif +	register_page_bootmem_info();  	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);  	set_max_mapnr(max_pfn);  	free_all_bootmem(); @@ -498,7 +518,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,   * System memory should not be in /proc/iomem but various tools expect it   * (eg kdump).   
*/ -static int add_system_ram_resources(void) +static int __init add_system_ram_resources(void)  {  	struct memblock_region *reg; diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index af3d78e1930..928ebe79668 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -410,17 +410,7 @@ void __init mmu_context_init(void)  	} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {  		first_context = 1;  		last_context = 65535; -	} else -#ifdef CONFIG_PPC_BOOK3E_MMU -	if (mmu_has_feature(MMU_FTR_TYPE_3E)) { -		u32 mmucfg = mfspr(SPRN_MMUCFG); -		u32 pid_bits = (mmucfg & MMUCFG_PIDSIZE_MASK) -				>> MMUCFG_PIDSIZE_SHIFT; -		first_context = 1; -		last_context = (1UL << (pid_bits + 1)) - 1; -	} else -#endif -	{ +	} else {  		first_context = 1;  		last_context = 255;  	} diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 83eb5d5f53d..9615d82919b 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -148,6 +148,8 @@ extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,  extern void MMU_init_hw(void);  extern unsigned long mmu_mapin_ram(unsigned long top);  extern void adjust_total_lowmem(void); +extern int switch_to_as1(void); +extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);  #endif  extern void loadcam_entry(unsigned int index); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index c916127f10c..3b181b22cd4 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -31,6 +31,8 @@  #include <asm/sparsemem.h>  #include <asm/prom.h>  #include <asm/smp.h> +#include <asm/cputhreads.h> +#include <asm/topology.h>  #include <asm/firmware.h>  #include <asm/paca.h>  #include <asm/hvcall.h> @@ -152,9 +154,22 @@ static void __init get_node_active_region(unsigned long pfn,  	}  } -static void map_cpu_to_node(int cpu, int node) +static void reset_numa_cpu_lookup_table(void) +{ +	unsigned int cpu; + +	for_each_possible_cpu(cpu) +		numa_cpu_lookup_table[cpu] = -1; +} + +static void update_numa_cpu_lookup_table(unsigned int cpu, int node)  {  	numa_cpu_lookup_table[cpu] = node; +} + +static void map_cpu_to_node(int cpu, int node) +{ +	update_numa_cpu_lookup_table(cpu, node);  	dbg("adding cpu %d to node %d\n", cpu, node); @@ -195,7 +210,7 @@ static const __be32 *of_get_usable_memory(struct device_node *memory)  	u32 len;  	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);  	if (!prop || len < sizeof(unsigned int)) -		return 0; +		return NULL;  	return prop;  } @@ -217,6 +232,7 @@ int __node_distance(int a, int b)  	return distance;  } +EXPORT_SYMBOL(__node_distance);  static void initialize_distance_lookup_table(int nid,  		const __be32 *associativity) @@ -522,11 +538,24 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,   */  static int numa_setup_cpu(unsigned long lcpu)  { -	int nid = 0; -	struct device_node *cpu = of_get_cpu_node(lcpu, NULL); +	int nid; +	struct device_node *cpu; + +	/* +	 * If a valid cpu-to-node mapping is already available, use it +	 * directly instead of querying the firmware, since it represents +	 * the most recent mapping notified to us by the platform (eg: VPHN). 
+	 */ +	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) { +		map_cpu_to_node(lcpu, nid); +		return nid; +	} + +	cpu = of_get_cpu_node(lcpu, NULL);  	if (!cpu) {  		WARN_ON(1); +		nid = 0;  		goto out;  	} @@ -542,16 +571,38 @@ out:  	return nid;  } +static void verify_cpu_node_mapping(int cpu, int node) +{ +	int base, sibling, i; + +	/* Verify that all the threads in the core belong to the same node */ +	base = cpu_first_thread_sibling(cpu); + +	for (i = 0; i < threads_per_core; i++) { +		sibling = base + i; + +		if (sibling == cpu || cpu_is_offline(sibling)) +			continue; + +		if (cpu_to_node(sibling) != node) { +			WARN(1, "CPU thread siblings %d and %d don't belong" +				" to the same node!\n", cpu, sibling); +			break; +		} +	} +} +  static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,  			     void *hcpu)  {  	unsigned long lcpu = (unsigned long)hcpu; -	int ret = NOTIFY_DONE; +	int ret = NOTIFY_DONE, nid;  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		numa_setup_cpu(lcpu); +		nid = numa_setup_cpu(lcpu); +		verify_cpu_node_mapping((int)lcpu, nid);  		ret = NOTIFY_OK;  		break;  #ifdef CONFIG_HOTPLUG_CPU @@ -670,7 +721,8 @@ static void __init parse_drconf_memory(struct device_node *memory)  			node_set_online(nid);  			sz = numa_enforce_memory_limit(base, size);  			if (sz) -				memblock_set_node(base, sz, nid); +				memblock_set_node(base, sz, +						  &memblock.memory, nid);  		} while (--ranges);  	}  } @@ -760,7 +812,7 @@ new_range:  				continue;  		} -		memblock_set_node(start, size, nid); +		memblock_set_node(start, size, &memblock.memory, nid);  		if (--ranges)  			goto new_range; @@ -797,7 +849,8 @@ static void __init setup_nonnuma(void)  		fake_numa_create_new_node(end_pfn, &nid);  		memblock_set_node(PFN_PHYS(start_pfn), -				  PFN_PHYS(end_pfn - start_pfn), nid); +				  PFN_PHYS(end_pfn - start_pfn), +				  &memblock.memory, nid);  		node_set_online(nid);  	}  } @@ -938,8 +991,7 @@ static void __init mark_reserved_regions_for_nid(int nid)  		unsigned long start_pfn = physbase >> PAGE_SHIFT;  		unsigned long end_pfn = PFN_UP(physbase + size);  		struct node_active_region node_ar; -		unsigned long node_end_pfn = node->node_start_pfn + -					     node->node_spanned_pages; +		unsigned long node_end_pfn = pgdat_end_pfn(node);  		/*  		 * Check to make sure that this memblock.reserved area is @@ -1068,6 +1120,7 @@ void __init do_init_bootmem(void)  	 */  	setup_node_to_cpumask_map(); +	reset_numa_cpu_lookup_table();  	register_cpu_notifier(&ppc64_numa_nb);  	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,  			  (void *)(unsigned long)boot_cpuid); @@ -1154,7 +1207,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,   * represented in the device tree as a node (i.e. memory@XXXX) for   * each memblock.   
*/ -int hot_add_node_scn_to_nid(unsigned long scn_addr) +static int hot_add_node_scn_to_nid(unsigned long scn_addr)  {  	struct device_node *memory;  	int nid = -1; @@ -1235,7 +1288,7 @@ static u64 hot_add_drconf_memory_max(void)          struct device_node *memory = NULL;          unsigned int drconf_cell_cnt = 0;          u64 lmb_size = 0; -	const __be32 *dm = 0; +	const __be32 *dm = NULL;          memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");          if (memory) { @@ -1446,6 +1499,33 @@ static int update_cpu_topology(void *data)  	return 0;  } +static int update_lookup_table(void *data) +{ +	struct topology_update_data *update; + +	if (!data) +		return -EINVAL; + +	/* +	 * Upon topology update, the numa-cpu lookup table needs to be updated +	 * for all threads in the core, including offline CPUs, to ensure that +	 * future hotplug operations respect the cpu-to-node associativity +	 * properly. +	 */ +	for (update = data; update; update = update->next) { +		int nid, base, j; + +		nid = update->new_nid; +		base = cpu_first_thread_sibling(update->cpu); + +		for (j = 0; j < threads_per_core; j++) { +			update_numa_cpu_lookup_table(base + j, nid); +		} +	} + +	return 0; +} +  /*   * Update the node maps and sysfs entries for each cpu whose home node   * has changed. Returns 1 when the topology has changed, and 0 otherwise. @@ -1512,8 +1592,30 @@ int arch_update_cpu_topology(void)  		cpu = cpu_last_thread_sibling(cpu);  	} +	/* +	 * In cases where we have nothing to update (because the updates list +	 * is too short or because the new topology is same as the old one), +	 * skip invoking update_cpu_topology() via stop-machine(). This is +	 * necessary (and not just a fast-path optimization) since stop-machine +	 * can end up electing a random CPU to run update_cpu_topology(), and +	 * thus trick us into setting up incorrect cpu-node mappings (since +	 * 'updates' is kzalloc()'ed). +	 * +	 * And for the similar reason, we will skip all the following updating. +	 */ +	if (!cpumask_weight(&updated_cpus)) +		goto out; +  	stop_machine(update_cpu_topology, &updates[0], &updated_cpus); +	/* +	 * Update the numa-cpu lookup table with the new mappings, even for +	 * offline CPUs. It is best to perform this update from the stop- +	 * machine context. 
+	 */ +	stop_machine(update_lookup_table, &updates[0], +					cpumask_of(raw_smp_processor_id())); +  	for (ud = &updates[0]; ud; ud = ud->next) {  		unregister_cpu_under_node(ud->cpu, ud->old_nid);  		register_cpu_under_node(ud->cpu, ud->new_nid); @@ -1525,6 +1627,7 @@ int arch_update_cpu_topology(void)  		changed = 1;  	} +out:  	kfree(updates);  	return changed;  } @@ -1535,7 +1638,7 @@ static void topology_work_fn(struct work_struct *work)  }  static DECLARE_WORK(topology_work, topology_work_fn); -void topology_schedule_update(void) +static void topology_schedule_update(void)  {  	schedule_work(&topology_work);  } @@ -1698,7 +1801,7 @@ static const struct file_operations topology_ops = {  static int topology_update_init(void)  {  	start_topology_update(); -	proc_create("powerpc/topology_updates", 644, NULL, &topology_ops); +	proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops);  	return 0;  } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index edda589795c..c695943a513 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -24,7 +24,6 @@  #include <linux/kernel.h>  #include <linux/gfp.h>  #include <linux/mm.h> -#include <linux/init.h>  #include <linux/percpu.h>  #include <linux/hardirq.h>  #include <linux/hugetlb.h> @@ -32,8 +31,6 @@  #include <asm/tlbflush.h>  #include <asm/tlb.h> -#include "mmu_decl.h" -  static inline int is_exec_fault(void)  {  	return current->thread.regs && TRAP(current->thread.regs) == 0x400; @@ -72,7 +69,7 @@ struct page * maybe_pte_to_page(pte_t pte)   * support falls into the same category.   */ -static pte_t set_pte_filter(pte_t pte, unsigned long addr) +static pte_t set_pte_filter(pte_t pte)  {  	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);  	if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || @@ -81,17 +78,6 @@ static pte_t set_pte_filter(pte_t pte, unsigned long addr)  		if (!pg)  			return pte;  		if (!test_bit(PG_arch_1, &pg->flags)) { -#ifdef CONFIG_8xx -			/* On 8xx, cache control instructions (particularly -			 * "dcbst" from flush_dcache_icache) fault as write -			 * operation if there is an unpopulated TLB entry -			 * for the address in question. To workaround that, -			 * we invalidate the TLB here, thus avoiding dcbst -			 * misbehaviour. -			 */ -			/* 8xx doesn't care about PID, size or ind args */ -			_tlbil_va(addr, 0, 0, 0); -#endif /* CONFIG_8xx */  			flush_dcache_icache_page(pg);  			set_bit(PG_arch_1, &pg->flags);  		} @@ -111,7 +97,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,   * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so   * instead we "filter out" the exec permission for non clean pages.   */ -static pte_t set_pte_filter(pte_t pte, unsigned long addr) +static pte_t set_pte_filter(pte_t pte)  {  	struct page *pg; @@ -187,13 +173,13 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,  		pte_t pte)  {  #ifdef CONFIG_DEBUG_VM -	WARN_ON(pte_present(*ptep)); +	WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);  #endif  	/* Note: mm->context.id might not yet have been assigned as  	 * this context might not have been activated yet when this  	 * is called.  	 
*/ -	pte = set_pte_filter(pte, addr); +	pte = set_pte_filter(pte);  	/* Perform the setting of the PTE */  	__set_pte_at(mm, addr, ptep, pte, 0); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6c856fb8c15..343a87fa78b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -121,7 +121,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)  	ptepage = alloc_pages(flags, 0);  	if (!ptepage)  		return NULL; -	pgtable_page_ctor(ptepage); +	if (!pgtable_page_ctor(ptepage)) { +		__free_page(ptepage); +		return NULL; +	}  	return ptepage;  } @@ -296,6 +299,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)  		set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,  						     __pgprot(flags)));  	} +	smp_wmb();  	return err;  } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 536eec72c0f..f6ce1f111f5 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -33,7 +33,6 @@  #include <linux/swap.h>  #include <linux/stddef.h>  #include <linux/vmalloc.h> -#include <linux/init.h>  #include <linux/bootmem.h>  #include <linux/memblock.h>  #include <linux/slab.h> @@ -153,6 +152,18 @@ int map_kernel_page(unsigned long ea, unsigned long pa, int flags)  		}  #endif /* !CONFIG_PPC_MMU_NOHASH */  	} + +#ifdef CONFIG_PPC_BOOK3E_64 +	/* +	 * With hardware tablewalk, a sync is needed to ensure that +	 * subsequent accesses see the PTE we just wrote.  Unlike userspace +	 * mappings, we can't tolerate spurious faults, so make sure +	 * the new PTE will be seen the first time. +	 */ +	mb(); +#else +	smp_wmb(); +#endif  	return 0;  } @@ -378,6 +389,10 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)  				       __GFP_REPEAT | __GFP_ZERO);  	if (!page)  		return NULL; +	if (!kernel && !pgtable_page_ctor(page)) { +		__free_page(page); +		return NULL; +	}  	ret = page_address(page);  	spin_lock(&mm->page_table_lock); @@ -392,9 +407,6 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)  	}  	spin_unlock(&mm->page_table_lock); -	if (!kernel) -		pgtable_page_ctor(page); -  	return (pte_t *)ret;  } @@ -498,7 +510,8 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,  }  unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, -				  pmd_t *pmdp, unsigned long clr) +				  pmd_t *pmdp, unsigned long clr, +				  unsigned long set)  {  	unsigned long old, tmp; @@ -514,14 +527,15 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,  		andi.	%1,%0,%6\n\  		bne-	1b \n\  		andc	%1,%0,%4 \n\ +		or	%1,%1,%7\n\  		stdcx.	%1,0,%3 \n\  		bne-	1b"  	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp) -	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY) +	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)  	: "cc" );  #else  	old = pmd_val(*pmdp); -	*pmdp = __pmd(old & ~clr); +	*pmdp = __pmd((old & ~clr) | set);  #endif  	if (old & _PAGE_HASHPTE)  		hpte_do_hugepage_flush(mm, addr, pmdp); @@ -633,6 +647,11 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,  		if (old & _PAGE_HASHPTE)  			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);  	} +	/* +	 * This ensures that generic code that rely on IRQ disabling +	 * to prevent a parallel THP split work as expected. 
+	 */ +	kick_all_cpus_sync();  }  /* @@ -686,7 +705,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,  		pmd_t *pmdp, pmd_t pmd)  {  #ifdef CONFIG_DEBUG_VM -	WARN_ON(!pmd_none(*pmdp)); +	WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);  	assert_spin_locked(&mm->page_table_lock);  	WARN_ON(!pmd_trans_huge(pmd));  #endif @@ -696,7 +715,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,  void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,  		     pmd_t *pmdp)  { -	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT); +	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);  }  /* @@ -823,7 +842,7 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,  	unsigned long old;  	pgtable_t *pgtable_slot; -	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL); +	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);  	old_pmd = __pmd(old);  	/*  	 * We have pmd == none and we are holding page_table_lock. diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 9d1d33cd2be..0399a670295 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -97,7 +97,7 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,  static void __slb_flush_and_rebolt(void)  {  	/* If you change this make sure you change SLB_NUM_BOLTED -	 * appropriately too. */ +	 * and PR KVM appropriately too. */  	unsigned long linear_llp, vmalloc_llp, lflags, vflags;  	unsigned long ksp_esid_data, ksp_vsid_data; @@ -256,10 +256,14 @@ static inline void patch_slb_encoding(unsigned int *insn_addr,  	patch_instruction(insn_addr, insn);  } +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_linear[]; +extern u32 slb_miss_kernel_load_io[]; +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_vmemmap[]; +  void slb_set_size(u16 size)  { -	extern unsigned int *slb_compare_rr_to_size; -  	if (mmu_slb_size == size)  		return; @@ -272,11 +276,7 @@ void slb_initialize(void)  	unsigned long linear_llp, vmalloc_llp, io_llp;  	unsigned long lflags, vflags;  	static int slb_encoding_inited; -	extern unsigned int *slb_miss_kernel_load_linear; -	extern unsigned int *slb_miss_kernel_load_io; -	extern unsigned int *slb_compare_rr_to_size;  #ifdef CONFIG_SPARSEMEM_VMEMMAP -	extern unsigned int *slb_miss_kernel_load_vmemmap;  	unsigned long vmemmap_llp;  #endif diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 17aa6dfceb3..736d18b3cef 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode)  	 * check for bad kernel/user address  	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE  	 */ -	rldicr. r9,r3,4,(63 - 46 - 4) +	rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)  	bne-	8f  	srdi	r9,r3,60		/* get region */ @@ -59,7 +59,8 @@ _GLOBAL(slb_allocate_realmode)  	/* Linear mapping encoding bits, the "li" instruction below will  	 * be patched by the kernel at boot  	 */ -_GLOBAL(slb_miss_kernel_load_linear) +.globl slb_miss_kernel_load_linear +slb_miss_kernel_load_linear:  	li	r11,0  	/*  	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 @@ -79,7 +80,8 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)  	/* Check virtual memmap region. 
To be patches at kernel boot */  	cmpldi	cr0,r9,0xf  	bne	1f -_GLOBAL(slb_miss_kernel_load_vmemmap) +.globl slb_miss_kernel_load_vmemmap +slb_miss_kernel_load_vmemmap:  	li	r11,0  	b	6f  1: @@ -95,7 +97,8 @@ _GLOBAL(slb_miss_kernel_load_vmemmap)  	b	6f  5:  	/* IO mapping */ -	_GLOBAL(slb_miss_kernel_load_io) +.globl slb_miss_kernel_load_io +slb_miss_kernel_load_io:  	li	r11,0  6:  	/* @@ -250,7 +253,8 @@ slb_finish_load:  7:	ld	r10,PACASTABRR(r13)  	addi	r10,r10,1  	/* This gets soft patched on boot. */ -_GLOBAL(slb_compare_rr_to_size) +.globl slb_compare_rr_to_size +slb_compare_rr_to_size:  	cmpldi	r10,0  	blt+	4f diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 3e99c149271..b0c75cc15ef 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -258,7 +258,7 @@ static bool slice_scan_available(unsigned long addr,  		slice = GET_HIGH_SLICE_INDEX(addr);  		*boundary_addr = (slice + end) ?  			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; -		return !!(available.high_slices & (1u << slice)); +		return !!(available.high_slices & (1ul << slice));  	}  } @@ -408,7 +408,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,  	if (fixed && (addr & ((1ul << pshift) - 1)))  		return -EINVAL;  	if (fixed && addr > (mm->task_size - len)) -		return -EINVAL; +		return -ENOMEM;  	/* If hint, make sure it matches our alignment restrictions */  	if (!fixed && addr) { diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index a770df2dae7..6c0b1f5f8d2 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -78,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,  	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);  	arch_enter_lazy_mmu_mode();  	for (; npages > 0; --npages) { -		pte_update(mm, addr, pte, 0, 0); +		pte_update(mm, addr, pte, 0, 0, 0);  		addr += PAGE_SIZE;  		++pte;  	} diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 36e44b4260e..c99f6510a0b 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -23,7 +23,6 @@  #include <linux/kernel.h>  #include <linux/mm.h> -#include <linux/init.h>  #include <linux/percpu.h>  #include <linux/hardirq.h>  #include <asm/pgalloc.h> diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index b4113bf8635..356e8b41fb0 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -39,37 +39,49 @@   *                                                                    *   **********************************************************************/ +/* + * Note that, unlike non-bolted handlers, TLB_EXFRAME is not + * modified by the TLB miss handlers themselves, since the TLB miss + * handler code will not itself cause a recursive TLB miss. + * + * TLB_EXFRAME will be modified when crit/mc/debug exceptions are + * entered/exited. 
+ */  .macro tlb_prolog_bolted intnum addr -	mtspr	SPRN_SPRG_GEN_SCRATCH,r13 +	mtspr	SPRN_SPRG_GEN_SCRATCH,r12 +	mfspr	r12,SPRN_SPRG_TLB_EXFRAME +	std	r13,EX_TLB_R13(r12) +	std	r10,EX_TLB_R10(r12)  	mfspr	r13,SPRN_SPRG_PACA -	std	r10,PACA_EXTLB+EX_TLB_R10(r13) +  	mfcr	r10 -	std	r11,PACA_EXTLB+EX_TLB_R11(r13) +	std	r11,EX_TLB_R11(r12)  #ifdef CONFIG_KVM_BOOKE_HV  BEGIN_FTR_SECTION  	mfspr	r11, SPRN_SRR1  END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)  #endif  	DO_KVM	\intnum, SPRN_SRR1 -	std	r16,PACA_EXTLB+EX_TLB_R16(r13) +	std	r16,EX_TLB_R16(r12)  	mfspr	r16,\addr		/* get faulting address */ -	std	r14,PACA_EXTLB+EX_TLB_R14(r13) +	std	r14,EX_TLB_R14(r12)  	ld	r14,PACAPGD(r13) -	std	r15,PACA_EXTLB+EX_TLB_R15(r13) -	std	r10,PACA_EXTLB+EX_TLB_CR(r13) -	TLB_MISS_PROLOG_STATS_BOLTED +	std	r15,EX_TLB_R15(r12) +	std	r10,EX_TLB_CR(r12) +	TLB_MISS_PROLOG_STATS  .endm  .macro tlb_epilog_bolted -	ld	r14,PACA_EXTLB+EX_TLB_CR(r13) -	ld	r10,PACA_EXTLB+EX_TLB_R10(r13) -	ld	r11,PACA_EXTLB+EX_TLB_R11(r13) +	ld	r14,EX_TLB_CR(r12) +	ld	r10,EX_TLB_R10(r12) +	ld	r11,EX_TLB_R11(r12) +	ld	r13,EX_TLB_R13(r12)  	mtcr	r14 -	ld	r14,PACA_EXTLB+EX_TLB_R14(r13) -	ld	r15,PACA_EXTLB+EX_TLB_R15(r13) -	TLB_MISS_RESTORE_STATS_BOLTED -	ld	r16,PACA_EXTLB+EX_TLB_R16(r13) -	mfspr	r13,SPRN_SPRG_GEN_SCRATCH +	ld	r14,EX_TLB_R14(r12) +	ld	r15,EX_TLB_R15(r12) +	TLB_MISS_RESTORE_STATS +	ld	r16,EX_TLB_R16(r12) +	mfspr	r12,SPRN_SPRG_GEN_SCRATCH  .endm  /* Data TLB miss */ @@ -136,7 +148,7 @@ BEGIN_MMU_FTR_SECTION  	 */  	PPC_TLBSRX_DOT(0,R16)  	ldx	r14,r14,r15		/* grab pgd entry */ -	beq	normal_tlb_miss_done	/* tlb exists already, bail */ +	beq	tlb_miss_done_bolted	/* tlb exists already, bail */  MMU_FTR_SECTION_ELSE  	ldx	r14,r14,r15		/* grab pgd entry */  ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) @@ -192,6 +204,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)  	mtspr	SPRN_MAS7_MAS3,r15  	tlbwe +tlb_miss_done_bolted:  	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)  	tlb_epilog_bolted  	rfi @@ -239,6 +252,183 @@ itlb_miss_fault_bolted:  	beq	tlb_miss_common_bolted  	b	itlb_miss_kernel_bolted +#ifdef CONFIG_PPC_FSL_BOOK3E +/* + * TLB miss handling for e6500 and derivatives, using hardware tablewalk. + * + * Linear mapping is bolted: no virtual page table or nested TLB misses + * Indirect entries in TLB1, hardware loads resulting direct entries + *    into TLB0 + * No HES or NV hint on TLB1, so we need to do software round-robin + * No tlbsrx. so we need a spinlock, and we have to deal + *    with MAS-damage caused by tlbsx + * 4K pages only + */ + +	START_EXCEPTION(instruction_tlb_miss_e6500) +	tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + +	ld	r11,PACA_TCD_PTR(r13) +	srdi.	r15,r16,60		/* get region */ +	ori	r16,r16,1 + +	TLB_MISS_STATS_SAVE_INFO_BOLTED +	bne	tlb_miss_kernel_e6500	/* user/kernel test */ + +	b	tlb_miss_common_e6500 + +	START_EXCEPTION(data_tlb_miss_e6500) +	tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + +	ld	r11,PACA_TCD_PTR(r13) +	srdi.	r15,r16,60		/* get region */ +	rldicr	r16,r16,0,62 + +	TLB_MISS_STATS_SAVE_INFO_BOLTED +	bne	tlb_miss_kernel_e6500	/* user vs kernel check */ + +/* + * This is the guts of the TLB miss handler for e6500 and derivatives. 
+ * We are entered with: + * + * r16 = page of faulting address (low bit 0 if data, 1 if instruction) + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = tlb_per_core ptr + * r10 = cpu number + */ +tlb_miss_common_e6500: +	/* +	 * Search if we already have an indirect entry for that virtual +	 * address, and if we do, bail out. +	 * +	 * MAS6:IND should be already set based on MAS4 +	 */ +1:	lbarx	r15,0,r11 +	lhz	r10,PACAPACAINDEX(r13) +	cmpdi	r15,0 +	cmpdi	cr1,r15,1	/* set cr1.eq = 0 for non-recursive */ +	bne	2f +	stbcx.	r10,0,r11 +	bne	1b +3: +	.subsection 1 +2:	cmpd	cr1,r15,r10	/* recursive lock due to mcheck/crit/etc? */ +	beq	cr1,3b		/* unlock will happen if cr1.eq = 0 */ +	lbz	r15,0(r11) +	cmpdi	r15,0 +	bne	2b +	b	1b +	.previous + +	mfspr	r15,SPRN_MAS2 + +	tlbsx	0,r16 +	mfspr	r10,SPRN_MAS1 +	andis.	r10,r10,MAS1_VALID@h +	bne	tlb_miss_done_e6500 + +	/* Undo MAS-damage from the tlbsx */ +	mfspr	r10,SPRN_MAS1 +	oris	r10,r10,MAS1_VALID@h +	mtspr	SPRN_MAS1,r10 +	mtspr	SPRN_MAS2,r15 + +	/* Now, we need to walk the page tables. First check if we are in +	 * range. +	 */ +	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 +	bne-	tlb_miss_fault_e6500 + +	rldicl	r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 +	cmpldi	cr0,r14,0 +	clrrdi	r15,r15,3 +	beq-	tlb_miss_fault_e6500 /* No PGDIR, bail */ +	ldx	r14,r14,r15		/* grab pgd entry */ + +	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 +	clrrdi	r15,r15,3 +	cmpdi	cr0,r14,0 +	bge	tlb_miss_fault_e6500	/* Bad pgd entry or hugepage; bail */ +	ldx	r14,r14,r15		/* grab pud entry */ + +	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 +	clrrdi	r15,r15,3 +	cmpdi	cr0,r14,0 +	bge	tlb_miss_fault_e6500 +	ldx	r14,r14,r15		/* Grab pmd entry */ + +	mfspr	r10,SPRN_MAS0 +	cmpdi	cr0,r14,0 +	bge	tlb_miss_fault_e6500 + +	/* Now we build the MAS for a 2M indirect page: +	 * +	 * MAS 0   :	ESEL needs to be filled by software round-robin +	 * MAS 1   :	Fully set up +	 *               - PID already updated by caller if necessary +	 *               - TSIZE for now is base ind page size always +	 *               - TID already cleared if necessary +	 * MAS 2   :	Default not 2M-aligned, need to be redone +	 * MAS 3+7 :	Needs to be done +	 */ + +	ori	r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +	mtspr	SPRN_MAS7_MAS3,r14 + +	clrrdi	r15,r16,21		/* make EA 2M-aligned */ +	mtspr	SPRN_MAS2,r15 + +	lbz	r15,TCD_ESEL_NEXT(r11) +	lbz	r16,TCD_ESEL_MAX(r11) +	lbz	r14,TCD_ESEL_FIRST(r11) +	rlwimi	r10,r15,16,0x00ff0000	/* insert esel_next into MAS0 */ +	addi	r15,r15,1		/* increment esel_next */ +	mtspr	SPRN_MAS0,r10 +	cmpw	r15,r16 +	iseleq	r15,r14,r15		/* if next == last use first */ +	stb	r15,TCD_ESEL_NEXT(r11) + +	tlbwe + +tlb_miss_done_e6500: +	.macro	tlb_unlock_e6500 +	beq	cr1,1f		/* no unlock if lock was recursively grabbed */ +	li	r15,0 +	isync +	stb	r15,0(r11) +1: +	.endm + +	tlb_unlock_e6500 +	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) +	tlb_epilog_bolted +	rfi + +tlb_miss_kernel_e6500: +	mfspr	r10,SPRN_MAS1 +	ld	r14,PACA_KERNELPGD(r13) +	cmpldi	cr0,r15,8		/* Check for vmalloc region */ +	rlwinm	r10,r10,0,16,1		/* Clear TID */ +	mtspr	SPRN_MAS1,r10 +	beq+	tlb_miss_common_e6500 + +tlb_miss_fault_e6500: +	tlb_unlock_e6500 +	/* We need to check if it was an instruction miss */ +	andi.	
r16,r16,1 +	bne	itlb_miss_fault_e6500 +dtlb_miss_fault_e6500: +	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) +	tlb_epilog_bolted +	b	exc_data_storage_book3e +itlb_miss_fault_e6500: +	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) +	tlb_epilog_bolted +	b	exc_instruction_storage_book3e +#endif /* CONFIG_PPC_FSL_BOOK3E */ +  /**********************************************************************   *                                                                    *   * TLB miss handling for Book3E with TLB reservation and HES support  * @@ -918,7 +1108,8 @@ tlb_load_linear:  	ld	r11,PACATOC(r13)  	ld	r11,linear_map_top@got(r11)  	ld	r10,0(r11) -	cmpld	cr0,r10,r16 +	tovirt(10,10) +	cmpld	cr0,r16,r10  	bge	tlb_load_linear_fault  	/* MAS1 need whole new setup. */ diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 41cd68dee68..92cb18d52ea 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -43,6 +43,7 @@  #include <asm/tlb.h>  #include <asm/code-patching.h>  #include <asm/hugetlb.h> +#include <asm/paca.h>  #include "mmu_decl.h" @@ -58,6 +59,10 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {  		.shift	= 12,  		.enc	= BOOK3E_PAGESZ_4K,  	}, +	[MMU_PAGE_2M] = { +		.shift	= 21, +		.enc	= BOOK3E_PAGESZ_2M, +	},  	[MMU_PAGE_4M] = {  		.shift	= 22,  		.enc	= BOOK3E_PAGESZ_4M, @@ -136,9 +141,18 @@ static inline int mmu_get_tsize(int psize)  int mmu_linear_psize;		/* Page size used for the linear mapping */  int mmu_pte_psize;		/* Page size used for PTE pages */  int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */ -int book3e_htw_enabled;		/* Is HW tablewalk enabled ? */ +int book3e_htw_mode;		/* HW tablewalk?  Value is PPC_HTW_* */  unsigned long linear_map_top;	/* Top of linear mapping */ + +/* + * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug + * exceptions.  This is used for bolted and e6500 TLB miss handlers which + * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, + * this is set to zero. 
+ */ +int extlb_level_exc; +  #endif /* CONFIG_PPC64 */  #ifdef CONFIG_PPC_FSL_BOOK3E @@ -305,7 +319,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,  void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)  {  #ifdef CONFIG_HUGETLB_PAGE -	if (is_vm_hugetlb_page(vma)) +	if (vma && is_vm_hugetlb_page(vma))  		flush_hugetlb_page(vma, vmaddr);  #endif @@ -377,7 +391,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)  {  	int tsize = mmu_psize_defs[mmu_pte_psize].enc; -	if (book3e_htw_enabled) { +	if (book3e_htw_mode != PPC_HTW_NONE) {  		unsigned long start = address & PMD_MASK;  		unsigned long end = address + PMD_SIZE;  		unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; @@ -430,7 +444,7 @@ static void setup_page_sizes(void)  			def = &mmu_psize_defs[psize];  			shift = def->shift; -			if (shift == 0) +			if (shift == 0 || shift & 1)  				continue;  			/* adjust to be in terms of 4^shift Kb */ @@ -440,21 +454,40 @@ static void setup_page_sizes(void)  				def->flags |= MMU_PAGE_SIZE_DIRECT;  		} -		goto no_indirect; +		goto out;  	}  	if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { -		u32 tlb1ps = mfspr(SPRN_TLB1PS); +		u32 tlb1cfg, tlb1ps; + +		tlb0cfg = mfspr(SPRN_TLB0CFG); +		tlb1cfg = mfspr(SPRN_TLB1CFG); +		tlb1ps = mfspr(SPRN_TLB1PS); +		eptcfg = mfspr(SPRN_EPTCFG); + +		if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) +			book3e_htw_mode = PPC_HTW_E6500; + +		/* +		 * We expect 4K subpage size and unrestricted indirect size. +		 * The lack of a restriction on indirect size is a Freescale +		 * extension, indicated by PSn = 0 but SPSn != 0. +		 */ +		if (eptcfg != 2) +			book3e_htw_mode = PPC_HTW_NONE;  		for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {  			struct mmu_psize_def *def = &mmu_psize_defs[psize];  			if (tlb1ps & (1U << (def->shift - 10))) {  				def->flags |= MMU_PAGE_SIZE_DIRECT; + +				if (book3e_htw_mode && psize == MMU_PAGE_2M) +					def->flags |= MMU_PAGE_SIZE_INDIRECT;  			}  		} -		goto no_indirect; +		goto out;  	}  #endif @@ -471,8 +504,11 @@ static void setup_page_sizes(void)  	}  	/* Indirect page sizes supported ? */ -	if ((tlb0cfg & TLBnCFG_IND) == 0) -		goto no_indirect; +	if ((tlb0cfg & TLBnCFG_IND) == 0 || +	    (tlb0cfg & TLBnCFG_PT) == 0) +		goto out; + +	book3e_htw_mode = PPC_HTW_IBM;  	/* Now, we only deal with one IND page size for each  	 * direct size. Hopefully all implementations today are @@ -497,8 +533,8 @@ static void setup_page_sizes(void)  				def->ind = ps + 10;  		}  	} - no_indirect: +out:  	/* Cleanup array and print summary */  	pr_info("MMU: Supported page sizes\n");  	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { @@ -518,44 +554,28 @@ static void setup_page_sizes(void)  	}  } -static void __patch_exception(int exc, unsigned long addr) -{ -	extern unsigned int interrupt_base_book3e; - 	unsigned int *ibase = &interrupt_base_book3e; -  -	/* Our exceptions vectors start with a NOP and -then- a branch -	 * to deal with single stepping from userspace which stops on -	 * the second instruction. 
Thus we need to patch the second -	 * instruction of the exception, not the first one -	 */ - -	patch_branch(ibase + (exc / 4) + 1, addr, 0); -} - -#define patch_exception(exc, name) do { \ -	extern unsigned int name; \ -	__patch_exception((exc), (unsigned long)&name); \ -} while (0) -  static void setup_mmu_htw(void)  { -	/* Check if HW tablewalk is present, and if yes, enable it by: -	 * -	 * - patching the TLB miss handlers to branch to the -	 *   one dedicates to it -	 * -	 * - setting the global book3e_htw_enabled -       	 */ -	unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG); +	/* +	 * If we want to use HW tablewalk, enable it by patching the TLB miss +	 * handlers to branch to the one dedicated to it. +	 */ -	if ((tlb0cfg & TLBnCFG_IND) && -	    (tlb0cfg & TLBnCFG_PT)) { +	switch (book3e_htw_mode) { +	case PPC_HTW_IBM:  		patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);  		patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); -		book3e_htw_enabled = 1; +		break; +#ifdef CONFIG_PPC_FSL_BOOK3E +	case PPC_HTW_E6500: +		extlb_level_exc = EX_TLB_SIZE; +		patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); +		patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); +		break; +#endif  	}  	pr_info("MMU: Book3E HW tablewalk %s\n", -		book3e_htw_enabled ? "enabled" : "not supported"); +		book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");  }  /* @@ -576,8 +596,13 @@ static void __early_init_mmu(int boot_cpu)  	/* XXX This should be decided at runtime based on supported  	 * page sizes in the TLB, but for now let's assume 16M is  	 * always there and a good fit (which it probably is) +	 * +	 * Freescale booke only supports 4K pages in TLB0, so use that.  	 */ -	mmu_vmemmap_psize = MMU_PAGE_16M; +	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) +		mmu_vmemmap_psize = MMU_PAGE_4K; +	else +		mmu_vmemmap_psize = MMU_PAGE_16M;  	/* XXX This code only checks for TLB 0 capabilities and doesn't  	 *     check what page size combos are supported by the HW. 
It @@ -595,8 +620,16 @@ static void __early_init_mmu(int boot_cpu)  	/* Set MAS4 based on page table setting */  	mas4 = 0x4 << MAS4_WIMGED_SHIFT; -	if (book3e_htw_enabled) { -		mas4 |= mas4 | MAS4_INDD; +	switch (book3e_htw_mode) { +	case PPC_HTW_E6500: +		mas4 |= MAS4_INDD; +		mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; +		mas4 |= MAS4_TLBSELD(1); +		mmu_pte_psize = MMU_PAGE_2M; +		break; + +	case PPC_HTW_IBM: +		mas4 |= MAS4_INDD;  #ifdef CONFIG_PPC_64K_PAGES  		mas4 |=	BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;  		mmu_pte_psize = MMU_PAGE_256M; @@ -604,13 +637,16 @@ static void __early_init_mmu(int boot_cpu)  		mas4 |=	BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;  		mmu_pte_psize = MMU_PAGE_1M;  #endif -	} else { +		break; + +	case PPC_HTW_NONE:  #ifdef CONFIG_PPC_64K_PAGES  		mas4 |=	BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;  #else  		mas4 |=	BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;  #endif  		mmu_pte_psize = mmu_virtual_psize; +		break;  	}  	mtspr(SPRN_MAS4, mas4); @@ -630,8 +666,12 @@ static void __early_init_mmu(int boot_cpu)  		/* limit memory so we dont have linear faults */  		memblock_enforce_memory_limit(linear_map_top); -		patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); -		patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e); +		if (book3e_htw_mode == PPC_HTW_NONE) { +			extlb_level_exc = EX_TLB_SIZE; +			patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); +			patch_exception(0x1e0, +				exc_instruction_tlb_miss_bolted_book3e); +		}  	}  #endif diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 626ad081639..43ff3c797fb 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -402,7 +402,9 @@ _GLOBAL(set_context)   * Load TLBCAM[index] entry in to the L2 CAM MMU   */  _GLOBAL(loadcam_entry) -	LOAD_REG_ADDR(r4, TLBCAM) +	mflr	r5 +	LOAD_REG_ADDR_PIC(r4, TLBCAM) +	mtlr	r5  	mulli	r5,r3,TLBCAM_SIZE  	add	r3,r5,r4  	lwz	r4,TLBCAM_MAS0(r3)  | 
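Note on the pmd_hugepage_update() hunk above: the function gains a "set" argument alongside "clr", which is why existing callers such as pmdp_invalidate() and pmdp_get_and_clear() simply pass a trailing 0. The following is a minimal C sketch of the semantics the ldarx/andc/or/stdcx. loop implements, written with GCC atomic builtins purely for illustration. The name pmd_update_sketch and the use of a compare-and-swap loop are not from the kernel, and the real assembly additionally spins while _PAGE_BUSY is set, which this sketch omits.

	/*
	 * Illustrative sketch only: atomically apply "clear these bits,
	 * then set those bits" to a raw PMD word and return the previous
	 * value, mirroring what the ldarx/andc/or/stdcx. sequence in
	 * pmd_hugepage_update() does.
	 */
	static unsigned long pmd_update_sketch(unsigned long *pmdp,
					       unsigned long clr,
					       unsigned long set)
	{
		unsigned long old = __atomic_load_n(pmdp, __ATOMIC_RELAXED);
		unsigned long new;

		do {
			/* The real code also waits for _PAGE_BUSY to clear
			 * before retrying; omitted in this sketch. */
			new = (old & ~clr) | set;
		} while (!__atomic_compare_exchange_n(pmdp, &old, new, false,
						      __ATOMIC_RELAXED,
						      __ATOMIC_RELAXED));
		return old;
	}

With set == 0 this degenerates to the previous clear-only behaviour, which matches the non-LL/SC fallback in the hunk: *pmdp = __pmd((old & ~clr) | set).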

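Note on the e6500 handler added in tlb_low_64e.S: because TLB1 on e6500 provides no HES or NV hint, tlb_miss_common_e6500 picks the victim slot for the new indirect entry by software round-robin over the per-core TCD_ESEL_* bytes. Below is a rough C rendering of just that selection step; the struct and function names (tcd_sketch, pick_esel) are invented for illustration, and the byte-sized fields simply mirror the lbz/stb accesses in the assembly.

	/* Illustrative sketch of the esel_next/esel_max/esel_first
	 * round-robin used by tlb_miss_common_e6500. */
	struct tcd_sketch {
		unsigned char esel_next;   /* TLB1 index used for the next write */
		unsigned char esel_max;    /* wrap bound for the round-robin */
		unsigned char esel_first;  /* index to wrap back to */
	};

	static unsigned int pick_esel(struct tcd_sketch *tcd)
	{
		unsigned int esel = tcd->esel_next;

		/* Advance, wrapping back to esel_first when esel_max is
		 * reached, as the cmpw/iseleq pair does before the stb. */
		tcd->esel_next = (esel + 1 == tcd->esel_max)
				 ? tcd->esel_first
				 : (unsigned char)(esel + 1);
		return esel;  /* inserted into MAS0[ESEL] via rlwimi */
	}

In the handler the chosen index is merged into the MAS0 value read earlier and the 2M indirect entry is then written with tlbwe.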