45 files changed, 11231 insertions, 3222 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index 3899ea97fbd..5810967511d 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/highmem.h>
+#include <linux/memblock.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -47,6 +48,7 @@
 #include <asm/bootx.h>
 #include <asm/machdep.h>
 #include <asm/setup.h>
+
 #include "mmu_decl.h"
 
 extern int __map_without_ltlbs;
@@ -84,20 +86,20 @@ void __init MMU_init_hw(void)
 	 * vectors and the kernel live in real-mode.
 	 */
 
-        mtspr(SPRN_DCCR, 0xF0000000);	/* 512 MB of data space at 0x0. */
-        mtspr(SPRN_ICCR, 0xF0000000);	/* 512 MB of instr. space at 0x0. */
+        mtspr(SPRN_DCCR, 0xFFFF0000);	/* 2GByte of data space at 0x0. */
+        mtspr(SPRN_ICCR, 0xFFFF0000);	/* 2GByte of instr. space at 0x0. */
 }
 
 #define LARGE_PAGE_SIZE_16M	(1<<24)
 #define LARGE_PAGE_SIZE_4M	(1<<22)
 
-unsigned long __init mmu_mapin_ram(void)
+unsigned long __init mmu_mapin_ram(unsigned long top)
 {
-	unsigned long v, s;
+	unsigned long v, s, mapped;
 	phys_addr_t p;
 
 	v = KERNELBASE;
-	p = PPC_MEMSTART;
+	p = 0;
 	s = total_lowmem;
 
 	if (__map_without_ltlbs)
@@ -105,7 +107,7 @@ unsigned long __init mmu_mapin_ram(void)
 
 	while (s >= LARGE_PAGE_SIZE_16M) {
 		pmd_t *pmdp;
-		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
 
 		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
 		pmd_val(*pmdp++) = val;
@@ -120,7 +122,7 @@ unsigned long __init mmu_mapin_ram(void)
 
 	while (s >= LARGE_PAGE_SIZE_4M) {
 		pmd_t *pmdp;
-		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
 
 		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
 		pmd_val(*pmdp) = val;
@@ -130,5 +132,28 @@ unsigned long __init mmu_mapin_ram(void)
 		s -= LARGE_PAGE_SIZE_4M;
 	}
 
-	return total_lowmem - s;
+	mapped = total_lowmem - s;
+
+	/* If the size of RAM is not an exact power of two, we may not
+	 * have covered RAM in its entirety with 16 and 4 MiB
+	 * pages. Consequently, restrict the top end of RAM currently
+	 * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
+	 * coverage with normal-sized pages (or other reasons) do not
+	 * attempt to allocate outside the allowed range.
+	 */
+	memblock_set_current_limit(mapped);
+
+	return mapped;
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
+{
+	/* We don't currently support the first MEMBLOCK not mapping 0
+	 * physical on those processors
+	 */
+	BUG_ON(first_memblock_base != 0);
+
+	/* 40x can only access 16MB at the moment (see head_40x.S) */
+	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
 }
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 04dc08798d3..82b1ff759e2 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -24,9 +24,11 @@
  */
 
 #include <linux/init.h>
+#include <linux/memblock.h>
+
 #include <asm/mmu.h>
-#include <asm/system.h>
 #include <asm/page.h>
+#include <asm/cacheflush.h>
 
 #include "mmu_decl.h"
 
@@ -37,11 +39,39 @@ unsigned int tlb_44x_index; /* = 0 */
 unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS;
 int icache_44x_need_flush;
 
+unsigned long tlb_47x_boltmap[1024/8];
+
+static void ppc44x_update_tlb_hwater(void)
+{
+	extern unsigned int tlb_44x_patch_hwater_D[];
+	extern unsigned int tlb_44x_patch_hwater_I[];
+
+	/* The TLB miss handlers hard codes the watermark in a cmpli
+	 * instruction to improve performances rather than loading it
+	 * from the global variable. Thus, we patch the instructions
+	 * in the 2 TLB miss handlers when updating the value
+	 */
+	tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) |
+		tlb_44x_hwater;
+	flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0],
+			   (unsigned long)&tlb_44x_patch_hwater_D[1]);
+	tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) |
+		tlb_44x_hwater;
+	flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0],
+			   (unsigned long)&tlb_44x_patch_hwater_I[1]);
+}
+
 /*
- * "Pins" a 256MB TLB entry in AS0 for kernel lowmem
+ * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU
  */
 static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys)
 {
+	unsigned int entry = tlb_44x_hwater--;
+
+	ppc44x_update_tlb_hwater();
+
+	mtspr(SPRN_MMUCR, 0);
+
 	__asm__ __volatile__(
 		"tlbwe	%2,%3,%4\n"
 		"tlbwe	%1,%3,%5\n"
@@ -50,26 +80,175 @@ static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys)
 	: "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G),
 	  "r" (phys),
 	  "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M),
-	  "r" (tlb_44x_hwater--), /* slot for this TLB entry */
+	  "r" (entry),
 	  "i" (PPC44x_TLB_PAGEID),
 	  "i" (PPC44x_TLB_XLAT),
 	  "i" (PPC44x_TLB_ATTRIB));
 }
 
+static int __init ppc47x_find_free_bolted(void)
+{
+	unsigned int mmube0 = mfspr(SPRN_MMUBE0);
+	unsigned int mmube1 = mfspr(SPRN_MMUBE1);
+
+	if (!(mmube0 & MMUBE0_VBE0))
+		return 0;
+	if (!(mmube0 & MMUBE0_VBE1))
+		return 1;
+	if (!(mmube0 & MMUBE0_VBE2))
+		return 2;
+	if (!(mmube1 & MMUBE1_VBE3))
+		return 3;
+	if (!(mmube1 & MMUBE1_VBE4))
+		return 4;
+	if (!(mmube1 & MMUBE1_VBE5))
+		return 5;
+	return -1;
+}
+
+static void __init ppc47x_update_boltmap(void)
+{
+	unsigned int mmube0 = mfspr(SPRN_MMUBE0);
+	unsigned int mmube1 = mfspr(SPRN_MMUBE1);
+
+	if (mmube0 & MMUBE0_VBE0)
+		__set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff,
+			  tlb_47x_boltmap);
+	if (mmube0 & MMUBE0_VBE1)
+		__set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff,
+			  tlb_47x_boltmap);
+	if (mmube0 & MMUBE0_VBE2)
+		__set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff,
+			  tlb_47x_boltmap);
+	if (mmube1 & MMUBE1_VBE3)
+		__set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff,
+			  tlb_47x_boltmap);
+	if (mmube1 & MMUBE1_VBE4)
+		__set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff,
+			  tlb_47x_boltmap);
+	if (mmube1 & MMUBE1_VBE5)
+		__set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff,
+			  tlb_47x_boltmap);
+}
+
+/*
+ * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
+ */
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+{
+	unsigned int rA;
+	int bolted;
+
+	/* Base rA is HW way select, way 0, bolted bit set */
+	rA = 0x88000000;
+
+	/* Look for a bolted entry slot */
+	bolted = ppc47x_find_free_bolted();
+	BUG_ON(bolted < 0);
+
+	/* Insert bolted slot number */
+	rA |= bolted << 24;
+
+	pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n",
+		 virt, phys, bolted);
+
+	mtspr(SPRN_MMUCR, 0);
+
+	__asm__ __volatile__(
+		"tlbwe	%2,%3,0\n"
+		"tlbwe	%1,%3,1\n"
+		"tlbwe	%0,%3,2\n"
+		:
+		: "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR |
+		       PPC47x_TLB2_SX
+#ifdef CONFIG_SMP
+		       | PPC47x_TLB2_M
+#endif
+		       ),
+		  "r" (phys),
+		  "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M),
+		  "r" (rA));
+}
+
 void __init MMU_init_hw(void)
 {
+	/* This is not useful on 47x but won't hurt either */
+	ppc44x_update_tlb_hwater();
+
 	flush_instruction_cache();
 }
 
-unsigned long __init mmu_mapin_ram(void)
+unsigned long __init mmu_mapin_ram(unsigned long top)
 {
 	unsigned long addr;
+	unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
 
 	/* Pin in enough TLBs to cover any lowmem not covered by the
 	 * initial 256M mapping established in head_44x.S */
-	for (addr = PPC_PIN_SIZE; addr < total_lowmem;
-	     addr += PPC_PIN_SIZE)
-		ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+	for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
+	     addr += PPC_PIN_SIZE) {
+		if (mmu_has_feature(MMU_FTR_TYPE_47x))
+			ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
+		else
+			ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+	}
+	if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
+		ppc47x_update_boltmap();
 
+#ifdef DEBUG
+		{
+			int i;
+
+			printk(KERN_DEBUG "bolted entries: ");
+			for (i = 0; i < 255; i++) {
+				if (test_bit(i, tlb_47x_boltmap))
+					printk("%d ", i);
+			}
+			printk("\n");
+		}
+#endif /* DEBUG */
+	}
 	return total_lowmem;
 }
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
+{
+	u64 size;
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+	/* We don't currently support the first MEMBLOCK not mapping 0
+	 * physical on those processors
+	 */
+	BUG_ON(first_memblock_base != 0);
+#endif
+
+	/* 44x has a 256M TLB entry pinned at boot */
+	size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE));
+	memblock_set_current_limit(first_memblock_base + size);
+}
+
+#ifdef CONFIG_SMP
+void mmu_init_secondary(int cpu)
+{
+	unsigned long addr;
+	unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
+
+	/* Pin in enough TLBs to cover any lowmem not covered by the
+	 * initial 256M mapping established in head_44x.S
+	 *
+	 * WARNING: This is called with only the first 256M of the
+	 * linear mapping in the TLB and we can't take faults yet
+	 * so beware of what this code uses. It runs off a temporary
+	 * stack. current (r2) isn't initialized, smp_processor_id()
+	 * will not work, current thread info isn't accessible, ...
+	 */
+	for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
+	     addr += PPC_PIN_SIZE) {
+		if (mmu_has_feature(MMU_FTR_TYPE_47x))
+			ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
+		else
+			ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+	}
+}
+#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 41649a5d360..51230ee6a40 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -2,24 +2,37 @@
 # Makefile for the linux ppc-specific parts of the memory manager.
 #
 
-ifeq ($(CONFIG_PPC64),y)
-EXTRA_CFLAGS	+= -mno-minimal-toc
-endif
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
-obj-y				:= fault.o mem.o lmb.o \
+obj-y				:= fault.o mem.o pgtable.o gup.o mmap.o \
 				   init_$(CONFIG_WORD_SIZE).o \
-				   pgtable_$(CONFIG_WORD_SIZE).o \
-				   mmu_context_$(CONFIG_WORD_SIZE).o
-hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
-obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
-				   slb_low.o slb.o stab.o mmap.o $(hash-y)
+				   pgtable_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
+				   tlb_nohash_low.o
+obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
+hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
+obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
+				   slb_low.o slb.o stab.o \
+				   $(hash64-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_$(CONFIG_WORD_SIZE).o
+				   tlb_hash$(CONFIG_WORD_SIZE).o \
+				   mmu_context_hash$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_ICSWX)		+= icswx.o
+obj-$(CONFIG_PPC_ICSWX_PID)	+= icswx_pid.o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
-obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-y				+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
+obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
+endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
+obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
new file mode 100644
index 00000000000..7b6c1075017
--- /dev/null
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -0,0 +1,419 @@
+/*
+ *  PowerPC version derived from arch/arm/mm/consistent.c
+ *    Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
+ *
+ *  Copyright (C) 2000 Russell King
+ *
+ * Consistent memory allocators.  Used for DMA devices that want to
+ * share uncached memory with the processor core.  The function return
+ * is the virtual address and 'dma_handle' is the physical address.
+ * Mostly stolen from the ARM port, with some changes for PowerPC.
+ *						-- Dan
+ *
+ * Reorganized to get rid of the arch-specific consistent_* functions
+ * and provide non-coherent implementations for the DMA API. -Matt
+ *
+ * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
+ * implementation. This is pulled straight from ARM and barely
+ * modified. -Matt
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/dma-mapping.h>
+#include <linux/export.h>
+
+#include <asm/tlbflush.h>
+
+#include "mmu_decl.h"
+
+/*
+ * This address range defaults to a value that is safe for all
+ * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
+ * can be further configured for specific applications under
+ * the "Advanced Setup" menu. -Matt
+ */
+#define CONSISTENT_BASE		(IOREMAP_TOP)
+#define CONSISTENT_END 		(CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
+#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
+
+/*
+ * This is the page table (2MB) covering uncached, DMA consistent allocations
+ */
+static DEFINE_SPINLOCK(consistent_lock);
+
+/*
+ * VM region handling support.
+ *
+ * This should become something generic, handling VM region allocations for
+ * vmalloc and similar (ioremap, module space, etc).
+ *
+ * I envisage vmalloc()'s supporting vm_struct becoming:
+ *
+ *  struct vm_struct {
+ *    struct vm_region	region;
+ *    unsigned long	flags;
+ *    struct page	**pages;
+ *    unsigned int	nr_pages;
+ *    unsigned long	phys_addr;
+ *  };
+ *
+ * get_vm_area() would then call vm_region_alloc with an appropriate
+ * struct vm_region head (eg):
+ *
+ *  struct vm_region vmalloc_head = {
+ *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
+ *	.vm_start	= VMALLOC_START,
+ *	.vm_end		= VMALLOC_END,
+ *  };
+ *
+ * However, vmalloc_head.vm_start is variable (typically, it is dependent on
+ * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
+ * would have to initialise this each time prior to calling vm_region_alloc().
+ */
+struct ppc_vm_region {
+	struct list_head	vm_list;
+	unsigned long		vm_start;
+	unsigned long		vm_end;
+};
+
+static struct ppc_vm_region consistent_head = {
+	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
+	.vm_start	= CONSISTENT_BASE,
+	.vm_end		= CONSISTENT_END,
+};
+
+static struct ppc_vm_region *
+ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
+{
+	unsigned long addr = head->vm_start, end = head->vm_end - size;
+	unsigned long flags;
+	struct ppc_vm_region *c, *new;
+
+	new = kmalloc(sizeof(struct ppc_vm_region), gfp);
+	if (!new)
+		goto out;
+
+	spin_lock_irqsave(&consistent_lock, flags);
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if ((addr + size) < addr)
+			goto nospc;
+		if ((addr + size) <= c->vm_start)
+			goto found;
+		addr = c->vm_end;
+		if (addr > end)
+			goto nospc;
+	}
+
+ found:
+	/*
+	 * Insert this entry _before_ the one we found.
+	 */
+	list_add_tail(&new->vm_list, &c->vm_list);
+	new->vm_start = addr;
+	new->vm_end = addr + size;
+
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	return new;
+
+ nospc:
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	kfree(new);
+ out:
+	return NULL;
+}
+
+static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
+{
+	struct ppc_vm_region *c;
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if (c->vm_start == addr)
+			goto out;
+	}
+	c = NULL;
+ out:
+	return c;
+}
+
+/*
+ * Allocate DMA-coherent memory space and return both the kernel remapped
+ * virtual and bus address for that space.
+ */
+void *
+__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
+{
+	struct page *page;
+	struct ppc_vm_region *c;
+	unsigned long order;
+	u64 mask = ISA_DMA_THRESHOLD, limit;
+
+	if (dev) {
+		mask = dev->coherent_dma_mask;
+
+		/*
+		 * Sanity check the DMA mask - it must be non-zero, and
+		 * must be able to be satisfied by a DMA allocation.
+		 */
+		if (mask == 0) {
+			dev_warn(dev, "coherent DMA mask is unset\n");
+			goto no_page;
+		}
+
+		if ((~mask) & ISA_DMA_THRESHOLD) {
+			dev_warn(dev, "coherent DMA mask %#llx is smaller "
+				 "than system GFP_DMA mask %#llx\n",
+				 mask, (unsigned long long)ISA_DMA_THRESHOLD);
+			goto no_page;
+		}
+	}
+
+
+	size = PAGE_ALIGN(size);
+	limit = (mask + 1) & ~mask;
+	if ((limit && size >= limit) ||
+	    size >= (CONSISTENT_END - CONSISTENT_BASE)) {
+		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
+		       size, mask);
+		return NULL;
+	}
+
+	order = get_order(size);
+
+	/* Might be useful if we ever have a real legacy DMA zone... */
+	if (mask != 0xffffffff)
+		gfp |= GFP_DMA;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		goto no_page;
+
+	/*
+	 * Invalidate any data that might be lurking in the
+	 * kernel direct-mapped region for device DMA.
+	 */
+	{
+		unsigned long kaddr = (unsigned long)page_address(page);
+		memset(page_address(page), 0, size);
+		flush_dcache_range(kaddr, kaddr + size);
+	}
+
+	/*
+	 * Allocate a virtual address in the consistent mapping region.
+	 */
+	c = ppc_vm_region_alloc(&consistent_head, size,
+			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
+	if (c) {
+		unsigned long vaddr = c->vm_start;
+		struct page *end = page + (1 << order);
+
+		split_page(page, order);
+
+		/*
+		 * Set the "dma handle"
+		 */
+		*handle = page_to_phys(page);
+
+		do {
+			SetPageReserved(page);
+			map_page(vaddr, page_to_phys(page),
+				 pgprot_noncached(PAGE_KERNEL));
+			page++;
+			vaddr += PAGE_SIZE;
+		} while (size -= PAGE_SIZE);
+
+		/*
+		 * Free the otherwise unused pages.
+		 */
+		while (page < end) {
+			__free_page(page);
+			page++;
+		}
+
+		return (void *)c->vm_start;
+	}
+
+	if (page)
+		__free_pages(page, order);
+ no_page:
+	return NULL;
+}
+EXPORT_SYMBOL(__dma_alloc_coherent);
+
+/*
+ * free a page as defined by the above mapping.
+ */
+void __dma_free_coherent(size_t size, void *vaddr)
+{
+	struct ppc_vm_region *c;
+	unsigned long flags, addr;
+	
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irqsave(&consistent_lock, flags);
+
+	c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
+	if (!c)
+		goto no_area;
+
+	if ((c->vm_end - c->vm_start) != size) {
+		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
+		       __func__, c->vm_end - c->vm_start, size);
+		dump_stack();
+		size = c->vm_end - c->vm_start;
+	}
+
+	addr = c->vm_start;
+	do {
+		pte_t *ptep;
+		unsigned long pfn;
+
+		ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
+							       addr),
+						    addr),
+					 addr);
+		if (!pte_none(*ptep) && pte_present(*ptep)) {
+			pfn = pte_pfn(*ptep);
+			pte_clear(&init_mm, addr, ptep);
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+				__free_reserved_page(page);
+			}
+		}
+		addr += PAGE_SIZE;
+	} while (size -= PAGE_SIZE);
+
+	flush_tlb_kernel_range(c->vm_start, c->vm_end);
+
+	list_del(&c->vm_list);
+
+	spin_unlock_irqrestore(&consistent_lock, flags);
+
+	kfree(c);
+	return;
+
+ no_area:
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
+	       __func__, vaddr);
+	dump_stack();
+}
+EXPORT_SYMBOL(__dma_free_coherent);
+
+/*
+ * make an area consistent.
+ */
+void __dma_sync(void *vaddr, size_t size, int direction)
+{
+	unsigned long start = (unsigned long)vaddr;
+	unsigned long end   = start + size;
+
+	switch (direction) {
+	case DMA_NONE:
+		BUG();
+	case DMA_FROM_DEVICE:
+		/*
+		 * invalidate only when cache-line aligned otherwise there is
+		 * the potential for discarding uncommitted data from the cache
+		 */
+		if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
+			flush_dcache_range(start, end);
+		else
+			invalidate_dcache_range(start, end);
+		break;
+	case DMA_TO_DEVICE:		/* writeback only */
+		clean_dcache_range(start, end);
+		break;
+	case DMA_BIDIRECTIONAL:	/* writeback and invalidate */
+		flush_dcache_range(start, end);
+		break;
+	}
+}
+EXPORT_SYMBOL(__dma_sync);
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * __dma_sync_page() implementation for systems using highmem.
+ * In this case, each page of a buffer must be kmapped/kunmapped
+ * in order to have a virtual address for __dma_sync(). This must
+ * not sleep so kmap_atomic()/kunmap_atomic() are used.
+ *
+ * Note: yes, it is possible and correct to have a buffer extend
+ * beyond the first page.
+ */
+static inline void __dma_sync_page_highmem(struct page *page,
+		unsigned long offset, size_t size, int direction)
+{
+	size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
+	size_t cur_size = seg_size;
+	unsigned long flags, start, seg_offset = offset;
+	int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
+	int seg_nr = 0;
+
+	local_irq_save(flags);
+
+	do {
+		start = (unsigned long)kmap_atomic(page + seg_nr) + seg_offset;
+
+		/* Sync this buffer segment */
+		__dma_sync((void *)start, seg_size, direction);
+		kunmap_atomic((void *)start);
+		seg_nr++;
+
+		/* Calculate next buffer segment size */
+		seg_size = min((size_t)PAGE_SIZE, size - cur_size);
+
+		/* Add the segment size to our running total */
+		cur_size += seg_size;
+		seg_offset = 0;
+	} while (seg_nr < nr_segs);
+
+	local_irq_restore(flags);
+}
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * __dma_sync_page makes memory consistent. identical to __dma_sync, but
+ * takes a struct page instead of a virtual address
+ */
+void __dma_sync_page(struct page *page, unsigned long offset,
+	size_t size, int direction)
+{
+#ifdef CONFIG_HIGHMEM
+	__dma_sync_page_highmem(page, offset, size, direction);
+#else
+	unsigned long start = (unsigned long)page_address(page) + offset;
+	__dma_sync((void *)start, size, direction);
+#endif
+}
+EXPORT_SYMBOL(__dma_sync_page);
+
+/*
+ * Return the PFN for a given cpu virtual address returned by
+ * __dma_alloc_coherent. This is used by dma_mmap_coherent()
+ */
+unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr)
+{
+	/* This should always be populated, so we don't test every
+	 * level. If that fails, we'll have a nice crash which
+	 * will be as good as a BUG_ON()
+	 */
+	pgd_t *pgd = pgd_offset_k(cpu_addr);
+	pud_t *pud = pud_offset(pgd, cpu_addr);
+	pmd_t *pmd = pmd_offset(pud, cpu_addr);
+	pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
+
+	if (pte_none(*ptep) || !pte_present(*ptep))
+		return 0;
+	return pte_pfn(*ptep);
+}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7b251079926..51ab9e7e6c3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,16 +29,23 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/perf_event.h>
+#include <linux/magic.h>
+#include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
 
+#include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/siginfo.h>
+#include <asm/debug.h>
+#include <mm/mmu_decl.h>
 
+#include "icswx.h"
 
 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs)
@@ -99,31 +106,80 @@ static int store_updates_sp(struct pt_regs *regs)
 	}
 	return 0;
 }
+/*
+ * do_page_fault error handling helpers
+ */
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
-		    unsigned long error_code)
+#define MM_FAULT_RETURN		0
+#define MM_FAULT_CONTINUE	-1
+#define MM_FAULT_ERR(sig)	(sig)
+
+static int do_sigbus(struct pt_regs *regs, unsigned long address)
 {
 	siginfo_t info;
 
-	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
-			11, SIGSEGV) == NOTIFY_STOP)
-		return;
+	up_read(&current->mm->mmap_sem);
 
-	if (debugger_dabr_match(regs))
-		return;
+	if (user_mode(regs)) {
+		current->thread.trap_nr = BUS_ADRERR;
+		info.si_signo = SIGBUS;
+		info.si_errno = 0;
+		info.si_code = BUS_ADRERR;
+		info.si_addr = (void __user *)address;
+		force_sig_info(SIGBUS, &info, current);
+		return MM_FAULT_RETURN;
+	}
+	return MM_FAULT_ERR(SIGBUS);
+}
+
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+{
+	/*
+	 * Pagefault was interrupted by SIGKILL. We have no reason to
+	 * continue the pagefault.
+	 */
+	if (fatal_signal_pending(current)) {
+		/*
+		 * If we have retry set, the mmap semaphore will have
+		 * alrady been released in __lock_page_or_retry(). Else
+		 * we release it now.
+		 */
+		if (!(fault & VM_FAULT_RETRY))
+			up_read(&current->mm->mmap_sem);
+		/* Coming from kernel, we need to deal with uaccess fixups */
+		if (user_mode(regs))
+			return MM_FAULT_RETURN;
+		return MM_FAULT_ERR(SIGKILL);
+	}
+
+	/* No fault: be happy */
+	if (!(fault & VM_FAULT_ERROR))
+		return MM_FAULT_CONTINUE;
+
+	/* Out of memory */
+	if (fault & VM_FAULT_OOM) {
+		up_read(&current->mm->mmap_sem);
 
-	/* Clear the DABR */
-	set_dabr(0);
+		/*
+		 * We ran out of memory, or some other thing happened to us that
+		 * made us unable to handle the page fault gracefully.
+		 */
+		if (!user_mode(regs))
+			return MM_FAULT_ERR(SIGKILL);
+		pagefault_out_of_memory();
+		return MM_FAULT_RETURN;
+	}
+
+	/* Bus error. x86 handles HWPOISON here, we'll add this if/when
+	 * we support the feature in HW
+	 */
+	if (fault & VM_FAULT_SIGBUS)
+		return do_sigbus(regs, addr);
 
-	/* Deliver the signal to userspace */
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr = (void __user *)address;
-	force_sig_info(SIGTRAP, &info, current);
+	/* We don't understand the fault code, this is fatal */
+	BUG();
+	return MM_FAULT_CONTINUE;
 }
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
 
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
@@ -141,13 +197,16 @@ static void do_dabr(struct pt_regs *regs, unsigned long address,
 int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 			    unsigned long error_code)
 {
+	enum ctx_state prev_state = exception_enter();
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
-	siginfo_t info;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 	int code = SEGV_MAPERR;
-	int is_write = 0, ret;
+	int is_write = 0;
 	int trap = TRAP(regs);
  	int is_exec = trap == 0x400;
+	int fault;
+	int rc = 0, store_update_sp = 0;
 
 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 	/*
@@ -164,27 +223,49 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
 
+#ifdef CONFIG_PPC_ICSWX
+	/*
+	 * we need to do this early because this "data storage
+	 * interrupt" does not update the DAR/DEAR so we don't want to
+	 * look at it
+	 */
+	if (error_code & ICSWX_DSI_UCT) {
+		rc = acop_handle_fault(regs, address, error_code);
+		if (rc)
+			goto bail;
+	}
+#endif /* CONFIG_PPC_ICSWX */
+
 	if (notify_page_fault(regs))
-		return 0;
+		goto bail;
 
 	if (unlikely(debugger_fault_handler(regs)))
-		return 0;
+		goto bail;
 
 	/* On a kernel SLB miss we can only check for a valid exception entry */
-	if (!user_mode(regs) && (address >= TASK_SIZE))
-		return SIGSEGV;
+	if (!user_mode(regs) && (address >= TASK_SIZE)) {
+		rc = SIGSEGV;
+		goto bail;
+	}
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
+			     defined(CONFIG_PPC_BOOK3S_64))
   	if (error_code & DSISR_DABRMATCH) {
-		/* DABR match */
-		do_dabr(regs, address, error_code);
-		return 0;
+		/* breakpoint match */
+		do_break(regs, address, error_code);
+		goto bail;
 	}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
+#endif
+
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
 
 	if (in_atomic() || mm == NULL) {
-		if (!user_mode(regs))
-			return SIGSEGV;
+		if (!user_mode(regs)) {
+			rc = SIGSEGV;
+			goto bail;
+		}
 		/* in_atomic() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
@@ -194,6 +275,19 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+	/*
+	 * We want to do this outside mmap_sem, because reading code around nip
+	 * can result in fault, which will cause a deadlock when called with
+	 * mmap_sem held
+	 */
+	if (user_mode(regs))
+		store_update_sp = store_updates_sp(regs);
+
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -213,7 +307,15 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		if (!user_mode(regs) && !search_exception_tables(regs->nip))
 			goto bad_area_nosemaphore;
 
+retry:
 		down_read(&mm->mmap_sem);
+	} else {
+		/*
+		 * The above down_read_trylock() might have succeeded in
+		 * which case we'll have missed the might_sleep() from
+		 * down_read():
+		 */
+		might_sleep();
 	}
 
 	vma = find_vma(mm, address);
@@ -251,8 +353,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		 * between the last mapped region and the stack will
 		 * expand the stack rather than segfaulting.
 		 */
-		if (address + 2048 < uregs->gpr[1]
-		    && (!user_mode(regs) || !store_updates_sp(regs)))
+		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
 			goto bad_area;
 	}
 	if (expand_stack(vma, address))
@@ -267,6 +368,12 @@ good_area:
 		goto bad_area;
 #endif /* CONFIG_6xx */
 #if defined(CONFIG_8xx)
+	/* 8xx sometimes need to load a invalid/non-present TLBs.
+	 * These must be invalidated separately as linux mm don't.
+	 */
+	if (error_code & 0x40000000) /* no translation? */
+		_tlbil_va(address, 0, 0, 0);
+
         /* The MPC8xx seems to always set 0x80000000, which is
          * "undefined".  Of those that can be set, this is the only
          * one which seems bad.
@@ -277,48 +384,38 @@ good_area:
 #endif /* CONFIG_8xx */
 
 	if (is_exec) {
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-		/* protection fault */
+#ifdef CONFIG_PPC_STD_MMU
+		/* Protection fault on exec go straight to failure on
+		 * Hash based MMUs as they either don't support per-page
+		 * execute permission, or if they do, it's handled already
+		 * at the hash level. This test would probably have to
+		 * be removed if we change the way this works to make hash
+		 * processors use the same I/D cache coherency mechanism
+		 * as embedded.
+		 */
 		if (error_code & DSISR_PROTFAULT)
 			goto bad_area;
+#endif /* CONFIG_PPC_STD_MMU */
+
 		/*
 		 * Allow execution from readable areas if the MMU does not
 		 * provide separate controls over reading and executing.
+		 *
+		 * Note: That code used to not be enabled for 4xx/BookE.
+		 * It is now as I/D cache coherency for these is done at
+		 * set_pte_at() time and I see no reason why the test
+		 * below wouldn't be valid on those processors. This -may-
+		 * break programs compiled with a really old ABI though.
 		 */
 		if (!(vma->vm_flags & VM_EXEC) &&
 		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
 		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
 			goto bad_area;
-#else
-		pte_t *ptep;
-		pmd_t *pmdp;
-
-		/* Since 4xx/Book-E supports per-page execute permission,
-		 * we lazily flush dcache to icache. */
-		ptep = NULL;
-		if (get_pteptr(mm, address, &ptep, &pmdp)) {
-			spinlock_t *ptl = pte_lockptr(mm, pmdp);
-			spin_lock(ptl);
-			if (pte_present(*ptep)) {
-				struct page *page = pte_page(*ptep);
-
-				if (!test_bit(PG_arch_1, &page->flags)) {
-					flush_dcache_icache_page(page);
-					set_bit(PG_arch_1, &page->flags);
-				}
-				pte_update(ptep, 0, _PAGE_HWEXEC);
-				_tlbie(address, mm->context.id);
-				pte_unmap_unlock(ptep, ptl);
-				up_read(&mm->mmap_sem);
-				return 0;
-			}
-			pte_unmap_unlock(ptep, ptl);
-		}
-#endif
 	/* a write */
 	} else if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	/* a read */
 	} else {
 		/* protection fault */
@@ -333,21 +430,52 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
- survive:
-	ret = handle_mm_fault(mm, vma, address, is_write);
-	if (unlikely(ret & VM_FAULT_ERROR)) {
-		if (ret & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (ret & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
+	fault = handle_mm_fault(mm, vma, address, flags);
+	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+		rc = mm_fault_error(regs, address, fault);
+		if (rc >= MM_FAULT_RETURN)
+			goto bail;
+		else
+			rc = 0;
 	}
-	if (ret & VM_FAULT_MAJOR)
-		current->maj_flt++;
-	else
-		current->min_flt++;
+
+	/*
+	 * Major/minor page fault accounting is only done on the
+	 * initial attempt. If we go through a retry, it is extremely
+	 * likely that the page will be found in page cache at that point.
+	 */
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			current->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+				      regs, address);
+#ifdef CONFIG_PPC_SMLPAR
+			if (firmware_has_feature(FW_FEATURE_CMO)) {
+				u32 page_ins;
+
+				preempt_disable();
+				page_ins = be32_to_cpu(get_lppaca()->page_ins);
+				page_ins += 1 << PAGE_FACTOR;
+				get_lppaca()->page_ins = cpu_to_be32(page_ins);
+				preempt_enable();
+			}
+#endif /* CONFIG_PPC_SMLPAR */
+		} else {
+			current->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation. */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+			goto retry;
+		}
+	}
+
 	up_read(&mm->mmap_sem);
-	return 0;
+	goto bail;
 
 bad_area:
 	up_read(&mm->mmap_sem);
@@ -356,44 +484,20 @@ bad_area_nosemaphore:
 	/* User mode accesses cause a SIGSEGV */
 	if (user_mode(regs)) {
 		_exception(SIGSEGV, regs, code, address);
-		return 0;
+		goto bail;
 	}
 
-	if (is_exec && (error_code & DSISR_PROTFAULT)
-	    && printk_ratelimit())
-		printk(KERN_CRIT "kernel tried to execute NX-protected"
-		       " page (%lx) - exploit attempt? (uid: %d)\n",
-		       address, current->uid);
+	if (is_exec && (error_code & DSISR_PROTFAULT))
+		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
+				   " page (%lx) - exploit attempt? (uid: %d)\n",
+				   address, from_kuid(&init_user_ns, current_uid()));
 
-	return SIGSEGV;
+	rc = SIGSEGV;
 
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-	up_read(&mm->mmap_sem);
-	if (is_global_init(current)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
-	printk("VM: killing process %s\n", current->comm);
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);
-	return SIGKILL;
+bail:
+	exception_exit(prev_state);
+	return rc;
 
-do_sigbus:
-	up_read(&mm->mmap_sem);
-	if (user_mode(regs)) {
-		info.si_signo = SIGBUS;
-		info.si_errno = 0;
-		info.si_code = BUS_ADRERR;
-		info.si_addr = (void __user *)address;
-		force_sig_info(SIGBUS, &info, current);
-		return 0;
-	}
-	return SIGBUS;
 }
 
 /*
@@ -404,6 +508,7 @@ do_sigbus:
 void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
 	const struct exception_table_entry *entry;
+	unsigned long *stackend;
 
 	/* Are we prepared to handle this fault?  */
 	if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -432,5 +537,9 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
 		regs->nip);
 
+	stackend = end_of_stack(current);
+	if (current != &init_task && *stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	die("Kernel access of bad area", regs, sig);
 }
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index c93a966b7e4..94cd728166d 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -2,7 +2,7 @@
  * Modifications by Kumar Gala (galak@kernel.crashing.org) to support
  * E500 Book E processors.
  *
- * Copyright 2004 Freescale Semiconductor, Inc
+ * Copyright 2004,2010 Freescale Semiconductor, Inc.
  *
  * This file contains the routines for initializing the MMU
  * on the 4xx series of chips.
@@ -40,6 +40,7 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/highmem.h>
+#include <linux/memblock.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -49,41 +50,34 @@
 #include <asm/mmu.h>
 #include <asm/uaccess.h>
 #include <asm/smp.h>
-#include <asm/bootx.h>
 #include <asm/machdep.h>
 #include <asm/setup.h>
+#include <asm/paca.h>
+
+#include "mmu_decl.h"
 
-extern void loadcam_entry(unsigned int index);
 unsigned int tlbcam_index;
-unsigned int num_tlbcam_entries;
-static unsigned long __cam0, __cam1, __cam2;
-extern unsigned long total_lowmem;
-extern unsigned long __max_low_memory;
-extern unsigned long __initial_memory_limit;
-#define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
-
-#define NUM_TLBCAMS	(16)
-
-struct tlbcam {
-   	u32	MAS0;
-	u32	MAS1;
-	u32	MAS2;
-	u32	MAS3;
-	u32	MAS7;
-} TLBCAM[NUM_TLBCAMS];
+
+#define NUM_TLBCAMS	(64)
+struct tlbcam TLBCAM[NUM_TLBCAMS];
 
 struct tlbcamrange {
-   	unsigned long start;
+	unsigned long start;
 	unsigned long limit;
 	phys_addr_t phys;
 } tlbcam_addrs[NUM_TLBCAMS];
 
 extern unsigned int tlbcam_index;
 
+unsigned long tlbcam_sz(int idx)
+{
+	return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
+}
+
 /*
  * Return PA for this VA if it is mapped by a CAM, or 0
  */
-unsigned long v_mapped_by_tlbcam(unsigned long va)
+phys_addr_t v_mapped_by_tlbcam(unsigned long va)
 {
 	int b;
 	for (b = 0; b < tlbcam_index; ++b)
@@ -95,29 +89,30 @@ unsigned long v_mapped_by_tlbcam(unsigned long va)
 /*
  * Return VA for a given PA or 0 if not mapped
  */
-unsigned long p_mapped_by_tlbcam(unsigned long pa)
+unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
 {
 	int b;
 	for (b = 0; b < tlbcam_index; ++b)
 		if (pa >= tlbcam_addrs[b].phys
-	    	    && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
+			&& pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
 		              +tlbcam_addrs[b].phys)
 			return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
 	return 0;
 }
 
 /*
- * Set up one of the I/D BAT (block address translation) register pairs.
- * The parameters are not checked; in particular size must be a power
- * of 4 between 4k and 256M.
+ * Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
+ * in particular size must be a power of 4 between 4k and the max supported by
+ * an implementation; max may further be limited by what can be represented in
+ * an unsigned long (for example, 32-bit implementations cannot support a 4GB
+ * size).
  */
-void settlbcam(int index, unsigned long virt, phys_addr_t phys,
-		unsigned int size, int flags, unsigned int pid)
+static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
+		unsigned long size, unsigned long flags, unsigned int pid)
 {
-	unsigned int tsize, lz;
+	unsigned int tsize;
 
-	asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size));
-	tsize = (21 - lz) / 2;
+	tsize = __ilog2(size) - 10;
 
 #ifdef CONFIG_SMP
 	if ((flags & _PAGE_NO_CACHE) == 0)
@@ -134,18 +129,16 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 	TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
 	TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
 
-	TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR;
+	TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
 	TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+	if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+		TLBCAM[index].MAS7 = (u64)phys >> 32;
 
-#ifndef CONFIG_KGDB /* want user access for breakpoints */
-	if (flags & _PAGE_USER) {
+	/* Below is unlikely -- only for large user pages or similar */
+	if (pte_user(flags)) {
 	   TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
 	   TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
 	}
-#else
-	TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
-	TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
-#endif
 
 	tlbcam_addrs[index].start = virt;
 	tlbcam_addrs[index].limit = virt + size - 1;
@@ -154,27 +147,77 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 	loadcam_entry(index);
 }
 
-void invalidate_tlbcam_entry(int index)
+unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+			  phys_addr_t phys)
 {
-	TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index);
-	TLBCAM[index].MAS1 = ~MAS1_VALID;
+	unsigned int camsize = __ilog2(ram);
+	unsigned int align = __ffs(virt | phys);
+	unsigned long max_cam;
 
-	loadcam_entry(index);
+	if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+		/* Convert (4^max) kB to (2^max) bytes */
+		max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10;
+		camsize &= ~1U;
+		align &= ~1U;
+	} else {
+		/* Convert (2^max) kB to (2^max) bytes */
+		max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10;
+	}
+
+	if (camsize > align)
+		camsize = align;
+	if (camsize > max_cam)
+		camsize = max_cam;
+
+	return 1UL << camsize;
 }
 
-void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1,
-		unsigned long cam2)
+static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
+					unsigned long ram, int max_cam_idx)
 {
-	settlbcam(0, PAGE_OFFSET, PPC_MEMSTART, cam0, _PAGE_KERNEL, 0);
-	tlbcam_index++;
-	if (cam1) {
-		tlbcam_index++;
-		settlbcam(1, PAGE_OFFSET+cam0, PPC_MEMSTART+cam0, cam1, _PAGE_KERNEL, 0);
-	}
-	if (cam2) {
-		tlbcam_index++;
-		settlbcam(2, PAGE_OFFSET+cam0+cam1, PPC_MEMSTART+cam0+cam1, cam2, _PAGE_KERNEL, 0);
+	int i;
+	unsigned long amount_mapped = 0;
+
+	/* Calculate CAM values */
+	for (i = 0; ram && i < max_cam_idx; i++) {
+		unsigned long cam_sz;
+
+		cam_sz = calc_cam_sz(ram, virt, phys);
+		settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
+
+		ram -= cam_sz;
+		amount_mapped += cam_sz;
+		virt += cam_sz;
+		phys += cam_sz;
 	}
+	tlbcam_index = i;
+
+#ifdef CONFIG_PPC64
+	get_paca()->tcd.esel_next = i;
+	get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+	get_paca()->tcd.esel_first = i;
+#endif
+
+	return amount_mapped;
+}
+
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+{
+	unsigned long virt = PAGE_OFFSET;
+	phys_addr_t phys = memstart_addr;
+
+	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx);
+}
+
+#ifdef CONFIG_PPC32
+
+#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
+#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
+#endif
+
+unsigned long __init mmu_mapin_ram(unsigned long top)
+{
+	return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
 }
 
 /*
@@ -185,53 +228,91 @@ void __init MMU_init_hw(void)
 	flush_instruction_cache();
 }
 
-unsigned long __init mmu_mapin_ram(void)
+void __init adjust_total_lowmem(void)
 {
-	cam_mapin_ram(__cam0, __cam1, __cam2);
+	unsigned long ram;
+	int i;
 
-	return __cam0 + __cam1 + __cam2;
-}
+	/* adjust lowmem size to __max_low_memory */
+	ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
 
+	i = switch_to_as1();
+	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
+	restore_to_as0(i, 0, 0, 1);
 
-void __init
-adjust_total_lowmem(void)
+	pr_info("Memory CAM mapping: ");
+	for (i = 0; i < tlbcam_index - 1; i++)
+		pr_cont("%lu/", tlbcam_sz(i) >> 20);
+	pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20,
+	        (unsigned int)((total_lowmem - __max_low_memory) >> 20));
+
+	memblock_set_current_limit(memstart_addr + __max_low_memory);
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
 {
-	unsigned long max_low_mem = MAX_LOW_MEM;
-	unsigned long cam_max = 0x10000000;
-	unsigned long ram;
+	phys_addr_t limit = first_memblock_base + first_memblock_size;
 
-	/* adjust CAM size to max_low_mem */
-	if (max_low_mem < cam_max)
-		cam_max = max_low_mem;
+	/* 64M mapped initially according to head_fsl_booke.S */
+	memblock_set_current_limit(min_t(u64, limit, 0x04000000));
+}
 
-	/* adjust lowmem size to max_low_mem */
-	if (max_low_mem < total_lowmem)
-		ram = max_low_mem;
-	else
-		ram = total_lowmem;
+#ifdef CONFIG_RELOCATABLE
+int __initdata is_second_reloc;
+notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
+{
+	unsigned long base = KERNELBASE;
 
-	/* Calculate CAM values */
-	__cam0 = 1UL << 2 * (__ilog2(ram) / 2);
-	if (__cam0 > cam_max)
-		__cam0 = cam_max;
-	ram -= __cam0;
-	if (ram) {
-		__cam1 = 1UL << 2 * (__ilog2(ram) / 2);
-		if (__cam1 > cam_max)
-			__cam1 = cam_max;
-		ram -= __cam1;
-	}
-	if (ram) {
-		__cam2 = 1UL << 2 * (__ilog2(ram) / 2);
-		if (__cam2 > cam_max)
-			__cam2 = cam_max;
-		ram -= __cam2;
+	kernstart_addr = start;
+	if (is_second_reloc) {
+		virt_phys_offset = PAGE_OFFSET - memstart_addr;
+		return;
 	}
 
-	printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb,"
-			" CAM2=%ldMb residual: %ldMb\n",
-			__cam0 >> 20, __cam1 >> 20, __cam2 >> 20,
-			(total_lowmem - __cam0 - __cam1 - __cam2) >> 20);
-	__max_low_memory = max_low_mem = __cam0 + __cam1 + __cam2;
-	__initial_memory_limit = __max_low_memory;
+	/*
+	 * Relocatable kernel support based on processing of dynamic
+	 * relocation entries. Before we get the real memstart_addr,
+	 * We will compute the virt_phys_offset like this:
+	 * virt_phys_offset = stext.run - kernstart_addr
+	 *
+	 * stext.run = (KERNELBASE & ~0x3ffffff) +
+	 *				(kernstart_addr & 0x3ffffff)
+	 * When we relocate, we have :
+	 *
+	 *	(kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff)
+	 *
+	 * hence:
+	 *  virt_phys_offset = (KERNELBASE & ~0x3ffffff) -
+	 *                              (kernstart_addr & ~0x3ffffff)
+	 *
+	 */
+	start &= ~0x3ffffff;
+	base &= ~0x3ffffff;
+	virt_phys_offset = base - start;
+	early_get_first_memblock_info(__va(dt_ptr), NULL);
+	/*
+	 * We now get the memstart_addr, then we should check if this
+	 * address is the same as what the PAGE_OFFSET map to now. If
+	 * not we have to change the map of PAGE_OFFSET to memstart_addr
+	 * and do a second relocation.
+	 */
+	if (start != memstart_addr) {
+		int n;
+		long offset = start - memstart_addr;
+
+		is_second_reloc = 1;
+		n = switch_to_as1();
+		/* map a 64M area for the second relocation */
+		if (memstart_addr > start)
+			map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM);
+		else
+			map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
+					0x4000000, CONFIG_LOWMEM_CAM_NUM);
+		restore_to_as0(n, offset, __va(dt_ptr), 1);
+		/* We should never reach here */
+		panic("Relocation error");
+	}
 }
+#endif
+#endif
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
new file mode 100644
index 00000000000..d8746684f60
--- /dev/null
+++ b/arch/powerpc/mm/gup.c
@@ -0,0 +1,235 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#undef DEBUG
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep;
+
+	result = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		result |= _PAGE_RW;
+	mask = result | _PAGE_SPECIAL;
+
+	ptep = pte_offset_kernel(&pmd, addr);
+	do {
+		pte_t pte = ACCESS_ONCE(*ptep);
+		struct page *page;
+		/*
+		 * Similar to the PMD case, NUMA hinting must take slow path
+		 */
+		if (pte_numa(pte))
+			return 0;
+
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		if (!page_cache_get_speculative(page))
+			return 0;
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			return 0;
+		}
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+		next = pmd_addr_end(addr, end);
+		/*
+		 * If we find a splitting transparent hugepage we
+		 * return zero. That will result in taking the slow
+		 * path which will call wait_split_huge_page()
+		 * if the pmd is still in splitting state
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			return 0;
+		if (pmd_huge(pmd) || pmd_large(pmd)) {
+			/*
+			 * NUMA hinting faults need to be handled in the GUP
+			 * slowpath for accounting purposes and so that they
+			 * can be serialised against THP migration.
+			 */
+			if (pmd_numa(pmd))
+				return 0;
+
+			if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
+					 write, pages, nr))
+				return 0;
+		} else if (is_hugepd(pmdp)) {
+			if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = ACCESS_ONCE(*pudp);
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (pud_huge(pud)) {
+			if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
+					 write, pages, nr))
+				return 0;
+		} else if (is_hugepd(pudp)) {
+			if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		return 0;
+
+	pr_devel("  aligned: %lx .. %lx\n", start, end);
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on powerpc.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = ACCESS_ONCE(*pgdp);
+
+		pr_devel("  %016lx: normal pgd %p\n", addr,
+			 (void *)pgd_val(pgd));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			break;
+		if (pgd_huge(pgd)) {
+			if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
+					 write, pages, &nr))
+				break;
+		} else if (is_hugepd(pgdp)) {
+			if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+					addr, next, write, pages, &nr))
+				break;
+		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+
+	local_irq_restore(flags);
+
+	return nr;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	int nr, ret;
+
+	start &= PAGE_MASK;
+	nr = __get_user_pages_fast(start, nr_pages, write, pages);
+	ret = nr;
+
+	if (nr < nr_pages) {
+		pr_devel("  slow path ! nr = %d\n", nr);
+
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+				     nr_pages - nr, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+	}
+
+	return ret;
+}
+
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index e10d76a860d..115347f74ce 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -1,6 +1,4 @@
 /*
- *  $Id: hashtable.S,v 1.6 1999/10/08 01:56:15 paulus Exp $
- *
  *  PowerPC version
  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
  *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
@@ -38,46 +36,16 @@ mmu_hash_lock:
 #endif /* CONFIG_SMP */
 
 /*
- * Sync CPUs with hash_page taking & releasing the hash
- * table lock
- */
-#ifdef CONFIG_SMP
-	.text
-_GLOBAL(hash_page_sync)
-	mfmsr   r10
-	rlwinm  r0,r10,0,17,15          /* clear bit 16 (MSR_EE) */
-	mtmsr   r0
-	lis	r8,mmu_hash_lock@h
-	ori	r8,r8,mmu_hash_lock@l
-	lis	r0,0x0fff
-	b	10f
-11:	lwz	r6,0(r8)
-	cmpwi	0,r6,0
-	bne	11b
-10:	lwarx	r6,0,r8
-	cmpwi	0,r6,0
-	bne-	11b
-	stwcx.	r0,0,r8
-	bne-	10b
-	isync
-	eieio
-	li	r0,0
-	stw	r0,0(r8)
-	mtmsr	r10
-	blr
-#endif /* CONFIG_SMP */
-
-/*
  * Load a PTE into the hash table, if possible.
  * The address is in r4, and r3 contains an access flag:
  * _PAGE_RW (0x400) if a write.
  * r9 contains the SRR1 value, from which we use the MSR_PR bit.
- * SPRG3 contains the physical address of the current task's thread.
+ * SPRG_THREAD contains the physical address of the current task's thread.
  *
  * Returns to the caller if the access is illegal or there is no
  * mapping for the address.  Otherwise it places an appropriate PTE
  * in the hash table and returns from the exception.
- * Uses r0, r3 - r8, ctr, lr.
+ * Uses r0, r3 - r8, r10, ctr, lr.
  */
 	.text
 _GLOBAL(hash_page)
@@ -100,7 +68,7 @@ _GLOBAL(hash_page)
 	/* Get PTE (linux-style) and check access */
 	lis	r0,KERNELBASE@h		/* check if kernel address */
 	cmplw	0,r4,r0
-	mfspr	r8,SPRN_SPRG3		/* current task's THREAD (phys) */
+	mfspr	r8,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
 	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
 	lwz	r5,PGDIR(r8)		/* virt page-table root */
 	blt+	112f			/* assume user more likely */
@@ -108,9 +76,15 @@ _GLOBAL(hash_page)
 	addi	r5,r5,swapper_pg_dir@l	/* kernel page table */
 	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
 112:	add	r5,r5,r7		/* convert to phys addr */
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r5,r4,12,20,29		/* insert top 10 bits of address */
 	lwz	r8,0(r5)		/* get pmd entry */
 	rlwinm.	r8,r8,0,0,19		/* extract address of pte page */
+#else
+	rlwinm	r8,r4,13,19,29		/* Compute pgdir/pmd offset */
+	lwzx	r8,r8,r5		/* Get L1 entry */
+	rlwinm.	r8,r8,0,0,20		/* extract pt base address */
+#endif
 #ifdef CONFIG_SMP
 	beq-	hash_page_out		/* return if no mapping */
 #else
@@ -120,7 +94,11 @@ _GLOBAL(hash_page)
 	   to the address following the rfi. */
 	beqlr-
 #endif
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r8,r4,22,20,29		/* insert next 10 bits of address */
+#else
+	rlwimi	r8,r4,23,20,28		/* compute pte address */
+#endif
 	rlwinm	r0,r3,32-3,24,24	/* _PAGE_RW access -> _PAGE_DIRTY */
 	ori	r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
 
@@ -129,9 +107,15 @@ _GLOBAL(hash_page)
 	 * because almost always, there won't be a permission violation
 	 * and there won't already be an HPTE, and thus we will have
 	 * to update the PTE to set _PAGE_HASHPTE.  -- paulus.
+	 *
+	 * If PTE_64BIT is set, the low word is the flags word; use that
+	 * word for locking since it contains all the interesting bits.
 	 */
+#if (PTE_FLAGS_OFFSET != 0)
+	addi	r8,r8,PTE_FLAGS_OFFSET
+#endif
 retry:
-	lwarx	r6,0,r8			/* get linux-style pte */
+	lwarx	r6,0,r8			/* get linux-style pte, flag word */
 	andc.	r5,r3,r6		/* check access & ~permission */
 #ifdef CONFIG_SMP
 	bne-	hash_page_out		/* return if access not permitted */
@@ -139,6 +123,15 @@ retry:
 	bnelr-
 #endif
 	or	r5,r0,r6		/* set accessed/dirty bits */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+	subf	r10,r6,r8		/* create false data dependency */
+	subi	r10,r10,PTE_FLAGS_OFFSET
+	lwzx	r10,r6,r10		/* Get upper PTE word */
+#else
+	lwz	r10,-PTE_FLAGS_OFFSET(r8)
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_PTE_64BIT */
 	stwcx.	r5,0,r8			/* attempt to update PTE */
 	bne-	retry			/* retry if someone got there first */
 
@@ -191,7 +184,7 @@ _GLOBAL(add_hash_page)
 	add	r3,r3,r0		/* note create_hpte trims to 24 bits */
 
 #ifdef CONFIG_SMP
-	rlwinm	r8,r1,0,0,18		/* use cpu number to make tag */
+	CURRENT_THREAD_INFO(r8, r1)	/* use cpu number to make tag */
 	lwz	r8,TI_CPU(r8)		/* to go in mmu_hash_lock */
 	oris	r8,r8,12
 #endif /* CONFIG_SMP */
@@ -205,9 +198,9 @@ _GLOBAL(add_hash_page)
 	 * we can't take a hash table miss (assuming the code is
 	 * covered by a BAT).  -- paulus
 	 */
-	mfmsr	r10
+	mfmsr	r9
 	SYNC
-	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r9,0,17,15		/* clear bit 16 (MSR_EE) */
 	rlwinm	r0,r0,0,28,26		/* clear MSR_DR */
 	mtmsr	r0
 	SYNC_601
@@ -216,14 +209,14 @@ _GLOBAL(add_hash_page)
 	tophys(r7,0)
 
 #ifdef CONFIG_SMP
-	addis	r9,r7,mmu_hash_lock@ha
-	addi	r9,r9,mmu_hash_lock@l
-10:	lwarx	r0,0,r9			/* take the mmu_hash_lock */
+	addis	r6,r7,mmu_hash_lock@ha
+	addi	r6,r6,mmu_hash_lock@l
+10:	lwarx	r0,0,r6			/* take the mmu_hash_lock */
 	cmpi	0,r0,0
 	bne-	11f
-	stwcx.	r8,0,r9
+	stwcx.	r8,0,r6
 	beq+	12f
-11:	lwz	r0,0(r9)
+11:	lwz	r0,0(r6)
 	cmpi	0,r0,0
 	beq	10b
 	b	11b
@@ -236,10 +229,24 @@ _GLOBAL(add_hash_page)
 	 * HPTE, so we just unlock and return.
 	 */
 	mr	r8,r5
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r8,r4,22,20,29
+#else
+	rlwimi	r8,r4,23,20,28
+	addi	r8,r8,PTE_FLAGS_OFFSET
+#endif
 1:	lwarx	r6,0,r8
 	andi.	r0,r6,_PAGE_HASHPTE
 	bne	9f			/* if HASHPTE already set, done */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+	subf	r10,r6,r8		/* create false data dependency */
+	subi	r10,r10,PTE_FLAGS_OFFSET
+	lwzx	r10,r6,r10		/* Get upper PTE word */
+#else
+	lwz	r10,-PTE_FLAGS_OFFSET(r8)
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_PTE_64BIT */
 	ori	r5,r6,_PAGE_HASHPTE
 	stwcx.	r5,0,r8
 	bne-	1b
@@ -248,13 +255,15 @@ _GLOBAL(add_hash_page)
 
 9:
 #ifdef CONFIG_SMP
+	addis	r6,r7,mmu_hash_lock@ha
+	addi	r6,r6,mmu_hash_lock@l
 	eieio
 	li	r0,0
-	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	stw	r0,0(r6)		/* clear mmu_hash_lock */
 #endif
 
 	/* reenable interrupts and DR */
-	mtmsr	r10
+	mtmsr	r9
 	SYNC_601
 	isync
 
@@ -269,7 +278,8 @@ _GLOBAL(add_hash_page)
  * r5 contains the linux PTE, r6 contains the old value of the
  * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
  * offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off).
+ * -KERNELBASE if it is off).  r10 contains the upper half of
+ * the PTE if CONFIG_PTE_64BIT.
  * On SMP, the caller should have the mmu_hash_lock held.
  * We assume that the caller has (or will) set the _PAGE_HASHPTE
  * bit in the linux PTE in memory.  The value passed in r6 should
@@ -287,7 +297,7 @@ Hash_bits = 12				/* e.g. 256kB hash table */
 Hash_msk = (((1 << Hash_bits) - 1) * 64)
 
 /* defines for the PTE format for 32-bit PPCs */
-#define PTE_SIZE	8
+#define HPTE_SIZE	8
 #define PTEG_SIZE	64
 #define LG_PTEG_SIZE	6
 #define LDPTEu		lwzu
@@ -310,11 +320,16 @@ _GLOBAL(create_hpte)
 	and	r8,r8,r0		/* writable if _RW & _DIRTY */
 	rlwimi	r5,r5,32-1,30,30	/* _PAGE_USER -> PP msb */
 	rlwimi	r5,r5,32-2,31,31	/* _PAGE_USER -> PP lsb */
-	ori	r8,r8,0xe14		/* clear out reserved bits and M */
+	ori	r8,r8,0xe04		/* clear out reserved bits */
 	andc	r8,r5,r8		/* PP = user? (rw&dirty? 2: 3): 0 */
 BEGIN_FTR_SECTION
-	ori	r8,r8,_PAGE_COHERENT	/* set M (coherence required) */
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+	rlwinm	r8,r8,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
+#ifdef CONFIG_PTE_64BIT
+	/* Put the XPN bits into the PTE */
+	rlwimi	r8,r10,8,20,22
+	rlwimi	r8,r10,2,29,29
+#endif
 
 	/* Construct the high word of the PPC-style PTE (r5) */
 	rlwinm	r5,r3,7,1,24		/* put VSID in 0x7fffff80 bits */
@@ -344,8 +359,8 @@ _GLOBAL(hash_page_patch_A)
 
 	/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
 	mtctr	r0
-	addi	r4,r3,-PTE_SIZE
-1:	LDPTEu	r6,PTE_SIZE(r4)		/* get next PTE */
+	addi	r4,r3,-HPTE_SIZE
+1:	LDPTEu	r6,HPTE_SIZE(r4)	/* get next PTE */
 	CMPPTE	0,r6,r5
 	bdnzf	2,1b			/* loop while ctr != 0 && !cr0.eq */
 	beq+	found_slot
@@ -355,9 +370,9 @@ _GLOBAL(hash_page_patch_A)
 _GLOBAL(hash_page_patch_B)
 	xoris	r4,r3,Hash_msk>>16	/* compute secondary hash */
 	xori	r4,r4,(-PTEG_SIZE & 0xffff)
-	addi	r4,r4,-PTE_SIZE
+	addi	r4,r4,-HPTE_SIZE
 	mtctr	r0
-2:	LDPTEu	r6,PTE_SIZE(r4)
+2:	LDPTEu	r6,HPTE_SIZE(r4)
 	CMPPTE	0,r6,r5
 	bdnzf	2,2b
 	beq+	found_slot
@@ -365,8 +380,8 @@ _GLOBAL(hash_page_patch_B)
 
 	/* Search the primary PTEG for an empty slot */
 10:	mtctr	r0
-	addi	r4,r3,-PTE_SIZE		/* search primary PTEG */
-1:	LDPTEu	r6,PTE_SIZE(r4)		/* get next PTE */
+	addi	r4,r3,-HPTE_SIZE	/* search primary PTEG */
+1:	LDPTEu	r6,HPTE_SIZE(r4)	/* get next PTE */
 	TST_V(r6)			/* test valid bit */
 	bdnzf	2,1b			/* loop while ctr != 0 && !cr0.eq */
 	beq+	found_empty
@@ -382,9 +397,9 @@ _GLOBAL(hash_page_patch_B)
 _GLOBAL(hash_page_patch_C)
 	xoris	r4,r3,Hash_msk>>16	/* compute secondary hash */
 	xori	r4,r4,(-PTEG_SIZE & 0xffff)
-	addi	r4,r4,-PTE_SIZE
+	addi	r4,r4,-HPTE_SIZE
 	mtctr	r0
-2:	LDPTEu	r6,PTE_SIZE(r4)
+2:	LDPTEu	r6,HPTE_SIZE(r4)
 	TST_V(r6)
 	bdnzf	2,2b
 	beq+	found_empty
@@ -411,11 +426,11 @@ _GLOBAL(hash_page_patch_C)
 
 1:	addis	r4,r7,next_slot@ha		/* get next evict slot */
 	lwz	r6,next_slot@l(r4)
-	addi	r6,r6,PTE_SIZE			/* search for candidate */
-	andi.	r6,r6,7*PTE_SIZE
+	addi	r6,r6,HPTE_SIZE			/* search for candidate */
+	andi.	r6,r6,7*HPTE_SIZE
 	stw	r6,next_slot@l(r4)
 	add	r4,r3,r6
-	LDPTE	r0,PTE_SIZE/2(r4)		/* get PTE second word */
+	LDPTE	r0,HPTE_SIZE/2(r4)		/* get PTE second word */
 	clrrwi	r0,r0,12
 	lis	r6,etext@h
 	ori	r6,r6,etext@l			/* get etext */
@@ -428,7 +443,7 @@ _GLOBAL(hash_page_patch_C)
 found_empty:
 	STPTE	r5,0(r4)
 found_slot:
-	STPTE	r8,PTE_SIZE/2(r4)
+	STPTE	r8,HPTE_SIZE/2(r4)
 
 #else /* CONFIG_SMP */
 /*
@@ -454,7 +469,7 @@ found_slot:
 	STPTE	r5,0(r4)
 	sync
 	TLBSYNC
-	STPTE	r8,PTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
+	STPTE	r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
 	sync
 	SET_V(r5)
 	STPTE	r5,0(r4)	/* finally set V bit in PTE */
@@ -501,14 +516,18 @@ _GLOBAL(flush_hash_pages)
 	isync
 
 	/* First find a PTE in the range that has _PAGE_HASHPTE set */
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r5,r4,22,20,29
-1:	lwz	r0,0(r5)
+#else
+	rlwimi	r5,r4,23,20,28
+#endif
+1:	lwz	r0,PTE_FLAGS_OFFSET(r5)
 	cmpwi	cr1,r6,1
 	andi.	r0,r0,_PAGE_HASHPTE
 	bne	2f
 	ble	cr1,19f
 	addi	r4,r4,0x1000
-	addi	r5,r5,4
+	addi	r5,r5,PTE_SIZE
 	addi	r6,r6,-1
 	b	1b
 
@@ -526,7 +545,7 @@ _GLOBAL(flush_hash_pages)
 #ifdef CONFIG_SMP
 	addis	r9,r7,mmu_hash_lock@ha
 	addi	r9,r9,mmu_hash_lock@l
-	rlwinm	r8,r1,0,0,18
+	CURRENT_THREAD_INFO(r8, r1)
 	add	r8,r8,r7
 	lwz	r8,TI_CPU(r8)
 	oris	r8,r8,9
@@ -547,7 +566,10 @@ _GLOBAL(flush_hash_pages)
 	 * already clear, we're done (for this pte).  If not,
 	 * clear it (atomically) and proceed.  -- paulus.
 	 */
-33:	lwarx	r8,0,r5			/* fetch the pte */
+#if (PTE_FLAGS_OFFSET != 0)
+	addi	r5,r5,PTE_FLAGS_OFFSET
+#endif
+33:	lwarx	r8,0,r5			/* fetch the pte flags word */
 	andi.	r0,r8,_PAGE_HASHPTE
 	beq	8f			/* done if HASHPTE is already clear */
 	rlwinm	r8,r8,0,31,29		/* clear HASHPTE bit */
@@ -564,8 +586,8 @@ _GLOBAL(flush_hash_patch_A)
 	/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
 	li	r0,8			/* PTEs/group */
 	mtctr	r0
-	addi	r12,r8,-PTE_SIZE
-1:	LDPTEu	r0,PTE_SIZE(r12)	/* get next PTE */
+	addi	r12,r8,-HPTE_SIZE
+1:	LDPTEu	r0,HPTE_SIZE(r12)	/* get next PTE */
 	CMPPTE	0,r0,r11
 	bdnzf	2,1b			/* loop while ctr != 0 && !cr0.eq */
 	beq+	3f
@@ -576,9 +598,9 @@ _GLOBAL(flush_hash_patch_A)
 _GLOBAL(flush_hash_patch_B)
 	xoris	r12,r8,Hash_msk>>16	/* compute secondary hash */
 	xori	r12,r12,(-PTEG_SIZE & 0xffff)
-	addi	r12,r12,-PTE_SIZE
+	addi	r12,r12,-HPTE_SIZE
 	mtctr	r0
-2:	LDPTEu	r0,PTE_SIZE(r12)
+2:	LDPTEu	r0,HPTE_SIZE(r12)
 	CMPPTE	0,r0,r11
 	bdnzf	2,2b
 	xori	r11,r11,PTE_H		/* clear H again */
@@ -592,7 +614,7 @@ _GLOBAL(flush_hash_patch_B)
 
 8:	ble	cr1,9f			/* if all ptes checked */
 81:	addi	r6,r6,-1
-	addi	r5,r5,4			/* advance to next pte */
+	addi	r5,r5,PTE_SIZE
 	addi	r4,r4,0x1000
 	lwz	r0,0(r5)		/* check next pte */
 	cmpwi	cr1,r6,1
@@ -611,3 +633,80 @@ _GLOBAL(flush_hash_patch_B)
 	SYNC_601
 	isync
 	blr
+
+/*
+ * Flush an entry from the TLB
+ */
+_GLOBAL(_tlbie)
+#ifdef CONFIG_SMP
+	CURRENT_THREAD_INFO(r8, r1)
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,11
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+	lis	r9,mmu_hash_lock@h
+	ori	r9,r9,mmu_hash_lock@l
+	tophys(r9,r9)
+10:	lwarx	r7,0,r9
+	cmpwi	0,r7,0
+	bne-	10b
+	stwcx.	r8,0,r9
+	bne-	10b
+	eieio
+	tlbie	r3
+	sync
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	mtmsr	r10
+	SYNC_601
+	isync
+#else /* CONFIG_SMP */
+	tlbie	r3
+	sync
+#endif /* CONFIG_SMP */
+	blr
+
+/*
+ * Flush the entire TLB. 603/603e only
+ */
+_GLOBAL(_tlbia)
+#if defined(CONFIG_SMP)
+	CURRENT_THREAD_INFO(r8, r1)
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,10
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+	lis	r9,mmu_hash_lock@h
+	ori	r9,r9,mmu_hash_lock@l
+	tophys(r9,r9)
+10:	lwarx	r7,0,r9
+	cmpwi	0,r7,0
+	bne-	10b
+	stwcx.	r8,0,r9
+	bne-	10b
+	sync
+	tlbia
+	sync
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	mtmsr	r10
+	SYNC_601
+	isync
+#else /* CONFIG_SMP */
+	sync
+	tlbia
+	sync
+#endif /* CONFIG_SMP */
+	blr
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 21d24848647..057cbbb4c57 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -34,14 +34,6 @@
  *         |   CR save area			(SP + 8)
  * SP ---> +-- Back chain			(SP + 0)
  */
-#define STACKFRAMESIZE	256
-
-/* Save parameters offsets */
-#define STK_PARM(i)	(STACKFRAMESIZE + 48 + ((i)-3)*8)
-
-/* Save non-volatile offsets */
-#define STK_REG(i)	(112 + ((i)-14)*8)
-
 
 #ifndef CONFIG_PPC_64K_PAGES
 
@@ -64,25 +56,22 @@ _GLOBAL(__hash_page_4K)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
 	/* Save all params that we need after a function call */
-	std	r6,STK_PARM(r6)(r1)
-	std	r8,STK_PARM(r8)(r1)
-	std	r9,STK_PARM(r9)(r1)
+	std	r6,STK_PARAM(R6)(r1)
+	std	r8,STK_PARAM(R8)(r1)
+	std	r9,STK_PARAM(R9)(r1)
 	
-	/* Add _PAGE_PRESENT to access */
-	ori	r4,r4,_PAGE_PRESENT
-
 	/* Save non-volatile registers.
 	 * r31 will hold "old PTE"
 	 * r30 is "new PTE"
-	 * r29 is "va"
+	 * r29 is vpn
 	 * r28 is a hash value
 	 * r27 is hashtab mask (maybe dynamic patched instead ?)
 	 */
-	std	r27,STK_REG(r27)(r1)
-	std	r28,STK_REG(r28)(r1)
-	std	r29,STK_REG(r29)(r1)
-	std	r30,STK_REG(r30)(r1)
-	std	r31,STK_REG(r31)(r1)
+	std	r27,STK_REG(R27)(r1)
+	std	r28,STK_REG(R28)(r1)
+	std	r29,STK_REG(R29)(r1)
+	std	r30,STK_REG(R30)(r1)
+	std	r31,STK_REG(R31)(r1)
 	
 	/* Step 1:
 	 *
@@ -121,26 +110,34 @@ _GLOBAL(__hash_page_4K)
 BEGIN_FTR_SECTION
 	cmpdi	r9,0			/* check segment size */
 	bne	3f
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
-	/* Calc va and put it in r29 */
-	rldicr	r29,r5,28,63-28
-	rldicl	r3,r3,0,36
-	or	r29,r3,r29
-
-	/* Calculate hash value for primary slot and store it in r28 */
-	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
-	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
-	xor	r28,r5,r0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+	/* Calc vpn and put it in r29 */
+	sldi	r29,r5,SID_SHIFT - VPN_SHIFT
+	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+	or	r29,r28,r29
+	/*
+	 * Calculate hash value for primary slot and store it in r28
+	 * r3 = va, r5 = vsid
+	 * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+	 */
+	rldicl	r0,r3,64-12,48
+	xor	r28,r5,r0		/* hash */
 	b	4f
 
-3:	/* Calc VA and hash in r29 and r28 for 1T segment */
-	sldi	r29,r5,40		/* vsid << 40 */
-	clrldi	r3,r3,24		/* ea & 0xffffffffff */
-	rldic	r28,r5,25,25		/* (vsid << 25) & 0x7fffffffff */
-	clrldi	r5,r5,40		/* vsid & 0xffffff */
-	rldicl	r0,r3,64-12,36		/* (ea >> 12) & 0xfffffff */
-	xor	r28,r28,r5
-	or	r29,r3,r29		/* VA */
+3:	/* Calc vpn and put it in r29 */
+	sldi	r29,r5,SID_SHIFT_1T - VPN_SHIFT
+	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+	or	r29,r28,r29
+
+	/*
+	 * calculate hash value for primary slot and
+	 * store it in r28 for 1T segment
+	 * r3 = va, r5 = vsid
+	 */
+	sldi	r28,r5,25		/* vsid << 25 */
+	/* r0 =  (va >> 12) & ((1ul << (40 - 12)) -1) */
+	rldicl	r0,r3,64-12,36
+	xor	r28,r28,r5		/* vsid ^ ( vsid << 25) */
 	xor	r28,r28,r0		/* hash */
 
 	/* Convert linux PTE bits into HW equivalents */
@@ -151,7 +148,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...) 
@@ -159,13 +159,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
 BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
-	bl	.hash_page_do_lazy_icache
+	bl	hash_page_do_lazy_icache
 END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
 	 */
-	std	r3,STK_PARM(r4)(r1)
+	std	r3,STK_PARAM(R4)(r1)
 
 	/* Get htab_hash_mask */
 	ld	r4,htab_hash_mask@got(2)
@@ -195,12 +195,14 @@ htab_insert_pte:
 	rldicr	r3,r0,3,63-3		/* r3 = (hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
-	mr	r4,r29			/* Retreive va */
+	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve vpn */
 	li	r7,0			/* !bolted, !secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARM(r9)(r1)	/* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
 	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge	htab_pte_insert_ok	/* Insertion successful */
@@ -218,12 +220,14 @@ _GLOBAL(htab_call_hpte_insert1)
 	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
 	
 	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
-	mr	r4,r29			/* Retreive va */
+	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve vpn */
 	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARM(r9)(r1)	/* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
 	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge+	htab_pte_insert_ok	/* Insertion successful */
@@ -240,7 +244,8 @@ _GLOBAL(htab_call_hpte_insert2)
 2:	and	r0,r5,r27
 	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	
 	/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
 	bl	.			/* Patched by htab_finish_init() */
 
 	/* Try all again */
@@ -258,15 +263,15 @@ htab_pte_insert_ok:
 	 * (maybe add eieio may be good still ?)
 	 */
 htab_write_out_pte:
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	std	r30,0(r6)
 	li	r3, 0
 htab_bail:
-	ld	r27,STK_REG(r27)(r1)
-	ld	r28,STK_REG(r28)(r1)
-	ld	r29,STK_REG(r29)(r1)
-	ld      r30,STK_REG(r30)(r1)
-	ld      r31,STK_REG(r31)(r1)
+	ld	r27,STK_REG(R27)(r1)
+	ld	r28,STK_REG(R28)(r1)
+	ld	r29,STK_REG(R29)(r1)
+	ld      r30,STK_REG(R30)(r1)
+	ld      r31,STK_REG(R31)(r1)
 	addi    r1,r1,STACKFRAMESIZE
 	ld      r0,16(r1)
 	mtlr    r0
@@ -289,11 +294,13 @@ htab_modify_pte:
 	add	r3,r0,r3	/* add slot idx */
 
 	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* va */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARM(r9)(r1)	/* segment size */
-	ld	r8,STK_PARM(r8)(r1)	/* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+	mr	r5,r29			/* vpn */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
 	bl	.			/* Patched by htab_finish_init() */
 
 	/* if we failed because typically the HPTE wasn't really here
@@ -315,7 +322,7 @@ htab_wrong_access:
 
 htab_pte_insert_failure:
 	/* Bail out restoring old PTE */
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	std	r31,0(r6)
 	li	r3,-1
 	b	htab_bail
@@ -343,29 +350,26 @@ _GLOBAL(__hash_page_4K)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
 	/* Save all params that we need after a function call */
-	std	r6,STK_PARM(r6)(r1)
-	std	r8,STK_PARM(r8)(r1)
-	std	r9,STK_PARM(r9)(r1)
-
-	/* Add _PAGE_PRESENT to access */
-	ori	r4,r4,_PAGE_PRESENT
+	std	r6,STK_PARAM(R6)(r1)
+	std	r8,STK_PARAM(R8)(r1)
+	std	r9,STK_PARAM(R9)(r1)
 
 	/* Save non-volatile registers.
 	 * r31 will hold "old PTE"
 	 * r30 is "new PTE"
-	 * r29 is "va"
+	 * r29 is vpn
 	 * r28 is a hash value
 	 * r27 is hashtab mask (maybe dynamic patched instead ?)
 	 * r26 is the hidx mask
 	 * r25 is the index in combo page
 	 */
-	std	r25,STK_REG(r25)(r1)
-	std	r26,STK_REG(r26)(r1)
-	std	r27,STK_REG(r27)(r1)
-	std	r28,STK_REG(r28)(r1)
-	std	r29,STK_REG(r29)(r1)
-	std	r30,STK_REG(r30)(r1)
-	std	r31,STK_REG(r31)(r1)
+	std	r25,STK_REG(R25)(r1)
+	std	r26,STK_REG(R26)(r1)
+	std	r27,STK_REG(R27)(r1)
+	std	r28,STK_REG(R28)(r1)
+	std	r29,STK_REG(R29)(r1)
+	std	r30,STK_REG(R30)(r1)
+	std	r31,STK_REG(R31)(r1)
 
 	/* Step 1:
 	 *
@@ -388,7 +392,7 @@ _GLOBAL(__hash_page_4K)
 	 */
 	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
 	or	r30,r30,r31
-	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
 	oris	r30,r30,_PAGE_COMBO@h
 	/* Write the linux PTE atomically (setting busy) */
 	stdcx.	r30,0,r6
@@ -407,26 +411,42 @@ _GLOBAL(__hash_page_4K)
 BEGIN_FTR_SECTION
 	cmpdi	r9,0			/* check segment size */
 	bne	3f
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
-	/* Calc va and put it in r29 */
-	rldicr	r29,r5,28,63-28		/* r29 = (vsid << 28) */
-	rldicl	r3,r3,0,36		/* r3 = (ea & 0x0fffffff) */
-	or	r29,r3,r29		/* r29 = va */
-
-	/* Calculate hash value for primary slot and store it in r28 */
-	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
-	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
-	xor	r28,r5,r0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+	/* Calc vpn and put it in r29 */
+	sldi	r29,r5,SID_SHIFT - VPN_SHIFT
+	/*
+	 * clrldi r3,r3,64 - SID_SHIFT -->  ea & 0xfffffff
+	 * srdi	 r28,r3,VPN_SHIFT
+	 */
+	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+	or	r29,r28,r29
+	/*
+	 * Calculate hash value for primary slot and store it in r28
+	 * r3 = va, r5 = vsid
+	 * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+	 */
+	rldicl	r0,r3,64-12,48
+	xor	r28,r5,r0		/* hash */
 	b	4f
 
-3:	/* Calc VA and hash in r29 and r28 for 1T segment */
-	sldi	r29,r5,40		/* vsid << 40 */
-	clrldi	r3,r3,24		/* ea & 0xffffffffff */
-	rldic	r28,r5,25,25		/* (vsid << 25) & 0x7fffffffff */
-	clrldi	r5,r5,40		/* vsid & 0xffffff */
-	rldicl	r0,r3,64-12,36		/* (ea >> 12) & 0xfffffff */
-	xor	r28,r28,r5
-	or	r29,r3,r29		/* VA */
+3:	/* Calc vpn and put it in r29 */
+	sldi	r29,r5,SID_SHIFT_1T - VPN_SHIFT
+	/*
+	 * clrldi r3,r3,64 - SID_SHIFT_1T -->  ea & 0xffffffffff
+	 * srdi	r28,r3,VPN_SHIFT
+	 */
+	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+	or	r29,r28,r29
+
+	/*
+	 * Calculate hash value for primary slot and
+	 * store it in r28  for 1T segment
+	 * r3 = va, r5 = vsid
+	 */
+	sldi	r28,r5,25		/* vsid << 25 */
+	/* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+	rldicl	r0,r3,64-12,36
+	xor	r28,r28,r5		/* vsid ^ ( vsid << 25) */
 	xor	r28,r28,r0		/* hash */
 
 	/* Convert linux PTE bits into HW equivalents */
@@ -444,7 +464,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r3,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...)
@@ -452,13 +475,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
 BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
-	bl	.hash_page_do_lazy_icache
+	bl	hash_page_do_lazy_icache
 END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
 	 */
-	std	r3,STK_PARM(r4)(r1)
+	std	r3,STK_PARAM(R4)(r1)
 
 	/* Get htab_hash_mask */
 	ld	r4,htab_hash_mask@got(2)
@@ -468,7 +491,7 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 	 * go to out-of-line code to try to modify the HPTE. We look for
 	 * the bit at (1 >> (index + 32))
 	 */
-	andi.	r0,r31,_PAGE_HASHPTE
+	rldicl.	r0,r31,64-12,48
 	li	r26,0			/* Default hidx */
 	beq	htab_insert_pte
 
@@ -479,8 +502,8 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 	andis.	r0,r31,_PAGE_COMBO@h
 	beq	htab_inval_old_hpte
 
-	ld	r6,STK_PARM(r6)(r1)
-	ori	r26,r6,0x8000		/* Load the hidx mask */
+	ld	r6,STK_PARAM(R6)(r1)
+	ori	r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
 	ld	r26,0(r26)
 	addi	r5,r25,36		/* Check actual HPTE_SUB bit, this */
 	rldcr.	r0,r31,r5,0		/* must match pgtable.h definition */
@@ -501,12 +524,14 @@ htab_special_pfn:
 	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
-	mr	r4,r29			/* Retreive va */
+	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve vpn */
 	li	r7,0			/* !bolted, !secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARM(r9)(r1)	/* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge	htab_pte_insert_ok	/* Insertion successful */
@@ -528,12 +553,14 @@ _GLOBAL(htab_call_hpte_insert1)
 	rldicr	r3,r0,3,63-3		/* r0 = (~hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
-	mr	r4,r29			/* Retreive va */
+	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve vpn */
 	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARM(r9)(r1)	/* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge+	htab_pte_insert_ok	/* Insertion successful */
@@ -550,7 +577,8 @@ _GLOBAL(htab_call_hpte_insert2)
 2:	and	r0,r5,r27
 	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
 	/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
 	bl	.			/* patched by htab_finish_init() */
 
 	/* Try all again */
@@ -561,13 +589,17 @@ _GLOBAL(htab_call_hpte_remove)
 	 * useless now that the segment has been switched to 4k pages.
 	 */
 htab_inval_old_hpte:
-	mr	r3,r29			/* virtual addr */
+	mr	r3,r29			/* vpn */
 	mr	r4,r31			/* PTE.pte */
 	li	r5,0			/* PTE.hidx */
 	li	r6,MMU_PAGE_64K		/* psize */
-	ld	r7,STK_PARM(r9)(r1)	/* ssize */
-	ld	r8,STK_PARM(r8)(r1)	/* local */
-	bl	.flush_hash_page
+	ld	r7,STK_PARAM(R9)(r1)	/* ssize */
+	ld	r8,STK_PARAM(R8)(r1)	/* local */
+	bl	flush_hash_page
+	/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
+	lis	r0,_PAGE_HPTE_SUB@h
+	ori	r0,r0,_PAGE_HPTE_SUB@l
+	andc	r30,r30,r0
 	b	htab_insert_pte
 	
 htab_bail_ok:
@@ -578,7 +610,7 @@ htab_pte_insert_ok:
 	/* Insert slot number & secondary bit in PTE second half,
 	 * clear _PAGE_BUSY and set approriate HPTE slot bit
 	 */
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	li	r0,_PAGE_BUSY
 	andc	r30,r30,r0
 	/* HPTE SUB bit */
@@ -593,19 +625,19 @@ htab_pte_insert_ok:
 	sld	r4,r4,r5
 	andc	r26,r26,r4
 	or	r26,r26,r3
-	ori	r5,r6,0x8000
+	ori	r5,r6,PTE_PAGE_HIDX_OFFSET
 	std	r26,0(r5)
 	lwsync
 	std	r30,0(r6)
 	li	r3, 0
 htab_bail:
-	ld	r25,STK_REG(r25)(r1)
-	ld	r26,STK_REG(r26)(r1)
-	ld	r27,STK_REG(r27)(r1)
-	ld	r28,STK_REG(r28)(r1)
-	ld	r29,STK_REG(r29)(r1)
-	ld      r30,STK_REG(r30)(r1)
-	ld      r31,STK_REG(r31)(r1)
+	ld	r25,STK_REG(R25)(r1)
+	ld	r26,STK_REG(R26)(r1)
+	ld	r27,STK_REG(R27)(r1)
+	ld	r28,STK_REG(R28)(r1)
+	ld	r29,STK_REG(R29)(r1)
+	ld      r30,STK_REG(R30)(r1)
+	ld      r31,STK_REG(R31)(r1)
 	addi    r1,r1,STACKFRAMESIZE
 	ld      r0,16(r1)
 	mtlr    r0
@@ -630,11 +662,13 @@ htab_modify_pte:
 	add	r3,r0,r3	/* add slot idx */
 
 	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* va */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARM(r9)(r1)	/* segment size */
-	ld	r8,STK_PARM(r8)(r1)	/* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+	mr	r5,r29			/* vpn */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
 	bl	.			/* patched by htab_finish_init() */
 
 	/* if we failed because typically the HPTE wasn't really here
@@ -646,7 +680,7 @@ _GLOBAL(htab_call_hpte_updatepp)
 	/* Clear the BUSY bit and Write out the PTE */
 	li	r0,_PAGE_BUSY
 	andc	r30,r30,r0
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	std	r30,0(r6)
 	li	r3,0
 	b	htab_bail
@@ -659,7 +693,7 @@ htab_wrong_access:
 
 htab_pte_insert_failure:
 	/* Bail out restoring old PTE */
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	std	r31,0(r6)
 	li	r3,-1
 	b	htab_bail
@@ -679,25 +713,22 @@ _GLOBAL(__hash_page_64K)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
 	/* Save all params that we need after a function call */
-	std	r6,STK_PARM(r6)(r1)
-	std	r8,STK_PARM(r8)(r1)
-	std	r9,STK_PARM(r9)(r1)
-
-	/* Add _PAGE_PRESENT to access */
-	ori	r4,r4,_PAGE_PRESENT
+	std	r6,STK_PARAM(R6)(r1)
+	std	r8,STK_PARAM(R8)(r1)
+	std	r9,STK_PARAM(R9)(r1)
 
 	/* Save non-volatile registers.
 	 * r31 will hold "old PTE"
 	 * r30 is "new PTE"
-	 * r29 is "va"
+	 * r29 is vpn
 	 * r28 is a hash value
 	 * r27 is hashtab mask (maybe dynamic patched instead ?)
 	 */
-	std	r27,STK_REG(r27)(r1)
-	std	r28,STK_REG(r28)(r1)
-	std	r29,STK_REG(r29)(r1)
-	std	r30,STK_REG(r30)(r1)
-	std	r31,STK_REG(r31)(r1)
+	std	r27,STK_REG(R27)(r1)
+	std	r28,STK_REG(R28)(r1)
+	std	r29,STK_REG(R29)(r1)
+	std	r30,STK_REG(R30)(r1)
+	std	r31,STK_REG(R31)(r1)
 
 	/* Step 1:
 	 *
@@ -720,13 +751,13 @@ BEGIN_FTR_SECTION
 	andi.	r0,r31,_PAGE_NO_CACHE
 	/* If so, bail out and refault as a 4k page */
 	bne-	ht64_bail_ok
-END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE)
 	/* Prepare new PTE value (turn access RW into DIRTY, then
-	 * add BUSY,HASHPTE and ACCESSED)
+	 * add BUSY and ACCESSED)
 	 */
 	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
 	or	r30,r30,r31
-	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
 	/* Write the linux PTE atomically (setting busy) */
 	stdcx.	r30,0,r6
 	bne-	1b
@@ -741,26 +772,33 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
 BEGIN_FTR_SECTION
 	cmpdi	r9,0			/* check segment size */
 	bne	3f
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
-	/* Calc va and put it in r29 */
-	rldicr	r29,r5,28,63-28
-	rldicl	r3,r3,0,36
-	or	r29,r3,r29
-
-	/* Calculate hash value for primary slot and store it in r28 */
-	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
-	rldicl	r0,r3,64-16,52		/* (ea >> 16) & 0xfff */
-	xor	r28,r5,r0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+	/* Calc vpn and put it in r29 */
+	sldi	r29,r5,SID_SHIFT - VPN_SHIFT
+	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+	or	r29,r28,r29
+
+	/* Calculate hash value for primary slot and store it in r28
+	 * r3 = va, r5 = vsid
+	 * r0 = (va >> 16) & ((1ul << (28 - 16)) -1)
+	 */
+	rldicl	r0,r3,64-16,52
+	xor	r28,r5,r0		/* hash */
 	b	4f
 
-3:	/* Calc VA and hash in r29 and r28 for 1T segment */
-	sldi	r29,r5,40		/* vsid << 40 */
-	clrldi	r3,r3,24		/* ea & 0xffffffffff */
-	rldic	r28,r5,25,25		/* (vsid << 25) & 0x7fffffffff */
-	clrldi	r5,r5,40		/* vsid & 0xffffff */
-	rldicl	r0,r3,64-16,40		/* (ea >> 16) & 0xffffff */
-	xor	r28,r28,r5
-	or	r29,r3,r29		/* VA */
+3:	/* Calc vpn and put it in r29 */
+	sldi	r29,r5,SID_SHIFT_1T - VPN_SHIFT
+	rldicl  r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+	or	r29,r28,r29
+	/*
+	 * calculate hash value for primary slot and
+	 * store it in r28 for 1T segment
+	 * r3 = va, r5 = vsid
+	 */
+	sldi	r28,r5,25		/* vsid << 25 */
+	/* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */
+	rldicl	r0,r3,64-16,40
+	xor	r28,r28,r5		/* vsid ^ ( vsid << 25) */
 	xor	r28,r28,r0		/* hash */
 
 	/* Convert linux PTE bits into HW equivalents */
@@ -771,7 +809,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...)
@@ -779,13 +820,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
 BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
-	bl	.hash_page_do_lazy_icache
+	bl	hash_page_do_lazy_icache
 END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
 	 */
-	std	r3,STK_PARM(r4)(r1)
+	std	r3,STK_PARAM(R4)(r1)
 
 	/* Get htab_hash_mask */
 	ld	r4,htab_hash_mask@got(2)
@@ -794,18 +835,21 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 	/* Check if we may already be in the hashtable, in this case, we
 	 * go to out-of-line code to try to modify the HPTE
 	 */
-	andi.	r0,r31,_PAGE_HASHPTE
+	rldicl.	r0,r31,64-12,48
 	bne	ht64_modify_pte
 
 ht64_insert_pte:
 	/* Clear hpte bits in new pte (we also clear BUSY btw) and
-	 * add _PAGE_HASHPTE
+	 * add _PAGE_HPTE_SUB0
 	 */
 	lis	r0,_PAGE_HPTEFLAGS@h
 	ori	r0,r0,_PAGE_HPTEFLAGS@l
 	andc	r30,r30,r0
+#ifdef CONFIG_PPC_64K_PAGES
+	oris	r30,r30,_PAGE_HPTE_SUB0@h
+#else
 	ori	r30,r30,_PAGE_HASHPTE
-
+#endif
 	/* Phyical address in r5 */
 	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
 	sldi	r5,r5,PAGE_SHIFT
@@ -815,12 +859,14 @@ ht64_insert_pte:
 	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
-	mr	r4,r29			/* Retreive va */
+	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve vpn */
 	li	r7,0			/* !bolted, !secondary */
 	li	r8,MMU_PAGE_64K
-	ld	r9,STK_PARM(r9)(r1)	/* segment size */
-_GLOBAL(ht64_call_hpte_insert1)
+	li	r9,MMU_PAGE_64K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
+.globl ht64_call_hpte_insert1
+ht64_call_hpte_insert1:
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge	ht64_pte_insert_ok	/* Insertion successful */
@@ -838,12 +884,14 @@ _GLOBAL(ht64_call_hpte_insert1)
 	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
-	mr	r4,r29			/* Retreive va */
+	ld	r6,STK_PARAM(R4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve vpn */
 	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
 	li	r8,MMU_PAGE_64K
-	ld	r9,STK_PARM(r9)(r1)	/* segment size */
-_GLOBAL(ht64_call_hpte_insert2)
+	li	r9,MMU_PAGE_64K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
+.globl ht64_call_hpte_insert2
+ht64_call_hpte_insert2:
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge+	ht64_pte_insert_ok	/* Insertion successful */
@@ -860,7 +908,8 @@ _GLOBAL(ht64_call_hpte_insert2)
 2:	and	r0,r5,r27
 	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
 	/* Call ppc_md.hpte_remove */
-_GLOBAL(ht64_call_hpte_remove)
+.globl ht64_call_hpte_remove
+ht64_call_hpte_remove:
 	bl	.			/* patched by htab_finish_init() */
 
 	/* Try all again */
@@ -878,15 +927,15 @@ ht64_pte_insert_ok:
 	 * (maybe add eieio may be good still ?)
 	 */
 ht64_write_out_pte:
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	std	r30,0(r6)
 	li	r3, 0
 ht64_bail:
-	ld	r27,STK_REG(r27)(r1)
-	ld	r28,STK_REG(r28)(r1)
-	ld	r29,STK_REG(r29)(r1)
-	ld      r30,STK_REG(r30)(r1)
-	ld      r31,STK_REG(r31)(r1)
+	ld	r27,STK_REG(R27)(r1)
+	ld	r28,STK_REG(R28)(r1)
+	ld	r29,STK_REG(R29)(r1)
+	ld      r30,STK_REG(R30)(r1)
+	ld      r31,STK_REG(R31)(r1)
 	addi    r1,r1,STACKFRAMESIZE
 	ld      r0,16(r1)
 	mtlr    r0
@@ -909,11 +958,13 @@ ht64_modify_pte:
 	add	r3,r0,r3	/* add slot idx */
 
 	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* va */
-	li	r6,MMU_PAGE_64K
-	ld	r7,STK_PARM(r9)(r1)	/* segment size */
-	ld	r8,STK_PARM(r8)(r1)	/* get "local" param */
-_GLOBAL(ht64_call_hpte_updatepp)
+	mr	r5,r29			/* vpn */
+	li	r6,MMU_PAGE_64K		/* base page size */
+	li	r7,MMU_PAGE_64K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
+.globl ht64_call_hpte_updatepp
+ht64_call_hpte_updatepp:
 	bl	.			/* patched by htab_finish_init() */
 
 	/* if we failed because typically the HPTE wasn't really here
@@ -935,7 +986,7 @@ ht64_wrong_access:
 
 ht64_pte_insert_failure:
 	/* Bail out restoring old PTE */
-	ld	r6,STK_PARM(r6)(r1)
+	ld	r6,STK_PARAM(R6)(r1)
 	std	r31,0(r6)
 	li	r3,-1
 	b	ht64_bail
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 34e5c0b219b..cf1d325eae8 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -14,10 +14,10 @@
 
 #include <linux/spinlock.h>
 #include <linux/bitops.h>
+#include <linux/of.h>
 #include <linux/threads.h>
 #include <linux/smp.h>
 
-#include <asm/abs_addr.h>
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
@@ -27,6 +27,7 @@
 #include <asm/cputable.h>
 #include <asm/udbg.h>
 #include <asm/kexec.h>
+#include <asm/ppc-opcode.h>
 
 #ifdef DEBUG_LOW
 #define DBG_LOW(fmt...) udbg_printf(fmt)
@@ -34,52 +35,110 @@
 #define DBG_LOW(fmt...)
 #endif
 
+#ifdef __BIG_ENDIAN__
 #define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
 
-static DEFINE_SPINLOCK(native_tlbie_lock);
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
-static inline void __tlbie(unsigned long va, int psize, int ssize)
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 {
+	unsigned long va;
 	unsigned int penc;
+	unsigned long sllp;
 
-	/* clear top 16 bits, non SLS segment */
+	/*
+	 * We need 14 to 65 bits of va for a tlibe of 4K page
+	 * With vpn we ignore the lower VPN_SHIFT bits already.
+	 * And top two bits are already ignored because we can
+	 * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT
+	 * of 12.
+	 */
+	va = vpn << VPN_SHIFT;
+	/*
+	 * clear top 16 bits of 64bit va, non SLS segment
+	 * Older versions of the architecture (2.02 and earler) require the
+	 * masking of the top 16 bits.
+	 */
 	va &= ~(0xffffULL << 48);
 
 	switch (psize) {
 	case MMU_PAGE_4K:
-		va &= ~0xffful;
+		/* clear out bits after (52) [0....52.....63] */
+		va &= ~((1ul << (64 - 52)) - 1);
 		va |= ssize << 8;
-		asm volatile("tlbie %0,0" : : "r" (va) : "memory");
+		sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+		va |= sllp << 5;
+		asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+			     : "memory");
 		break;
 	default:
-		penc = mmu_psize_defs[psize].penc;
-		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		/* We need 14 to 14 + i bits of va */
+		penc = mmu_psize_defs[psize].penc[apsize];
+		va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
 		va |= penc << 12;
 		va |= ssize << 8;
-		asm volatile("tlbie %0,1" : : "r" (va) : "memory");
+		/*
+		 * AVAL bits:
+		 * We don't need all the bits, but rest of the bits
+		 * must be ignored by the processor.
+		 * vpn cover upto 65 bits of va. (0...65) and we need
+		 * 58..64 bits of va.
+		 */
+		va |= (vpn & 0xfe); /* AVAL */
+		va |= 1; /* L */
+		asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+			     : "memory");
 		break;
 	}
 }
 
-static inline void __tlbiel(unsigned long va, int psize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
 {
+	unsigned long va;
 	unsigned int penc;
+	unsigned long sllp;
 
-	/* clear top 16 bits, non SLS segment */
+	/* VPN_SHIFT can be atmost 12 */
+	va = vpn << VPN_SHIFT;
+	/*
+	 * clear top 16 bits of 64 bit va, non SLS segment
+	 * Older versions of the architecture (2.02 and earler) require the
+	 * masking of the top 16 bits.
+	 */
 	va &= ~(0xffffULL << 48);
 
 	switch (psize) {
 	case MMU_PAGE_4K:
-		va &= ~0xffful;
+		/* clear out bits after(52) [0....52.....63] */
+		va &= ~((1ul << (64 - 52)) - 1);
 		va |= ssize << 8;
+		sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+		va |= sllp << 5;
 		asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
 			     : : "r"(va) : "memory");
 		break;
 	default:
-		penc = mmu_psize_defs[psize].penc;
-		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		/* We need 14 to 14 + i bits of va */
+		penc = mmu_psize_defs[psize].penc[apsize];
+		va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
 		va |= penc << 12;
 		va |= ssize << 8;
+		/*
+		 * AVAL bits:
+		 * We don't need all the bits, but rest of the bits
+		 * must be ignored by the processor.
+		 * vpn cover upto 65 bits of va. (0...65) and we need
+		 * 58..64 bits of va.
+		 */
+		va |= (vpn & 0xfe);
+		va |= 1; /* L */
 		asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
 			     : : "r"(va) : "memory");
 		break;
@@ -87,33 +146,34 @@ static inline void __tlbiel(unsigned long va, int psize, int ssize)
 
 }
 
-static inline void tlbie(unsigned long va, int psize, int ssize, int local)
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+			 int ssize, int local)
 {
-	unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL);
-	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+	unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 
 	if (use_local)
 		use_local = mmu_psize_defs[psize].tlbiel;
 	if (lock_tlbie && !use_local)
-		spin_lock(&native_tlbie_lock);
+		raw_spin_lock(&native_tlbie_lock);
 	asm volatile("ptesync": : :"memory");
 	if (use_local) {
-		__tlbiel(va, psize, ssize);
+		__tlbiel(vpn, psize, apsize, ssize);
 		asm volatile("ptesync": : :"memory");
 	} else {
-		__tlbie(va, psize, ssize);
+		__tlbie(vpn, psize, apsize, ssize);
 		asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	}
 	if (lock_tlbie && !use_local)
-		spin_unlock(&native_tlbie_lock);
+		raw_spin_unlock(&native_tlbie_lock);
 }
 
 static inline void native_lock_hpte(struct hash_pte *hptep)
 {
-	unsigned long *word = &hptep->v;
+	unsigned long *word = (unsigned long *)&hptep->v;
 
 	while (1) {
-		if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+		if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
 			break;
 		while(test_bit(HPTE_LOCK_BIT, word))
 			cpu_relax();
@@ -122,31 +182,30 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
 
 static inline void native_unlock_hpte(struct hash_pte *hptep)
 {
-	unsigned long *word = &hptep->v;
+	unsigned long *word = (unsigned long *)&hptep->v;
 
-	asm volatile("lwsync":::"memory");
-	clear_bit(HPTE_LOCK_BIT, word);
+	clear_bit_unlock(HPTE_LOCK_BIT, word);
 }
 
-static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 			unsigned long pa, unsigned long rflags,
-			unsigned long vflags, int psize, int ssize)
+			unsigned long vflags, int psize, int apsize, int ssize)
 {
 	struct hash_pte *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
 	int i;
 
 	if (!(vflags & HPTE_V_BOLTED)) {
-		DBG_LOW("    insert(group=%lx, va=%016lx, pa=%016lx,"
+		DBG_LOW("    insert(group=%lx, vpn=%016lx, pa=%016lx,"
 			" rflags=%lx, vflags=%lx, psize=%d)\n",
-			hpte_group, va, pa, rflags, vflags, psize);
+			hpte_group, vpn, pa, rflags, vflags, psize);
 	}
 
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
-		if (! (hptep->v & HPTE_V_VALID)) {
+		if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
 			/* retry with lock held */
 			native_lock_hpte(hptep);
-			if (! (hptep->v & HPTE_V_VALID))
+			if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
 				break;
 			native_unlock_hpte(hptep);
 		}
@@ -157,22 +216,22 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 	if (i == HPTES_PER_GROUP)
 		return -1;
 
-	hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize) | rflags;
+	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED)) {
 		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
 			i, hpte_v, hpte_r);
 	}
 
-	hptep->r = hpte_r;
+	hptep->r = cpu_to_be64(hpte_r);
 	/* Guarantee the second dword is visible before the valid bit */
 	eieio();
 	/*
 	 * Now set the first dword including the valid bit
 	 * NOTE: this also unlocks the hpte
 	 */
-	hptep->v = hpte_v;
+	hptep->v = cpu_to_be64(hpte_v);
 
 	__asm__ __volatile__ ("ptesync" : : : "memory");
 
@@ -193,12 +252,12 @@ static long native_hpte_remove(unsigned long hpte_group)
 
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
 		hptep = htab_address + hpte_group + slot_offset;
-		hpte_v = hptep->v;
+		hpte_v = be64_to_cpu(hptep->v);
 
 		if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
 			/* retry with lock held */
 			native_lock_hpte(hptep);
-			hpte_v = hptep->v;
+			hpte_v = be64_to_cpu(hptep->v);
 			if ((hpte_v & HPTE_V_VALID)
 			    && !(hpte_v & HPTE_V_BOLTED))
 				break;
@@ -219,41 +278,46 @@ static long native_hpte_remove(unsigned long hpte_group)
 }
 
 static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				 unsigned long va, int psize, int ssize,
-				 int local)
+				 unsigned long vpn, int bpsize,
+				 int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v, want_v;
 	int ret = 0;
 
-	want_v = hpte_encode_v(va, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 
-	DBG_LOW("    update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)",
-		va, want_v & HPTE_V_AVPN, slot, newpp);
+	DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
+		vpn, want_v & HPTE_V_AVPN, slot, newpp);
 
 	native_lock_hpte(hptep);
 
-	hpte_v = hptep->v;
-
-	/* Even if we miss, we need to invalidate the TLB */
+	hpte_v = be64_to_cpu(hptep->v);
+	/*
+	 * We need to invalidate the TLB always because hpte_remove doesn't do
+	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+	 * random entry from it. When we do that we don't invalidate the TLB
+	 * (hpte_remove) because we assume the old translation is still
+	 * technically "valid".
+	 */
 	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
 		DBG_LOW(" -> miss\n");
 		ret = -1;
 	} else {
 		DBG_LOW(" -> hit\n");
 		/* Update the HPTE */
-		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
-			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
+		hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) |
+			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)));
 	}
 	native_unlock_hpte(hptep);
 
 	/* Ensure it is out of the tlb too. */
-	tlbie(va, psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
 
 	return ret;
 }
 
-static long native_hpte_find(unsigned long va, int psize, int ssize)
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 {
 	struct hash_pte *hptep;
 	unsigned long hash;
@@ -261,14 +325,14 @@ static long native_hpte_find(unsigned long va, int psize, int ssize)
 	long slot;
 	unsigned long want_v, hpte_v;
 
-	hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
-	want_v = hpte_encode_v(va, psize, ssize);
+	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
 	/* Bolted mappings are only ever in the primary group */
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
 		hptep = htab_address + slot;
-		hpte_v = hptep->v;
+		hpte_v = be64_to_cpu(hptep->v);
 
 		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
 			/* HPTE matches */
@@ -289,28 +353,32 @@ static long native_hpte_find(unsigned long va, int psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 				       int psize, int ssize)
 {
-	unsigned long vsid, va;
+	unsigned long vpn;
+	unsigned long vsid;
 	long slot;
 	struct hash_pte *hptep;
 
 	vsid = get_kernel_vsid(ea, ssize);
-	va = hpt_va(ea, vsid, ssize);
+	vpn = hpt_vpn(ea, vsid, ssize);
 
-	slot = native_hpte_find(va, psize, ssize);
+	slot = native_hpte_find(vpn, psize, ssize);
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
 
 	/* Update the HPTE */
-	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
-		(newpp & (HPTE_R_PP | HPTE_R_N));
-
-	/* Ensure it is out of the tlb too. */
-	tlbie(va, psize, ssize, 0);
+	hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+			~(HPTE_R_PP | HPTE_R_N)) |
+		(newpp & (HPTE_R_PP | HPTE_R_N)));
+	/*
+	 * Ensure it is out of the tlb too. Bolted entries base and
+	 * actual page size will be same.
+	 */
+	tlbie(vpn, psize, psize, ssize, 0);
 }
 
-static void native_hpte_invalidate(unsigned long slot, unsigned long va,
-				   int psize, int ssize, int local)
+static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
+				   int bpsize, int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v;
@@ -319,13 +387,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
 
 	local_irq_save(flags);
 
-	DBG_LOW("    invalidate(va=%016lx, hash: %x)\n", va, slot);
+	DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
-	want_v = hpte_encode_v(va, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 	native_lock_hpte(hptep);
-	hpte_v = hptep->v;
+	hpte_v = be64_to_cpu(hptep->v);
 
-	/* Even if we miss, we need to invalidate the TLB */
+	/*
+	 * We need to invalidate the TLB always because hpte_remove doesn't do
+	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+	 * random entry from it. When we do that we don't invalidate the TLB
+	 * (hpte_remove) because we assume the old translation is still
+	 * technically "valid".
+	 */
 	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
 	else
@@ -333,73 +407,175 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
 		hptep->v = 0;
 
 	/* Invalidate the TLB */
-	tlbie(va, psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
+
+	local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i;
+	int lock_tlbie;
+	struct hash_pte *hptep;
+	int actual_psize = MMU_PAGE_16M;
+	unsigned int max_hpte_count, valid;
+	unsigned long flags, s_addr = addr;
+	unsigned long hpte_v, want_v, shift;
+	unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+	local_irq_save(flags);
+	for (i = 0; i < max_hpte_count; i++) {
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		hptep = htab_address + slot;
+		want_v = hpte_encode_avpn(vpn, psize, ssize);
+		native_lock_hpte(hptep);
+		hpte_v = be64_to_cpu(hptep->v);
+
+		/* Even if we miss, we need to invalidate the TLB */
+		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+			native_unlock_hpte(hptep);
+		else
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+	}
+	/*
+	 * Since this is a hugepage, we just need a single tlbie.
+	 * use the last vpn.
+	 */
+	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+	if (lock_tlbie)
+		raw_spin_lock(&native_tlbie_lock);
+
+	asm volatile("ptesync":::"memory");
+	__tlbie(vpn, psize, actual_psize, ssize);
+	asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+	if (lock_tlbie)
+		raw_spin_unlock(&native_tlbie_lock);
 
 	local_irq_restore(flags);
 }
 
-#define LP_SHIFT	12
-#define LP_BITS		8
-#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 * .......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
 
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
-			int *psize, int *ssize, unsigned long *va)
+			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
-	unsigned long hpte_r = hpte->r;
-	unsigned long hpte_v = hpte->v;
-	unsigned long avpn;
-	int i, size, shift, penc;
-
-	if (!(hpte_v & HPTE_V_LARGE))
-		size = MMU_PAGE_4K;
-	else {
-		for (i = 0; i < LP_BITS; i++) {
-			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
-				break;
-		}
-		penc = LP_MASK(i+1) >> LP_SHIFT;
+	unsigned long avpn, pteg, vpi;
+	unsigned long hpte_v = be64_to_cpu(hpte->v);
+	unsigned long hpte_r = be64_to_cpu(hpte->r);
+	unsigned long vsid, seg_off;
+	int size, a_size, shift;
+	/* Look at the 8 bit LP value */
+	unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+	if (!(hpte_v & HPTE_V_LARGE)) {
+		size   = MMU_PAGE_4K;
+		a_size = MMU_PAGE_4K;
+	} else {
 		for (size = 0; size < MMU_PAGE_COUNT; size++) {
 
-			/* 4K pages are not represented by LP */
-			if (size == MMU_PAGE_4K)
-				continue;
-
 			/* valid entries have a shift value */
 			if (!mmu_psize_defs[size].shift)
 				continue;
 
-			if (penc == mmu_psize_defs[size].penc)
+			a_size = __hpte_actual_psize(lp, size);
+			if (a_size != -1)
 				break;
 		}
 	}
-
 	/* This works for all page sizes, and for 256M and 1T segments */
+	*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
 	shift = mmu_psize_defs[size].shift;
-	avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23;
 
-	if (shift < 23) {
-		unsigned long vpi, vsid, pteg;
-
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
-		switch (hpte_v >> HPTE_V_SSIZE_SHIFT) {
-		case MMU_SEGSIZE_256M:
-			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
-			break;
-		case MMU_SEGSIZE_1T:
-			vsid = avpn >> 40;
+	avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
+	pteg = slot / HPTES_PER_GROUP;
+	if (hpte_v & HPTE_V_SECONDARY)
+		pteg = ~pteg;
+
+	switch (*ssize) {
+	case MMU_SEGSIZE_256M:
+		/* We only have 28 - 23 bits of seg_off in avpn */
+		seg_off = (avpn & 0x1f) << 23;
+		vsid    =  avpn >> 5;
+		/* We can find more bits from the pteg value */
+		if (shift < 23) {
+			vpi = (vsid ^ pteg) & htab_hash_mask;
+			seg_off |= vpi << shift;
+		}
+		*vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+		break;
+	case MMU_SEGSIZE_1T:
+		/* We only have 40 - 23 bits of seg_off in avpn */
+		seg_off = (avpn & 0x1ffff) << 23;
+		vsid    = avpn >> 17;
+		if (shift < 23) {
 			vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
-			break;
-		default:
-			avpn = vpi = size = 0;
+			seg_off |= vpi << shift;
 		}
-		avpn |= (vpi << mmu_psize_defs[size].shift);
+		*vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+		break;
+	default:
+		*vpn = size = 0;
 	}
-
-	*va = avpn;
-	*psize = size;
-	*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
+	*psize  = size;
+	*apsize = a_size;
 }
 
 /*
@@ -412,11 +588,12 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
  */
 static void native_hpte_clear(void)
 {
+	unsigned long vpn = 0;
 	unsigned long slot, slots, flags;
 	struct hash_pte *hptep = htab_address;
-	unsigned long hpte_v, va;
+	unsigned long hpte_v;
 	unsigned long pteg_count;
-	int psize, ssize;
+	int psize, apsize, ssize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -425,7 +602,7 @@ static void native_hpte_clear(void)
 	/* we take the tlbie lock and hold it.  Some hardware will
 	 * deadlock if we try to tlbie from two processors at once.
 	 */
-	spin_lock(&native_tlbie_lock);
+	raw_spin_lock(&native_tlbie_lock);
 
 	slots = pteg_count * HPTES_PER_GROUP;
 
@@ -435,21 +612,21 @@ static void native_hpte_clear(void)
 		 * running,  right?  and for crash dump, we probably
 		 * don't want to wait for a maybe bad cpu.
 		 */
-		hpte_v = hptep->v;
+		hpte_v = be64_to_cpu(hptep->v);
 
 		/*
 		 * Call __tlbie() here rather than tlbie() since we
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
-			hpte_decode(hptep, slot, &psize, &ssize, &va);
+			hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
 			hptep->v = 0;
-			__tlbie(va, psize, ssize);
+			__tlbie(vpn, psize, apsize, ssize);
 		}
 	}
 
 	asm volatile("eieio; tlbsync; ptesync":::"memory");
-	spin_unlock(&native_tlbie_lock);
+	raw_spin_unlock(&native_tlbie_lock);
 	local_irq_restore(flags);
 }
 
@@ -459,7 +636,8 @@ static void native_hpte_clear(void)
  */
 static void native_flush_hash_range(unsigned long number, int local)
 {
-	unsigned long va, hash, index, hidx, shift, slot;
+	unsigned long vpn;
+	unsigned long hash, index, hidx, shift, slot;
 	struct hash_pte *hptep;
 	unsigned long hpte_v;
 	unsigned long want_v;
@@ -473,20 +651,20 @@ static void native_flush_hash_range(unsigned long number, int local)
 	local_irq_save(flags);
 
 	for (i = 0; i < number; i++) {
-		va = batch->vaddr[i];
+		vpn = batch->vpn[i];
 		pte = batch->pte[i];
 
-		pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
-			hash = hpt_hash(va, shift, ssize);
+		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+			hash = hpt_hash(vpn, shift, ssize);
 			hidx = __rpte_to_hidx(pte, index);
 			if (hidx & _PTEIDX_SECONDARY)
 				hash = ~hash;
 			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot += hidx & _PTEIDX_GROUP_IX;
 			hptep = htab_address + slot;
-			want_v = hpte_encode_v(va, psize, ssize);
+			want_v = hpte_encode_avpn(vpn, psize, ssize);
 			native_lock_hpte(hptep);
-			hpte_v = hptep->v;
+			hpte_v = be64_to_cpu(hptep->v);
 			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
 			    !(hpte_v & HPTE_V_VALID))
 				native_unlock_hpte(hptep);
@@ -495,67 +673,44 @@ static void native_flush_hash_range(unsigned long number, int local)
 		} pte_iterate_hashed_end();
 	}
 
-	if (cpu_has_feature(CPU_FTR_TLBIEL) &&
+	if (mmu_has_feature(MMU_FTR_TLBIEL) &&
 	    mmu_psize_defs[psize].tlbiel && local) {
 		asm volatile("ptesync":::"memory");
 		for (i = 0; i < number; i++) {
-			va = batch->vaddr[i];
+			vpn = batch->vpn[i];
 			pte = batch->pte[i];
 
-			pte_iterate_hashed_subpages(pte, psize, va, index,
-						    shift) {
-				__tlbiel(va, psize, ssize);
+			pte_iterate_hashed_subpages(pte, psize,
+						    vpn, index, shift) {
+				__tlbiel(vpn, psize, psize, ssize);
 			} pte_iterate_hashed_end();
 		}
 		asm volatile("ptesync":::"memory");
 	} else {
-		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 
 		if (lock_tlbie)
-			spin_lock(&native_tlbie_lock);
+			raw_spin_lock(&native_tlbie_lock);
 
 		asm volatile("ptesync":::"memory");
 		for (i = 0; i < number; i++) {
-			va = batch->vaddr[i];
+			vpn = batch->vpn[i];
 			pte = batch->pte[i];
 
-			pte_iterate_hashed_subpages(pte, psize, va, index,
-						    shift) {
-				__tlbie(va, psize, ssize);
+			pte_iterate_hashed_subpages(pte, psize,
+						    vpn, index, shift) {
+				__tlbie(vpn, psize, psize, ssize);
 			} pte_iterate_hashed_end();
 		}
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
 
 		if (lock_tlbie)
-			spin_unlock(&native_tlbie_lock);
+			raw_spin_unlock(&native_tlbie_lock);
 	}
 
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_PPC_PSERIES
-/* Disable TLB batching on nighthawk */
-static inline int tlb_batching_enabled(void)
-{
-	struct device_node *root = of_find_node_by_path("/");
-	int enabled = 1;
-
-	if (root) {
-		const char *model = of_get_property(root, "model", NULL);
-		if (model && !strcmp(model, "IBM,9076-N81"))
-			enabled = 0;
-		of_node_put(root);
-	}
-
-	return enabled;
-}
-#else
-static inline int tlb_batching_enabled(void)
-{
-	return 1;
-}
-#endif
-
 void __init hpte_init_native(void)
 {
 	ppc_md.hpte_invalidate	= native_hpte_invalidate;
@@ -564,6 +719,6 @@ void __init hpte_init_native(void)
 	ppc_md.hpte_insert	= native_hpte_insert;
 	ppc_md.hpte_remove	= native_hpte_remove;
 	ppc_md.hpte_clear_all	= native_hpte_clear;
-	if (tlb_batching_enabled())
-		ppc_md.flush_hash_range = native_flush_hash_range;
+	ppc_md.flush_hash_range = native_flush_hash_range;
+	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a83dfa3cf40..88fdd9d2507 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -27,10 +27,13 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/sysctl.h>
+#include <linux/export.h>
 #include <linux/ctype.h>
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/signal.h>
+#include <linux/memblock.h>
+#include <linux/context_tracking.h>
 
 #include <asm/processor.h>
 #include <asm/pgtable.h>
@@ -38,11 +41,9 @@
 #include <asm/mmu_context.h>
 #include <asm/page.h>
 #include <asm/types.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/machdep.h>
-#include <asm/lmb.h>
-#include <asm/abs_addr.h>
+#include <asm/prom.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 #include <asm/eeh.h>
@@ -52,6 +53,10 @@
 #include <asm/sections.h>
 #include <asm/spu.h>
 #include <asm/udbg.h>
+#include <asm/code-patching.h>
+#include <asm/fadump.h>
+#include <asm/firmware.h>
+#include <asm/tm.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -67,6 +72,7 @@
 
 #define KB (1024)
 #define MB (1024*KB)
+#define GB (1024L*MB)
 
 /*
  * Note:  pte   --> Linux PTE
@@ -90,17 +96,18 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
 int mmu_linear_psize = MMU_PAGE_4K;
 int mmu_virtual_psize = MMU_PAGE_4K;
 int mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
 int mmu_io_psize = MMU_PAGE_4K;
 int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
-#ifdef CONFIG_HUGETLB_PAGE
-int mmu_huge_psize = MMU_PAGE_16M;
-unsigned int HPAGE_SHIFT;
-#endif
+EXPORT_SYMBOL_GPL(mmu_slb_size);
 #ifdef CONFIG_PPC_64K_PAGES
 int mmu_ci_restrictions;
 #endif
@@ -116,11 +123,11 @@ static DEFINE_SPINLOCK(linear_map_hash_lock);
 
 /* Pre-POWER4 CPUs (4k pages only)
  */
-struct mmu_psize_def mmu_psize_defaults_old[] = {
+static struct mmu_psize_def mmu_psize_defaults_old[] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
 		.sllp	= 0,
-		.penc	= 0,
+		.penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
 		.avpnm	= 0,
 		.tlbiel = 0,
 	},
@@ -130,56 +137,100 @@ struct mmu_psize_def mmu_psize_defaults_old[] = {
  *
  * Support for 16Mb large pages
  */
-struct mmu_psize_def mmu_psize_defaults_gp[] = {
+static struct mmu_psize_def mmu_psize_defaults_gp[] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
 		.sllp	= 0,
-		.penc	= 0,
+		.penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
 		.avpnm	= 0,
 		.tlbiel = 1,
 	},
 	[MMU_PAGE_16M] = {
 		.shift	= 24,
 		.sllp	= SLB_VSID_L,
-		.penc	= 0,
+		.penc   = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+			    [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
 		.avpnm	= 0x1UL,
 		.tlbiel = 0,
 	},
 };
 
+static unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+	unsigned long rflags = pteflags & 0x1fa;
+
+	/* _PAGE_EXEC -> NOEXEC */
+	if ((pteflags & _PAGE_EXEC) == 0)
+		rflags |= HPTE_R_N;
+
+	/* PP bits. PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
+					 (pteflags & _PAGE_DIRTY)))
+		rflags |= 1;
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	return rflags | HPTE_R_C | HPTE_R_M;
+}
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
-		      unsigned long pstart, unsigned long mode,
+		      unsigned long pstart, unsigned long prot,
 		      int psize, int ssize)
 {
 	unsigned long vaddr, paddr;
 	unsigned int step, shift;
-	unsigned long tmp_mode;
 	int ret = 0;
 
 	shift = mmu_psize_defs[psize].shift;
 	step = 1 << shift;
 
+	prot = htab_convert_pte_flags(prot);
+
+	DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+	    vstart, vend, pstart, prot, psize, ssize);
+
 	for (vaddr = vstart, paddr = pstart; vaddr < vend;
 	     vaddr += step, paddr += step) {
 		unsigned long hash, hpteg;
 		unsigned long vsid = get_kernel_vsid(vaddr, ssize);
-		unsigned long va = hpt_va(vaddr, vsid, ssize);
+		unsigned long vpn  = hpt_vpn(vaddr, vsid, ssize);
+		unsigned long tprot = prot;
 
-		tmp_mode = mode;
-		
-		/* Make non-kernel text non-executable */
-		if (!in_kernel_text(vaddr))
-			tmp_mode = mode | HPTE_R_N;
+		/*
+		 * If we hit a bad address return error.
+		 */
+		if (!vsid)
+			return -1;
+		/* Make kernel text executable */
+		if (overlaps_kernel_text(vaddr, vaddr + step))
+			tprot &= ~HPTE_R_N;
+
+		/* Make kvm guest trampolines executable */
+		if (overlaps_kvm_tmp(vaddr, vaddr + step))
+			tprot &= ~HPTE_R_N;
+
+		/*
+		 * If relocatable, check if it overlaps interrupt vectors that
+		 * are copied down to real 0. For relocatable kernel
+		 * (e.g. kdump case) we copy interrupt vectors down to real
+		 * address 0. Mark that region as executable. This is
+		 * because on p8 system with relocation on exception feature
+		 * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+		 * in order to execute the interrupt handlers in virtual
+		 * mode the vector region need to be marked as executable.
+		 */
+		if ((PHYSICAL_START > MEMORY_START) &&
+			overlaps_interrupt_vector_text(vaddr, vaddr + step))
+				tprot &= ~HPTE_R_N;
 
-		hash = hpt_hash(va, shift, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
 		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
 
-		DBG("htab_bolt_mapping: calling %p\n", ppc_md.hpte_insert);
-
 		BUG_ON(!ppc_md.hpte_insert);
-		ret = ppc_md.hpte_insert(hpteg, va, paddr,
-				tmp_mode, HPTE_V_BOLTED, psize, ssize);
+		ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
+					 HPTE_V_BOLTED, psize, psize, ssize);
 
 		if (ret < 0)
 			break;
@@ -191,30 +242,52 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 	return ret < 0 ? ret : 0;
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int htab_remove_mapping(unsigned long vstart, unsigned long vend,
+		      int psize, int ssize)
+{
+	unsigned long vaddr;
+	unsigned int step, shift;
+
+	shift = mmu_psize_defs[psize].shift;
+	step = 1 << shift;
+
+	if (!ppc_md.hpte_removebolted) {
+		printk(KERN_WARNING "Platform doesn't implement "
+				"hpte_removebolted\n");
+		return -EINVAL;
+	}
+
+	for (vaddr = vstart; vaddr < vend; vaddr += step)
+		ppc_md.hpte_removebolted(vaddr, psize, ssize);
+
+	return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 					 const char *uname, int depth,
 					 void *data)
 {
-	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-	u32 *prop;
-	unsigned long size = 0;
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	const __be32 *prop;
+	int size = 0;
 
 	/* We are scanning "cpu" nodes only */
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
-	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
-					  &size);
+	prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
 	if (prop == NULL)
 		return 0;
 	for (; size >= 4; size -= 4, ++prop) {
-		if (prop[0] == 40) {
+		if (be32_to_cpu(prop[0]) == 40) {
 			DBG("1T segment support detected\n");
-			cur_cpu_spec->cpu_features |= CPU_FTR_1T_SEGMENT;
+			cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
 			return 1;
 		}
 	}
-	cur_cpu_spec->cpu_features &= ~CPU_FTR_NO_SLBIE_B;
+	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
 	return 0;
 }
 
@@ -223,89 +296,184 @@ static void __init htab_init_seg_sizes(void)
 	of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
 }
 
+static int __init get_idx_from_shift(unsigned int shift)
+{
+	int idx = -1;
+
+	switch (shift) {
+	case 0xc:
+		idx = MMU_PAGE_4K;
+		break;
+	case 0x10:
+		idx = MMU_PAGE_64K;
+		break;
+	case 0x14:
+		idx = MMU_PAGE_1M;
+		break;
+	case 0x18:
+		idx = MMU_PAGE_16M;
+		break;
+	case 0x22:
+		idx = MMU_PAGE_16G;
+		break;
+	}
+	return idx;
+}
+
 static int __init htab_dt_scan_page_sizes(unsigned long node,
 					  const char *uname, int depth,
 					  void *data)
 {
-	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-	u32 *prop;
-	unsigned long size = 0;
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	const __be32 *prop;
+	int size = 0;
 
 	/* We are scanning "cpu" nodes only */
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
-	prop = (u32 *)of_get_flat_dt_prop(node,
-					  "ibm,segment-page-sizes", &size);
+	prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
 	if (prop != NULL) {
-		DBG("Page sizes from device-tree:\n");
+		pr_info("Page sizes from device-tree:\n");
 		size /= 4;
-		cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE);
+		cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
 		while(size > 0) {
-			unsigned int shift = prop[0];
-			unsigned int slbenc = prop[1];
-			unsigned int lpnum = prop[2];
-			unsigned int lpenc = 0;
+			unsigned int base_shift = be32_to_cpu(prop[0]);
+			unsigned int slbenc = be32_to_cpu(prop[1]);
+			unsigned int lpnum = be32_to_cpu(prop[2]);
 			struct mmu_psize_def *def;
-			int idx = -1;
+			int idx, base_idx;
 
 			size -= 3; prop += 3;
-			while(size > 0 && lpnum) {
-				if (prop[0] == shift)
-					lpenc = prop[1];
-				prop += 2; size -= 2;
-				lpnum--;
-			}
-			switch(shift) {
-			case 0xc:
-				idx = MMU_PAGE_4K;
-				break;
-			case 0x10:
-				idx = MMU_PAGE_64K;
-				break;
-			case 0x14:
-				idx = MMU_PAGE_1M;
-				break;
-			case 0x18:
-				idx = MMU_PAGE_16M;
-				cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE;
-				break;
-			case 0x22:
-				idx = MMU_PAGE_16G;
-				break;
-			}
-			if (idx < 0)
+			base_idx = get_idx_from_shift(base_shift);
+			if (base_idx < 0) {
+				/*
+				 * skip the pte encoding also
+				 */
+				prop += lpnum * 2; size -= lpnum * 2;
 				continue;
-			def = &mmu_psize_defs[idx];
-			def->shift = shift;
-			if (shift <= 23)
+			}
+			def = &mmu_psize_defs[base_idx];
+			if (base_idx == MMU_PAGE_16M)
+				cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
+
+			def->shift = base_shift;
+			if (base_shift <= 23)
 				def->avpnm = 0;
 			else
-				def->avpnm = (1 << (shift - 23)) - 1;
+				def->avpnm = (1 << (base_shift - 23)) - 1;
 			def->sllp = slbenc;
-			def->penc = lpenc;
-			/* We don't know for sure what's up with tlbiel, so
+			/*
+			 * We don't know for sure what's up with tlbiel, so
 			 * for now we only set it for 4K and 64K pages
 			 */
-			if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+			if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
 				def->tlbiel = 1;
 			else
 				def->tlbiel = 0;
 
-			DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, "
-			    "tlbiel=%d, penc=%d\n",
-			    idx, shift, def->sllp, def->avpnm, def->tlbiel,
-			    def->penc);
+			while (size > 0 && lpnum) {
+				unsigned int shift = be32_to_cpu(prop[0]);
+				int penc  = be32_to_cpu(prop[1]);
+
+				prop += 2; size -= 2;
+				lpnum--;
+
+				idx = get_idx_from_shift(shift);
+				if (idx < 0)
+					continue;
+
+				if (penc == -1)
+					pr_err("Invalid penc for base_shift=%d "
+					       "shift=%d\n", base_shift, shift);
+
+				def->penc[idx] = penc;
+				pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+					" avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+					base_shift, shift, def->sllp,
+					def->avpnm, def->tlbiel, def->penc[idx]);
+			}
 		}
 		return 1;
 	}
 	return 0;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+					const char *uname, int depth,
+					void *data) {
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	const __be64 *addr_prop;
+	const __be32 *page_count_prop;
+	unsigned int expected_pages;
+	long unsigned int phys_addr;
+	long unsigned int block_size;
+
+	/* We are scanning "memory" nodes only */
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	/* This property is the log base 2 of the number of virtual pages that
+	 * will represent this memory block. */
+	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+	if (page_count_prop == NULL)
+		return 0;
+	expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
+	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+	if (addr_prop == NULL)
+		return 0;
+	phys_addr = be64_to_cpu(addr_prop[0]);
+	block_size = be64_to_cpu(addr_prop[1]);
+	if (block_size != (16 * GB))
+		return 0;
+	printk(KERN_INFO "Huge page(16GB) memory: "
+			"addr = 0x%lX size = 0x%lX pages = %d\n",
+			phys_addr, block_size, expected_pages);
+	if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
+		memblock_reserve(phys_addr, block_size * expected_pages);
+		add_gpage(phys_addr, block_size, expected_pages);
+	}
+	return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static void mmu_psize_set_default_penc(void)
+{
+	int bpsize, apsize;
+	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+		for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+			mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool might_have_hea(void)
+{
+	/*
+	 * The HEA ethernet adapter requires awareness of the
+	 * GX bus. Without that awareness we can easily assume
+	 * we will never see an HEA ethernet device.
+	 */
+#ifdef CONFIG_IBMEBUS
+	return !cpu_has_feature(CPU_FTR_ARCH_207S);
+#else
+	return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
 static void __init htab_init_page_sizes(void)
 {
 	int rc;
 
+	/* se the invalid penc to -1 */
+	mmu_psize_set_default_penc();
+
 	/* Default to 4K pages only */
 	memcpy(mmu_psize_defs, mmu_psize_defaults_old,
 	       sizeof(mmu_psize_defaults_old));
@@ -321,7 +489,7 @@ static void __init htab_init_page_sizes(void)
 	 * Not in the device-tree, let's fallback on known size
 	 * list for 16M capable GP & GR
 	 */
-	if (cpu_has_feature(CPU_FTR_16M_PAGE))
+	if (mmu_has_feature(MMU_FTR_16M_PAGE))
 		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
 		       sizeof(mmu_psize_defaults_gp));
  found:
@@ -351,34 +519,49 @@ static void __init htab_init_page_sizes(void)
 		mmu_vmalloc_psize = MMU_PAGE_64K;
 		if (mmu_linear_psize == MMU_PAGE_4K)
 			mmu_linear_psize = MMU_PAGE_64K;
-		if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) {
+		if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
 			/*
-			 * Don't use 64k pages for ioremap on pSeries, since
-			 * that would stop us accessing the HEA ethernet.
+			 * When running on pSeries using 64k pages for ioremap
+			 * would stop us accessing the HEA ethernet. So if we
+			 * have the chance of ever seeing one, stay at 4k.
 			 */
-			if (!machine_is(pseries))
+			if (!might_have_hea() || !machine_is(pseries))
 				mmu_io_psize = MMU_PAGE_64K;
 		} else
 			mmu_ci_restrictions = 1;
 	}
 #endif /* CONFIG_PPC_64K_PAGES */
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	/* We try to use 16M pages for vmemmap if that is supported
+	 * and we have at least 1G of RAM at boot
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift &&
+	    memblock_phys_mem_size() >= 0x40000000)
+		mmu_vmemmap_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_64K].shift)
+		mmu_vmemmap_psize = MMU_PAGE_64K;
+	else
+		mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
 	printk(KERN_DEBUG "Page orders: linear mapping = %d, "
-	       "virtual = %d, io = %d\n",
+	       "virtual = %d, io = %d"
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	       ", vmemmap = %d"
+#endif
+	       "\n",
 	       mmu_psize_defs[mmu_linear_psize].shift,
 	       mmu_psize_defs[mmu_virtual_psize].shift,
-	       mmu_psize_defs[mmu_io_psize].shift);
+	       mmu_psize_defs[mmu_io_psize].shift
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	       ,mmu_psize_defs[mmu_vmemmap_psize].shift
+#endif
+	       );
 
 #ifdef CONFIG_HUGETLB_PAGE
-	/* Init large page size. Currently, we pick 16M or 1M depending
-	 * on what is available
-	 */
-	if (mmu_psize_defs[MMU_PAGE_16M].shift)
-		set_huge_psize(MMU_PAGE_16M);
-	/* With 4k/4level pagetables, we can't (for now) cope with a
-	 * huge page size < PMD_SIZE */
-	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-		set_huge_psize(MMU_PAGE_1M);
+	/* Reserve 16G huge page memory sections for huge pages */
+	of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
@@ -386,17 +569,17 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
 				       const char *uname, int depth,
 				       void *data)
 {
-	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-	u32 *prop;
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	const __be32 *prop;
 
 	/* We are scanning "cpu" nodes only */
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
-	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+	prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
 	if (prop != NULL) {
 		/* pft_size[0] is the NUMA CEC cookie */
-		ppc64_pft_size = prop[1];
+		ppc64_pft_size = be32_to_cpu(prop[1]);
 		return 1;
 	}
 	return 0;
@@ -404,7 +587,7 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
 
 static unsigned long __init htab_get_table_size(void)
 {
-	unsigned long mem_size, rnd_mem_size, pteg_count;
+	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
 
 	/* If hash size isn't already provided by the platform, we try to
 	 * retrieve it from the device-tree. If it's not there neither, we
@@ -416,70 +599,80 @@ static unsigned long __init htab_get_table_size(void)
 		return 1UL << ppc64_pft_size;
 
 	/* round mem_size up to next power of 2 */
-	mem_size = lmb_phys_mem_size();
+	mem_size = memblock_phys_mem_size();
 	rnd_mem_size = 1UL << __ilog2(mem_size);
 	if (rnd_mem_size < mem_size)
 		rnd_mem_size <<= 1;
 
 	/* # pages / 2 */
-	pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
+	psize = mmu_psize_defs[mmu_virtual_psize].shift;
+	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
 
 	return pteg_count << 7;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-void create_section_mapping(unsigned long start, unsigned long end)
+int create_section_mapping(unsigned long start, unsigned long end)
 {
-		BUG_ON(htab_bolt_mapping(start, end, __pa(start),
-			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX,
-			mmu_linear_psize, mmu_kernel_ssize));
+	return htab_bolt_mapping(start, end, __pa(start),
+				 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+				 mmu_kernel_ssize);
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
-static inline void make_bl(unsigned int *insn_addr, void *func)
+int remove_section_mapping(unsigned long start, unsigned long end)
 {
-	unsigned long funcp = *((unsigned long *)func);
-	int offset = funcp - (unsigned long)insn_addr;
-
-	*insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
-	flush_icache_range((unsigned long)insn_addr, 4+
-			   (unsigned long)insn_addr);
+	return htab_remove_mapping(start, end, mmu_linear_psize,
+			mmu_kernel_ssize);
 }
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+extern u32 htab_call_hpte_insert1[];
+extern u32 htab_call_hpte_insert2[];
+extern u32 htab_call_hpte_remove[];
+extern u32 htab_call_hpte_updatepp[];
+extern u32 ht64_call_hpte_insert1[];
+extern u32 ht64_call_hpte_insert2[];
+extern u32 ht64_call_hpte_remove[];
+extern u32 ht64_call_hpte_updatepp[];
 
 static void __init htab_finish_init(void)
 {
-	extern unsigned int *htab_call_hpte_insert1;
-	extern unsigned int *htab_call_hpte_insert2;
-	extern unsigned int *htab_call_hpte_remove;
-	extern unsigned int *htab_call_hpte_updatepp;
-
 #ifdef CONFIG_PPC_HAS_HASH_64K
-	extern unsigned int *ht64_call_hpte_insert1;
-	extern unsigned int *ht64_call_hpte_insert2;
-	extern unsigned int *ht64_call_hpte_remove;
-	extern unsigned int *ht64_call_hpte_updatepp;
-
-	make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert);
-	make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert);
-	make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove);
-	make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp);
+	patch_branch(ht64_call_hpte_insert1,
+		ppc_function_entry(ppc_md.hpte_insert),
+		BRANCH_SET_LINK);
+	patch_branch(ht64_call_hpte_insert2,
+		ppc_function_entry(ppc_md.hpte_insert),
+		BRANCH_SET_LINK);
+	patch_branch(ht64_call_hpte_remove,
+		ppc_function_entry(ppc_md.hpte_remove),
+		BRANCH_SET_LINK);
+	patch_branch(ht64_call_hpte_updatepp,
+		ppc_function_entry(ppc_md.hpte_updatepp),
+		BRANCH_SET_LINK);
 #endif /* CONFIG_PPC_HAS_HASH_64K */
 
-	make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
-	make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
-	make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
-	make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+	patch_branch(htab_call_hpte_insert1,
+		ppc_function_entry(ppc_md.hpte_insert),
+		BRANCH_SET_LINK);
+	patch_branch(htab_call_hpte_insert2,
+		ppc_function_entry(ppc_md.hpte_insert),
+		BRANCH_SET_LINK);
+	patch_branch(htab_call_hpte_remove,
+		ppc_function_entry(ppc_md.hpte_remove),
+		BRANCH_SET_LINK);
+	patch_branch(htab_call_hpte_updatepp,
+		ppc_function_entry(ppc_md.hpte_updatepp),
+		BRANCH_SET_LINK);
 }
 
-void __init htab_initialize(void)
+static void __init htab_initialize(void)
 {
 	unsigned long table;
 	unsigned long pteg_count;
-	unsigned long mode_rw;
+	unsigned long prot;
 	unsigned long base = 0, size = 0, limit;
-	int i;
-
-	extern unsigned long tce_alloc_start, tce_alloc_end;
+	struct memblock_region *reg;
 
 	DBG(" -> htab_initialize()\n");
 
@@ -489,7 +682,7 @@ void __init htab_initialize(void)
 	/* Initialize page sizes */
 	htab_init_page_sizes();
 
-	if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) {
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
 		mmu_kernel_ssize = MMU_SEGSIZE_1T;
 		mmu_highuser_ssize = MMU_SEGSIZE_1T;
 		printk(KERN_INFO "Using 1TB segments\n");
@@ -508,6 +701,16 @@ void __init htab_initialize(void)
 		/* Using a hypervisor which owns the htab */
 		htab_address = NULL;
 		_SDR1 = 0; 
+#ifdef CONFIG_FA_DUMP
+		/*
+		 * If firmware assisted dump is active firmware preserves
+		 * the contents of htab along with entire partition memory.
+		 * Clear the htab if firmware assisted dump is active so
+		 * that we dont end up using old mappings.
+		 */
+		if (is_fadump_active() && ppc_md.hpte_clear_all)
+			ppc_md.hpte_clear_all();
+#endif
 	} else {
 		/* Find storage for the HPT.  Must be contiguous in
 		 * the absolute address space. On cell we want it to be
@@ -516,14 +719,14 @@ void __init htab_initialize(void)
 		if (machine_is(cell))
 			limit = 0x80000000;
 		else
-			limit = 0;
+			limit = MEMBLOCK_ALLOC_ANYWHERE;
 
-		table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit);
+		table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit);
 
 		DBG("Hash table allocated at %lx, size: %lx\n", table,
 		    htab_size_bytes);
 
-		htab_address = abs_to_virt(table);
+		htab_address = __va(table);
 
 		/* htab absolute addr + encoded htabsize */
 		_SDR1 = table + __ilog2(pteg_count) - 11;
@@ -535,12 +738,12 @@ void __init htab_initialize(void)
 		mtspr(SPRN_SDR1, _SDR1);
 	}
 
-	mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+	prot = pgprot_val(PAGE_KERNEL);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT;
-	linear_map_hash_slots = __va(lmb_alloc_base(linear_map_hash_count,
-						    1, lmb.rmo_size));
+	linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+	linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count,
+						    1, ppc64_rma_size));
 	memset(linear_map_hash_slots, 0, linear_map_hash_count);
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
@@ -550,15 +753,16 @@ void __init htab_initialize(void)
 	 */
 
 	/* create bolted the linear mapping in the hash table */
-	for (i=0; i < lmb.memory.cnt; i++) {
-		base = (unsigned long)__va(lmb.memory.region[i].base);
-		size = lmb.memory.region[i].size;
+	for_each_memblock(memory, reg) {
+		base = (unsigned long)__va(reg->base);
+		size = reg->size;
 
-		DBG("creating mapping for region: %lx : %lx\n", base, size);
+		DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
+		    base, size, prot);
 
 #ifdef CONFIG_U3_DART
 		/* Do not map the DART space. Fortunately, it will be aligned
-		 * in such a way that it will not cross two lmb regions and
+		 * in such a way that it will not cross two memblock regions and
 		 * will fit within a single 16Mb page.
 		 * The DART space is assumed to be a full 16Mb region even if
 		 * we only use 2Mb of that space. We will use more of it later
@@ -571,22 +775,23 @@ void __init htab_initialize(void)
 			unsigned long dart_table_end = dart_tablebase + 16 * MB;
 			if (base != dart_tablebase)
 				BUG_ON(htab_bolt_mapping(base, dart_tablebase,
-							__pa(base), mode_rw,
+							__pa(base), prot,
 							mmu_linear_psize,
 							mmu_kernel_ssize));
 			if ((base + size) > dart_table_end)
 				BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
 							base + size,
 							__pa(dart_table_end),
-							 mode_rw,
+							 prot,
 							 mmu_linear_psize,
 							 mmu_kernel_ssize));
 			continue;
 		}
 #endif /* CONFIG_U3_DART */
 		BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
-				mode_rw, mmu_linear_psize, mmu_kernel_ssize));
-       }
+				prot, mmu_linear_psize, mmu_kernel_ssize));
+	}
+	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
 
 	/*
 	 * If we have a memory_limit and we've allocated TCEs then we need to
@@ -603,7 +808,7 @@ void __init htab_initialize(void)
 			tce_alloc_start = base + size + 1;
 
 		BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
-					 __pa(tce_alloc_start), mode_rw,
+					 __pa(tce_alloc_start), prot,
 					 mmu_linear_psize, mmu_kernel_ssize));
 	}
 
@@ -614,11 +819,41 @@ void __init htab_initialize(void)
 #undef KB
 #undef MB
 
-void htab_initialize_secondary(void)
+void __init early_init_mmu(void)
 {
+	/* Setup initial STAB address in the PACA */
+	get_paca()->stab_real = __pa((u64)&initial_stab);
+	get_paca()->stab_addr = (u64)&initial_stab;
+
+	/* Initialize the MMU Hash table and create the linear mapping
+	 * of memory. Has to be done before stab/slb initialization as
+	 * this is currently where the page size encoding is obtained
+	 */
+	htab_initialize();
+
+	/* Initialize stab / SLB management */
+	if (mmu_has_feature(MMU_FTR_SLB))
+		slb_initialize();
+	else
+		stab_initialize(get_paca()->stab_real);
+}
+
+#ifdef CONFIG_SMP
+void early_init_mmu_secondary(void)
+{
+	/* Initialize hash table for that CPU */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		mtspr(SPRN_SDR1, _SDR1);
+
+	/* Initialize STAB/SLB. We use a virtual address as it works
+	 * in real mode on pSeries.
+	 */
+	if (mmu_has_feature(MMU_FTR_SLB))
+		slb_initialize();
+	else
+		stab_initialize(get_paca()->stab_addr);
 }
+#endif /* CONFIG_SMP */
 
 /*
  * Called by asm hashtable.S for doing lazy icache flush
@@ -635,7 +870,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 	/* page is dirty */
 	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 		if (trap == 0x400) {
-			__flush_dcache_icache(page_address(page));
+			flush_dcache_icache_page(page);
 			set_bit(PG_arch_1, &page->flags);
 		} else
 			pp |= HPTE_R_N;
@@ -643,6 +878,31 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 	return pp;
 }
 
+#ifdef CONFIG_PPC_MM_SLICES
+unsigned int get_paca_psize(unsigned long addr)
+{
+	u64 lpsizes;
+	unsigned char *hpsizes;
+	unsigned long index, mask_index;
+
+	if (addr < SLICE_LOW_TOP) {
+		lpsizes = get_paca()->context.low_slices_psize;
+		index = GET_LOW_SLICE_INDEX(addr);
+		return (lpsizes >> (index * 4)) & 0xF;
+	}
+	hpsizes = get_paca()->context.high_slices_psize;
+	index = GET_HIGH_SLICE_INDEX(addr);
+	mask_index = index & 0x1;
+	return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+	return get_paca()->context.user_psize;
+}
+#endif
+
 /*
  * Demote a segment to using 4k pages.
  * For now this makes the whole process use 4k pages.
@@ -650,13 +910,13 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 #ifdef CONFIG_PPC_64K_PAGES
 void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 {
-	if (mm->context.user_psize == MMU_PAGE_4K)
+	if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
 		return;
-	slice_set_user_psize(mm, MMU_PAGE_4K);
+	slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
 #ifdef CONFIG_SPU_BASE
 	spu_flush_all_slbs(mm);
 #endif
-	if (get_paca()->context.user_psize != MMU_PAGE_4K) {
+	if (get_paca_psize(addr) != MMU_PAGE_4K) {
 		get_paca()->context = mm->context;
 		slb_flush_and_rebolt();
 	}
@@ -671,15 +931,15 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
  * Result is 0: full permissions, _PAGE_RW: read-only,
  * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
  */
-static int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
-	struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 spp = 0;
 	u32 **sbpm, *sbpp;
 
 	if (ea >= spt->maxaddr)
 		return 0;
-	if (ea < 0x100000000) {
+	if (ea < 0x100000000UL) {
 		/* addresses below 4GB use spt->low_prot */
 		sbpm = spt->low_prot;
 	} else {
@@ -701,12 +961,40 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
 }
 
 #else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
 	return 0;
 }
 #endif
 
+void hash_failure_debug(unsigned long ea, unsigned long access,
+			unsigned long vsid, unsigned long trap,
+			int ssize, int psize, int lpsize, unsigned long pte)
+{
+	if (!printk_ratelimit())
+		return;
+	pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
+		ea, access, current->comm);
+	pr_info("    trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+		trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+			     int psize, bool user_region)
+{
+	if (user_region) {
+		if (psize != get_paca_psize(ea)) {
+			get_paca()->context = mm->context;
+			slb_flush_and_rebolt();
+		}
+	} else if (get_paca()->vmalloc_sllp !=
+		   mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+		get_paca()->vmalloc_sllp =
+			mmu_psize_defs[mmu_vmalloc_psize].sllp;
+		slb_vmalloc_update();
+	}
+}
+
 /* Result code is:
  *  0 - handled
  *  1 - normal page fault
@@ -715,22 +1003,19 @@ static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
  */
 int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 {
-	void *pgdir;
+	enum ctx_state prev_state = exception_enter();
+	pgd_t *pgdir;
 	unsigned long vsid;
 	struct mm_struct *mm;
 	pte_t *ptep;
-	cpumask_t tmp;
+	unsigned hugeshift;
+	const struct cpumask *tmp;
 	int rc, user_region = 0, local = 0;
 	int psize, ssize;
 
 	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
 		ea, access, trap);
 
-	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
-		DBG_LOW(" out of pgtable range !\n");
- 		return 1;
-	}
-
 	/* Get region & vsid */
  	switch (REGION_ID(ea)) {
 	case USER_REGION_ID:
@@ -738,13 +1023,10 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		mm = current->mm;
 		if (! mm) {
 			DBG_LOW(" user region with no mm !\n");
-			return 1;
+			rc = 1;
+			goto bail;
 		}
-#ifdef CONFIG_PPC_MM_SLICES
 		psize = get_slice_psize(mm, ea);
-#else
-		psize = mm->context.user_psize;
-#endif
 		ssize = user_segment_size(ea);
 		vsid = get_vsid(mm->context.id, ea, ssize);
 		break;
@@ -761,62 +1043,94 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		/* Not a valid range
 		 * Send the problem up to do_page_fault 
 		 */
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
 
+	/* Bad address. */
+	if (!vsid) {
+		DBG_LOW("Bad address!\n");
+		rc = 1;
+		goto bail;
+	}
 	/* Get pgdir */
 	pgdir = mm->pgd;
-	if (pgdir == NULL)
-		return 1;
+	if (pgdir == NULL) {
+		rc = 1;
+		goto bail;
+	}
 
 	/* Check CPU locality */
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+	tmp = cpumask_of(smp_processor_id());
+	if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
 		local = 1;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* Handle hugepage regions */
-	if (HPAGE_SHIFT && psize == mmu_huge_psize) {
-		DBG_LOW(" -> huge page !\n");
-		return hash_huge_page(mm, access, ea, vsid, local, trap);
-	}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 #ifndef CONFIG_PPC_64K_PAGES
-	/* If we use 4K pages and our psize is not 4K, then we are hitting
-	 * a special driver mapping, we need to align the address before
-	 * we fetch the PTE
+	/* If we use 4K pages and our psize is not 4K, then we might
+	 * be hitting a special driver mapping, and need to align the
+	 * address before we fetch the PTE.
+	 *
+	 * It could also be a hugepage mapping, in which case this is
+	 * not necessary, but it's not harmful, either.
 	 */
 	if (psize != MMU_PAGE_4K)
 		ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = find_linux_pte(pgdir, ea);
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 
-#ifndef CONFIG_PPC_64K_PAGES
-	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
-#else
-	DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
-		pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
+	/* Add _PAGE_PRESENT to the required access perm */
+	access |= _PAGE_PRESENT;
+
 	/* Pre-check access permissions (will be re-checked atomically
 	 * in __hash_page_XX but this pre-check is a fast path
 	 */
 	if (access & ~pte_val(*ptep)) {
 		DBG_LOW(" no access !\n");
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 
+	if (hugeshift) {
+		if (pmd_trans_huge(*(pmd_t *)ptep))
+			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+					     trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+		else
+			rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+					      local, ssize, hugeshift, psize);
+#else
+		else {
+			/*
+			 * if we have hugeshift, and is not transhuge with
+			 * hugetlb disabled, something is really wrong.
+			 */
+			rc = 1;
+			WARN_ON(1);
+		}
+#endif
+		check_paca_psize(ea, mm, psize, user_region);
+
+		goto bail;
+	}
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
 	/* Do actual hashing */
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
-	if (pte_val(*ptep) & _PAGE_4K_PFN) {
+	if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
 		demote_segment_4k(mm, ea);
 		psize = MMU_PAGE_4K;
 	}
@@ -844,17 +1158,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 #endif
 		}
 	}
-	if (user_region) {
-		if (psize != get_paca()->context.user_psize) {
-			get_paca()->context = mm->context;
-			slb_flush_and_rebolt();
-		}
-	} else if (get_paca()->vmalloc_sllp !=
-		   mmu_psize_defs[mmu_vmalloc_psize].sllp) {
-		get_paca()->vmalloc_sllp =
-			mmu_psize_defs[mmu_vmalloc_psize].sllp;
-		slb_vmalloc_update();
-	}
+
+	check_paca_psize(ea, mm, psize, user_region);
 #endif /* CONFIG_PPC_64K_PAGES */
 
 #ifdef CONFIG_PPC_HAS_HASH_64K
@@ -863,7 +1168,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	else
 #endif /* CONFIG_PPC_HAS_HASH_64K */
 	{
-		int spp = subpage_protection(pgdir, ea);
+		int spp = subpage_protection(mm, ea);
 		if (access & spp)
 			rc = -2;
 		else
@@ -871,6 +1176,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 					    local, ssize, spp);
 	}
 
+	/* Dump some info in case of hash insertion failure, they should
+	 * never happen so it is really useful to know if/when they do
+	 */
+	if (rc == -1)
+		hash_failure_debug(ea, access, vsid, trap, ssize, psize,
+				   psize, pte_val(*ptep));
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
 #else
@@ -878,6 +1189,9 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		pte_val(*(ptep + PTRS_PER_PTE)));
 #endif
 	DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+	exception_exit(prev_state);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(hash_page);
@@ -885,13 +1199,12 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
 {
+	int hugepage_shift;
 	unsigned long vsid;
-	void *pgdir;
+	pgd_t *pgdir;
 	pte_t *ptep;
-	cpumask_t mask;
 	unsigned long flags;
-	int local = 0;
-	int ssize;
+	int rc, ssize, local = 0;
 
 	BUG_ON(REGION_ID(ea) != USER_REGION_ID);
 
@@ -908,10 +1221,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
-	ptep = find_linux_pte(pgdir, ea);
-	if (!ptep)
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+	if (!vsid)
 		return;
+	/*
+	 * Hash doesn't like irqs. Walking linux page table with irq disabled
+	 * saves us from holding multiple locks.
+	 */
+	local_irq_save(flags);
 
+	/*
+	 * THP pages use update_mmu_cache_pmd. We don't do
+	 * hash preload there. Hence can ignore THP here
+	 */
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+	if (!ptep)
+		goto out_exit;
+
+	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
 	 * a 64K kernel), then we don't preload, hash_page() will take
@@ -920,52 +1250,73 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * page size demotion here
 	 */
 	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-		return;
+		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-	/* Get VSID */
-	ssize = user_segment_size(ea);
-	vsid = get_vsid(mm->context.id, ea, ssize);
-
-	/* Hash doesn't like irqs */
-	local_irq_save(flags);
-
 	/* Is that local to this CPU ? */
-	mask = cpumask_of_cpu(smp_processor_id());
-	if (cpus_equal(mm->cpu_vm_mask, mask))
+	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 		local = 1;
 
 	/* Hash it in */
 #ifdef CONFIG_PPC_HAS_HASH_64K
 	if (mm->context.user_psize == MMU_PAGE_64K)
-		__hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
+		rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
 	else
 #endif /* CONFIG_PPC_HAS_HASH_64K */
-		__hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
-			       subpage_protection(pgdir, ea));
+		rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
+				    subpage_protection(mm, ea));
 
+	/* Dump some info in case of hash insertion failure, they should
+	 * never happen so it is really useful to know if/when they do
+	 */
+	if (rc == -1)
+		hash_failure_debug(ea, access, vsid, trap, ssize,
+				   mm->context.user_psize,
+				   mm->context.user_psize,
+				   pte_val(*ptep));
+out_exit:
 	local_irq_restore(flags);
 }
 
 /* WARNING: This is called from hash_low_64.S, if you change this prototype,
  *          do not forget to update the assembly call site !
  */
-void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 		     int local)
 {
 	unsigned long hash, index, shift, hidx, slot;
 
-	DBG_LOW("flush_hash_page(va=%016x)\n", va);
-	pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
-		hash = hpt_hash(va, shift, ssize);
+	DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+	pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+		hash = hpt_hash(vpn, shift, ssize);
 		hidx = __rpte_to_hidx(pte, index);
 		if (hidx & _PTEIDX_SECONDARY)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += hidx & _PTEIDX_GROUP_IX;
-		DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx);
-		ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
+		DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
+		/*
+		 * We use same base page size and actual psize, because we don't
+		 * use these functions for hugepage
+		 */
+		ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
 	} pte_iterate_hashed_end();
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Transactions are not aborted by tlbiel, only tlbie.
+	 * Without, syncing a page back to a block device w/ PIO could pick up
+	 * transactional data (bad!) so we force an abort here.  Before the
+	 * sync the page will be made read-only, which will flush_hash_page.
+	 * BIG ISSUE here: if the kernel uses a page from userspace without
+	 * unmapping it first, it may see the speculated version.
+	 */
+	if (local && cpu_has_feature(CPU_FTR_TM) &&
+	    current->thread.regs &&
+	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
+		tm_enable();
+		tm_abort(TM_CAUSE_TLBI);
+	}
+#endif
 }
 
 void flush_hash_range(unsigned long number, int local)
@@ -978,7 +1329,7 @@ void flush_hash_range(unsigned long number, int local)
 			&__get_cpu_var(ppc64_tlb_batch);
 
 		for (i = 0; i < number; i++)
-			flush_hash_page(batch->vaddr[i], batch->pte[i],
+			flush_hash_page(batch->vpn[i], batch->pte[i],
 					batch->psize, batch->ssize, local);
 	}
 }
@@ -989,6 +1340,8 @@ void flush_hash_range(unsigned long number, int local)
  */
 void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	if (user_mode(regs)) {
 #ifdef CONFIG_PPC_SUBPAGE_PROT
 		if (rc == -2)
@@ -998,24 +1351,64 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
 			_exception(SIGBUS, regs, BUS_ADRERR, address);
 	} else
 		bad_page_fault(regs, address, SIGBUS);
+
+	exception_exit(prev_state);
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+			   unsigned long pa, unsigned long rflags,
+			   unsigned long vflags, int psize, int ssize)
+{
+	unsigned long hpte_group;
+	long slot;
+
+repeat:
+	hpte_group = ((hash & htab_hash_mask) *
+		       HPTES_PER_GROUP) & ~0x7UL;
+
+	/* Insert into the hash table, primary slot */
+	slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+				  psize, psize, ssize);
+
+	/* Primary is full, try the secondary */
+	if (unlikely(slot == -1)) {
+		hpte_group = ((~hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
+					  vflags | HPTE_V_SECONDARY,
+					  psize, psize, ssize);
+		if (slot == -1) {
+			if (mftb() & 0x1)
+				hpte_group = ((hash & htab_hash_mask) *
+					      HPTES_PER_GROUP)&~0x7UL;
+
+			ppc_md.hpte_remove(hpte_group);
+			goto repeat;
+		}
+	}
+
+	return slot;
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
 {
-	unsigned long hash, hpteg;
+	unsigned long hash;
 	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-	unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
-	unsigned long mode = _PAGE_ACCESSED | _PAGE_DIRTY |
-		_PAGE_COHERENT | PP_RWXX | HPTE_R_N;
-	int ret;
+	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
+	long ret;
+
+	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+	/* Don't create HPTE entries for bad address */
+	if (!vsid)
+		return;
 
-	hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
-	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+	ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+				    HPTE_V_BOLTED,
+				    mmu_linear_psize, mmu_kernel_ssize);
 
-	ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr),
-				 mode, HPTE_V_BOLTED,
-				 mmu_linear_psize, mmu_kernel_ssize);
 	BUG_ON (ret < 0);
 	spin_lock(&linear_map_hash_lock);
 	BUG_ON(linear_map_hash_slots[lmi] & 0x80);
@@ -1027,9 +1420,9 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
 {
 	unsigned long hash, hidx, slot;
 	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-	unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
+	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
 
-	hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
+	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
 	spin_lock(&linear_map_hash_lock);
 	BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
 	hidx = linear_map_hash_slots[lmi] & 0x7f;
@@ -1039,7 +1432,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
 		hash = ~hash;
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	slot += hidx & _PTEIDX_GROUP_IX;
-	ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0);
+	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+			       mmu_kernel_ssize, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1061,3 +1455,23 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	local_irq_restore(flags);
 }
 #endif /* CONFIG_DEBUG_PAGEALLOC */
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
+{
+	/* We don't currently support the first MEMBLOCK not mapping 0
+	 * physical on those processors
+	 */
+	BUG_ON(first_memblock_base != 0);
+
+	/* On LPAR systems, the first entry is our RMA region,
+	 * non-LPAR 64-bit hash MMU systems don't have a limitation
+	 * on real mode access, but using the first entry works well
+	 * enough. We also clamp it to 1G to avoid some funky things
+	 * such as RTAS bugs etc...
+	 */
+	ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+
+	/* Finally limit subsequent allocations */
+	memblock_set_current_limit(ppc64_rma_size);
+}
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
new file mode 100644
index 00000000000..e7450bdbe83
--- /dev/null
+++ b/arch/powerpc/mm/highmem.c
@@ -0,0 +1,86 @@
+/*
+ * highmem.c: virtual kernel memory mappings for high memory
+ *
+ * PowerPC version, stolen from the i386 version.
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ *		      Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terrabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Reworked for PowerPC by various contributors. Moved from
+ * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp.
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+/*
+ * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
+ * gives a more generic (and caching) interface. But kmap_atomic can
+ * be used in IRQ contexts, so in some (very limited) cases we need
+ * it.
+ */
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+	unsigned long vaddr;
+	int idx, type;
+
+	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	pagefault_disable();
+	if (!PageHighMem(page))
+		return page_address(page);
+
+	type = kmap_atomic_idx_push();
+	idx = type + KM_TYPE_NR*smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+#ifdef CONFIG_DEBUG_HIGHMEM
+	BUG_ON(!pte_none(*(kmap_pte-idx)));
+#endif
+	__set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
+	local_flush_tlb_page(NULL, vaddr);
+
+	return (void*) vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void __kunmap_atomic(void *kvaddr)
+{
+	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+	int type;
+
+	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
+		pagefault_enable();
+		return;
+	}
+
+	type = kmap_atomic_idx();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+	{
+		unsigned int idx;
+
+		idx = type + KM_TYPE_NR * smp_processor_id();
+		BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+
+		/*
+		 * force other mappings to Oops if they'll try to access
+		 * this pte without first remap it
+		 */
+		pte_clear(&init_mm, vaddr, kmap_pte-idx);
+		local_flush_tlb_page(NULL, vaddr);
+	}
+#endif
+
+	kmap_atomic_idx_pop();
+	pagefault_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 00000000000..826893fcb3a
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+		    pmd_t *pmdp, unsigned long trap, int local, int ssize,
+		    unsigned int psize)
+{
+	unsigned int index, valid;
+	unsigned char *hpte_slot_array;
+	unsigned long rflags, pa, hidx;
+	unsigned long old_pmd, new_pmd;
+	int ret, lpsize = MMU_PAGE_16M;
+	unsigned long vpn, hash, shift, slot;
+
+	/*
+	 * atomically mark the linux large page PMD busy and dirty
+	 */
+	do {
+		old_pmd = pmd_val(*pmdp);
+		/* If PMD busy, retry the access */
+		if (unlikely(old_pmd & _PAGE_BUSY))
+			return 0;
+		/* If PMD is trans splitting retry the access */
+		if (unlikely(old_pmd & _PAGE_SPLITTING))
+			return 0;
+		/* If PMD permissions don't match, take page fault */
+		if (unlikely(access & ~old_pmd))
+			return 1;
+		/*
+		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access
+		 */
+		new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_RW)
+			new_pmd |= _PAGE_DIRTY;
+	} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+					  old_pmd, new_pmd));
+	/*
+	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	rflags = new_pmd & _PAGE_USER;
+	if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
+					   (new_pmd & _PAGE_DIRTY)))
+		rflags |= 0x1;
+	/*
+	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+	 */
+	rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+		/*
+		 * No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case
+		 */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+	}
+#endif
+	/*
+	 * Find the slot index details for this ea, using base page size.
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	index = (ea & ~HPAGE_PMD_MASK) >> shift;
+	BUG_ON(index >= 4096);
+
+	vpn = hpt_vpn(ea, vsid, ssize);
+	hash = hpt_hash(vpn, shift, ssize);
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+
+	valid = hpte_valid(hpte_slot_array, index);
+	if (valid) {
+		/* update the hpte bits */
+		hidx =  hpte_hash_index(hpte_slot_array, index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+					   psize, lpsize, ssize, local);
+		/*
+		 * We failed to update, try to insert a new entry.
+		 */
+		if (ret == -1) {
+			/*
+			 * large pte is marked busy, so we can be sure
+			 * nobody is looking at hpte_slot_array. hence we can
+			 * safely update this here.
+			 */
+			valid = 0;
+			new_pmd &= ~_PAGE_HPTEFLAGS;
+			hpte_slot_array[index] = 0;
+		} else
+			/* clear the busy bits and set the hash pte bits */
+			new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+	}
+
+	if (!valid) {
+		unsigned long hpte_group;
+
+		/* insert new entry */
+		pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+		/* clear the busy bits and set the hash pte bits */
+		new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+
+		/* Add in WIMG bits */
+		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				      _PAGE_GUARDED));
+		/*
+		 * enable the memory coherence always
+		 */
+		rflags |= HPTE_R_M;
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+					  psize, lpsize, ssize);
+		/*
+		 * Primary is full, try the secondary
+		 */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+						  rflags, HPTE_V_SECONDARY,
+						  psize, lpsize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP) & ~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+			}
+		}
+		/*
+		 * Hypervisor failure. Restore old pmd and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*pmdp = __pmd(old_pmd);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   psize, lpsize, old_pmd);
+			return -1;
+		}
+		/*
+		 * large pte is marked busy, so we can be sure
+		 * nobody is looking at hpte_slot_array. hence we can
+		 * safely update this here.
+		 */
+		mark_hpte_slot_valid(hpte_slot_array, index, slot);
+	}
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
new file mode 100644
index 00000000000..5e4ee257390
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -0,0 +1,153 @@
+/*
+ * PPC Huge TLB Page Support for Book3E MMU
+ *
+ * Copyright (C) 2009 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC64
+static inline int tlb1_next(void)
+{
+	struct paca_struct *paca = get_paca();
+	struct tlb_core_data *tcd;
+	int this, next;
+
+	tcd = paca->tcd_ptr;
+	this = tcd->esel_next;
+
+	next = this + 1;
+	if (next >= tcd->esel_max)
+		next = tcd->esel_first;
+
+	tcd->esel_next = next;
+	return this;
+}
+#else
+static inline int tlb1_next(void)
+{
+	int index, ncams;
+
+	ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	index = __get_cpu_var(next_tlbcam_idx);
+
+	/* Just round-robin the entries and wrap when we hit the end */
+	if (unlikely(index == ncams - 1))
+		__get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+	else
+		__get_cpu_var(next_tlbcam_idx)++;
+
+	return index;
+}
+#endif /* !PPC64 */
+#endif /* FSL */
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+	int found = 0;
+
+	mtspr(SPRN_MAS6, pid << 16);
+	if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+		asm volatile(
+			"li	%0,0\n"
+			"tlbsx.	0,%1\n"
+			"bne	1f\n"
+			"li	%0,1\n"
+			"1:\n"
+			: "=&r"(found) : "r"(ea));
+	} else {
+		asm volatile(
+			"tlbsx	0,%1\n"
+			"mfspr	%0,0x271\n"
+			"srwi	%0,%0,31\n"
+			: "=&r"(found) : "r"(ea));
+	}
+
+	return found;
+}
+
+void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
+			    pte_t pte)
+{
+	unsigned long mas1, mas2;
+	u64 mas7_3;
+	unsigned long psize, tsize, shift;
+	unsigned long flags;
+	struct mm_struct *mm;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	int index;
+#endif
+
+	if (unlikely(is_kernel_addr(ea)))
+		return;
+
+	mm = vma->vm_mm;
+
+#ifdef CONFIG_PPC_MM_SLICES
+	psize = get_slice_psize(mm, ea);
+	tsize = mmu_get_tsize(psize);
+	shift = mmu_psize_defs[psize].shift;
+#else
+	psize = vma_mmu_pagesize(vma);
+	shift = __ilog2(psize);
+	tsize = shift - 10;
+#endif
+
+	/*
+	 * We can't be interrupted while we're setting up the MAS
+	 * regusters or after we've confirmed that no tlb exists.
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+		local_irq_restore(flags);
+		return;
+	}
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	/* We have to use the CAM(TLB1) on FSL parts for hugepages */
+	index = tlb1_next();
+	mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
+#endif
+
+	mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
+	mas2 = ea & ~((1UL << shift) - 1);
+	mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+	mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
+	mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
+	if (!pte_dirty(pte))
+		mas7_3 &= ~(MAS3_SW|MAS3_UW);
+
+	mtspr(SPRN_MAS1, mas1);
+	mtspr(SPRN_MAS2, mas2);
+
+	if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
+		mtspr(SPRN_MAS7_MAS3, mas7_3);
+	} else {
+		if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+			mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+		mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
+	}
+
+	asm volatile ("tlbwe");
+
+	local_irq_restore(flags);
+}
+
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	struct hstate *hstate = hstate_file(vma->vm_file);
+	unsigned long tsize = huge_page_shift(hstate) - 10;
+
+	__flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);
+}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 00000000000..a5bcf930119
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,129 @@
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+				  unsigned long pa, unsigned long rlags,
+				  unsigned long vflags, int psize, int ssize);
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize)
+{
+	unsigned long vpn;
+	unsigned long old_pte, new_pte;
+	unsigned long rflags, pa, sz;
+	long slot;
+
+	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+	/* Search the Linux page table for a match with va */
+	vpn = hpt_vpn(ea, vsid, ssize);
+
+	/* At this point, we have a pte (old_pte) which can be used to build
+	 * or update an HPTE. There are 2 cases:
+	 *
+	 * 1. There is a valid (present) pte with no associated HPTE (this is
+	 *	the most common case)
+	 * 2. There is a valid (present) pte with an associated HPTE. The
+	 *	current values of the pp bits in the HPTE prevent access
+	 *	because we are doing software DIRTY bit management and the
+	 *	page is currently not DIRTY.
+	 */
+
+
+	do {
+		old_pte = pte_val(*ptep);
+		/* If PTE busy, retry the access */
+		if (unlikely(old_pte & _PAGE_BUSY))
+			return 0;
+		/* If PTE permissions don't match, take page fault */
+		if (unlikely(access & ~old_pte))
+			return 1;
+		/* Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access */
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_RW)
+			new_pte |= _PAGE_DIRTY;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
+	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	sz = ((1UL) << shift);
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/* No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+	/* Check if pte already has an hpte (case 2) */
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+		/* There MIGHT be an HPTE for this pte */
+		unsigned long hash, slot;
+
+		hash = hpt_hash(vpn, shift, ssize);
+		if (old_pte & _PAGE_F_SECOND)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
+					 mmu_psize, ssize, local) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(vpn, shift, ssize);
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+		/* clear HPTE slot informations in new PTE */
+#ifdef CONFIG_PPC_64K_PAGES
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
+#else
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+#endif
+		/* Add in WIMG bits */
+		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				      _PAGE_COHERENT | _PAGE_GUARDED));
+		/*
+		 * enable the memory coherence always
+		 */
+		rflags |= HPTE_R_M;
+
+		slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+					     mmu_psize, ssize);
+
+		/*
+		 * Hypervisor failure. Restore old pte and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*ptep = __pte(old_pte);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   mmu_psize, mmu_psize, old_pte);
+			return -1;
+		}
+
+		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+	}
+
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a02266dad21..7e70ae968e5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,175 +1,529 @@
 /*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
  *
  * Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
  *
  * Based on the IA-32 version:
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
  */
 
-#include <linux/init.h>
-#include <linux/fs.h>
 #include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
+#include <linux/io.h>
 #include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/export.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/moduleparam.h>
+#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/spu.h>
+#include <asm/setup.h>
+#include <asm/hugetlb.h>
 
-#define HPAGE_SHIFT_64K	16
-#define HPAGE_SHIFT_16M	24
+#ifdef CONFIG_HUGETLB_PAGE
 
-#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define PAGE_SHIFT_64K	16
+#define PAGE_SHIFT_16M	24
+#define PAGE_SHIFT_16G	34
 
-unsigned int hugepte_shift;
-#define PTRS_PER_HUGEPTE	(1 << hugepte_shift)
-#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << hugepte_shift)
+unsigned int HPAGE_SHIFT;
 
-#define HUGEPD_SHIFT		(HPAGE_SHIFT + hugepte_shift)
-#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
-#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  On non-Freescale implementations, this is
+ * just used to track 16G pages and so is a single array.  FSL-based
+ * implementations may have more than one gpage size, so we need multiple
+ * arrays
+ */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define MAX_NUMBER_GPAGES	128
+struct psize_gpages {
+	u64 gpage_list[MAX_NUMBER_GPAGES];
+	unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#else
+#define MAX_NUMBER_GPAGES	1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;
+#endif
 
-#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
+#define hugepd_none(hpd)	((hpd).pd == 0)
 
-/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
-#define HUGEPD_OK	0x1
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
 
-typedef struct { unsigned long pd; } hugepd_t;
+/*
+ * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ */
+int pmd_huge(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pmd_val(pmd) & 0x3) != 0x0);
+}
 
-#define hugepd_none(hpd)	((hpd).pd == 0)
+int pud_huge(pud_t pud)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pud_val(pud) & 0x3) != 0x0);
+}
+
+int pgd_huge(pgd_t pgd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#else
+int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
+int pud_huge(pud_t pud)
 {
-	BUG_ON(!(hpd.pd & HUGEPD_OK));
-	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+	return 0;
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+int pgd_huge(pgd_t pgd)
 {
-	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
-	pte_t *dir = hugepd_page(*hpdp);
+	return 0;
+}
+#endif
 
-	return dir + idx;
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	/* Only called for hugetlbfs pages, hence can ignore THP */
+	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-			   unsigned long address)
+			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
-				      GFP_KERNEL|__GFP_REPEAT);
+	struct kmem_cache *cachep;
+	pte_t *new;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	int i;
+	int num_hugepd = 1 << (pshift - pdshift);
+	cachep = hugepte_cache;
+#else
+	cachep = PGT_CACHE(pdshift - pshift);
+#endif
+
+	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
+
+	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
 
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	/*
+	 * We have multiple higher-level entries that point to the same
+	 * actual pte location.  Fill in each as we go and backtrack on error.
+	 * We need all of these so the DTLB pgtable walk code can find the
+	 * right higher-level entry without knowing if it's a hugepage or not.
+	 */
+	for (i = 0; i < num_hugepd; i++, hpdp++) {
+		if (unlikely(!hugepd_none(*hpdp)))
+			break;
+		else
+			/* We use the old format for PPC_FSL_BOOK3E */
+			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+	}
+	/* If we bailed from the for loop early, an error occurred, clean up */
+	if (i < num_hugepd) {
+		for (i = i - 1 ; i >= 0; i--, hpdp--)
+			hpdp->pd = 0;
+		kmem_cache_free(cachep, new);
+	}
+#else
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(huge_pgtable_cache, new);
-	else
-		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+		kmem_cache_free(cachep, new);
+	else {
+#ifdef CONFIG_PPC_BOOK3S_64
+		hpdp->pd = (unsigned long)new |
+			    (shift_to_mmu_psize(pshift) << 2);
+#else
+		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#endif
+	}
+#endif
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 
-/* Base page size affects how we walk hugetlb page tables */
-#ifdef CONFIG_PPC_64K_PAGES
-#define hpmd_offset(pud, addr)		pmd_offset(pud, addr)
-#define hpmd_alloc(mm, pud, addr)	pmd_alloc(mm, pud, addr)
+/*
+ * These macros define how to determine which level of the page table holds
+ * the hpdp.
+ */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
+#define HUGEPD_PUD_SHIFT PUD_SHIFT
 #else
-static inline
-pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
-{
-	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
-		return pmd_offset(pud, addr);
-	else
-		return (pmd_t *) pud;
-}
-static inline
-pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
-{
-	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
-		return pmd_alloc(mm, pud, addr);
-	else
-		return (pmd_t *) pud;
-}
+#define HUGEPD_PGD_SHIFT PUD_SHIFT
+#define HUGEPD_PUD_SHIFT PMD_SHIFT
 #endif
 
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
 	pud_t *pu;
 	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pshift = __ffs(sz);
+	unsigned pdshift = PGDIR_SHIFT;
 
-	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
-
-	addr &= HPAGE_MASK;
-
+	addr &= ~(sz-1);
 	pg = pgd_offset(mm, addr);
-	if (!pgd_none(*pg)) {
-		pu = pud_offset(pg, addr);
-		if (!pud_none(*pu)) {
-			pm = hpmd_offset(pu, addr);
-			if (!pmd_none(*pm))
-				return hugepte_offset((hugepd_t *)pm, addr);
+
+	if (pshift == PGDIR_SHIFT)
+		/* 16GB huge page */
+		return (pte_t *) pg;
+	else if (pshift > PUD_SHIFT)
+		/*
+		 * We need to use hugepd table
+		 */
+		hpdp = (hugepd_t *)pg;
+	else {
+		pdshift = PUD_SHIFT;
+		pu = pud_alloc(mm, pg, addr);
+		if (pshift == PUD_SHIFT)
+			return (pte_t *)pu;
+		else if (pshift > PMD_SHIFT)
+			hpdp = (hugepd_t *)pu;
+		else {
+			pdshift = PMD_SHIFT;
+			pm = pmd_alloc(mm, pu, addr);
+			if (pshift == PMD_SHIFT)
+				/* 16MB hugepage */
+				return (pte_t *)pm;
+			else
+				hpdp = (hugepd_t *)pm;
 		}
 	}
+	if (!hpdp)
+		return NULL;
 
-	return NULL;
+	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr, pdshift);
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+#else
+
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
 	pud_t *pu;
 	pmd_t *pm;
 	hugepd_t *hpdp = NULL;
+	unsigned pshift = __ffs(sz);
+	unsigned pdshift = PGDIR_SHIFT;
 
-	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
-
-	addr &= HPAGE_MASK;
+	addr &= ~(sz-1);
 
 	pg = pgd_offset(mm, addr);
-	pu = pud_alloc(mm, pg, addr);
 
-	if (pu) {
-		pm = hpmd_alloc(mm, pu, addr);
-		if (pm)
+	if (pshift >= HUGEPD_PGD_SHIFT) {
+		hpdp = (hugepd_t *)pg;
+	} else {
+		pdshift = PUD_SHIFT;
+		pu = pud_alloc(mm, pg, addr);
+		if (pshift >= HUGEPD_PUD_SHIFT) {
+			hpdp = (hugepd_t *)pu;
+		} else {
+			pdshift = PMD_SHIFT;
+			pm = pmd_alloc(mm, pu, addr);
 			hpdp = (hugepd_t *)pm;
+		}
 	}
 
-	if (! hpdp)
+	if (!hpdp)
 		return NULL;
 
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
 		return NULL;
 
-	return hugepte_offset(hpdp, addr);
+	return hugepte_offset(hpdp, addr, pdshift);
+}
+#endif
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+	int i;
+
+	if (addr == 0)
+		return;
+
+	gpage_freearray[idx].nr_gpages = number_of_pages;
+
+	for (i = 0; i < number_of_pages; i++) {
+		gpage_freearray[idx].gpage_list[i] = addr;
+		addr += page_size;
+	}
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
+	int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+	if (nr_gpages == 0)
+		return 0;
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If gpages can be in highmem we can't use the trick of storing the
+	 * data structure in the page; allocate space for this
+	 */
+	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+	list_add(&m->list, &huge_boot_pages);
+	gpage_freearray[idx].nr_gpages = nr_gpages;
+	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+	m->hstate = hstate;
+
+	return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val,
+				       const char *unused)
+{
+	static phys_addr_t size;
+	unsigned long npages;
+
+	/*
+	 * The hugepagesz and hugepages cmdline options are interleaved.  We
+	 * use the size variable to keep track of whether or not this was done
+	 * properly and skip over instances where it is incorrect.  Other
+	 * command-line parsing code will issue warnings, so we don't need to.
+	 *
+	 */
+	if ((strcmp(param, "default_hugepagesz") == 0) ||
+	    (strcmp(param, "hugepagesz") == 0)) {
+		size = memparse(val, NULL);
+	} else if (strcmp(param, "hugepages") == 0) {
+		if (size != 0) {
+			if (sscanf(val, "%lu", &npages) <= 0)
+				npages = 0;
+			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+			size = 0;
+		}
+	}
+	return 0;
 }
 
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle.  We want to allocate these in highmem because
+ * the amount of lowmem is limited.  This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
+ * allocate to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+	static __initdata char cmdline[COMMAND_LINE_SIZE];
+	phys_addr_t size, base;
+	int i;
+
+	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
+			&do_gpage_early_setup);
+
+	/*
+	 * Walk gpage list in reverse, allocating larger page sizes first.
+	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+	 * When we reach the point in the list where pages are no longer
+	 * considered gpages, we're done.
+	 */
+	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+			continue;
+		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+			break;
+
+		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+		base = memblock_alloc_base(size * gpage_npages[i], size,
+					   MEMBLOCK_ALLOC_ANYWHERE);
+		add_gpage(base, size, gpage_npages[i]);
+	}
+}
+
+#else /* !PPC_FSL_BOOK3E */
+
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+	if (!addr)
+		return;
+	while (number_of_pages > 0) {
+		gpage_freearray[nr_gpages] = addr;
+		nr_gpages++;
+		number_of_pages--;
+		addr += page_size;
+	}
+}
+
+/* Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	if (nr_gpages == 0)
+		return 0;
+	m = phys_to_virt(gpage_freearray[--nr_gpages]);
+	gpage_freearray[nr_gpages] = 0;
+	list_add(&m->list, &huge_boot_pages);
+	m->hstate = hstate;
+	return 1;
+}
+#endif
+
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define HUGEPD_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+	struct rcu_head	rcu;
+	unsigned int index;
+	void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+	struct hugepd_freelist *batch =
+		container_of(head, struct hugepd_freelist, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+	struct hugepd_freelist **batchp;
+
+	batchp = &get_cpu_var(hugepd_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpumask_equal(mm_cpumask(tlb->mm),
+			  cpumask_of(smp_processor_id()))) {
+		kmem_cache_free(hugepte_cache, hugepte);
+        put_cpu_var(hugepd_freelist_cur);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+		(*batchp)->index = 0;
+	}
+
+	(*batchp)->ptes[(*batchp)->index++] = hugepte;
+	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+		*batchp = NULL;
+	}
+	put_cpu_var(hugepd_freelist_cur);
+}
+#endif
+
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+			      unsigned long start, unsigned long end,
+			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
+	int i;
+
+	unsigned long pdmask = ~((1UL << pdshift) - 1);
+	unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	/* Note: On fsl the hpdp may be the first of several */
+	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#else
+	unsigned int shift = hugepd_shift(*hpdp);
+#endif
+
+	start &= pdmask;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= pdmask;
+		if (! ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	for (i = 0; i < num_hugepd; i++, hpdp++)
+		hpdp->pd = 0;
 
-	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
-						 PGF_CACHENUM_MASK));
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	hugepd_free(tlb, hugepte);
+#else
+	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#endif
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -181,13 +535,29 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	unsigned long start;
 
 	start = addr;
-	pmd = pmd_offset(pud, addr);
 	do {
+		pmd = pmd_offset(pud, addr);
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd))
+		if (!is_hugepd(pmd)) {
+			/*
+			 * if it is not hugepd pointer, we should already find
+			 * it cleared.
+			 */
+			WARN_ON(!pmd_none_or_clear_bad(pmd));
 			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pmd);
-	} while (pmd++, addr = next, addr != end);
+		}
+#ifdef CONFIG_PPC_FSL_BOOK3E
+		/*
+		 * Increment next by the size of the huge mapping since
+		 * there may be more than one entry at this level for a
+		 * single hugepage, but all of them point to
+		 * the same kmem cache that holds the hugepte.
+		 */
+		next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
+#endif
+		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+				  addr, next, floor, ceiling);
+	} while (addr = next, addr != end);
 
 	start &= PUD_MASK;
 	if (start < floor)
@@ -202,7 +572,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd);
+	pmd_free_tlb(tlb, pmd, start);
 }
 
 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -214,25 +584,28 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	unsigned long start;
 
 	start = addr;
-	pud = pud_offset(pgd, addr);
 	do {
+		pud = pud_offset(pgd, addr);
 		next = pud_addr_end(addr, end);
-#ifdef CONFIG_PPC_64K_PAGES
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
-#else
-		if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
+		if (!is_hugepd(pud)) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
-			hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+					       ceiling);
 		} else {
-			if (pud_none(*pud))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pud);
-		}
+#ifdef CONFIG_PPC_FSL_BOOK3E
+			/*
+			 * Increment next by the size of the huge mapping since
+			 * there may be more than one entry at this level for a
+			 * single hugepage, but all of them point to
+			 * the same kmem cache that holds the hugepte.
+			 */
+			next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
 #endif
-	} while (pud++, addr = next, addr != end);
+			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+					  addr, next, floor, ceiling);
+		}
+	} while (addr = next, addr != end);
 
 	start &= PGDIR_MASK;
 	if (start < floor)
@@ -247,103 +620,57 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
-	pud_free_tlb(tlb, pud);
+	pud_free_tlb(tlb, pud, start);
 }
 
 /*
  * This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
  */
-void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			    unsigned long addr, unsigned long end,
 			    unsigned long floor, unsigned long ceiling)
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start;
 
 	/*
-	 * Comments below take from the normal free_pgd_range().  They
-	 * apply here too.  The tests against HUGEPD_MASK below are
-	 * essential, because we *don't* test for this at the bottom
-	 * level.  Without them we'll attempt to free a hugepte table
-	 * when we unmap just part of it, even if there are other
-	 * active mappings using it.
-	 *
-	 * The next few lines have given us lots of grief...
+	 * Because there are a number of different possible pagetable
+	 * layouts for hugepage ranges, we limit knowledge of how
+	 * things should be laid out to the allocation path
+	 * (huge_pte_alloc(), above).  Everything else works out the
+	 * structure as it goes from information in the hugepd
+	 * pointers.  That means that we can't here use the
+	 * optimization used in the normal page free_pgd_range(), of
+	 * checking whether we're actually covering a large enough
+	 * range to have to do anything at the top level of the walk
+	 * instead of at the bottom.
 	 *
-	 * Why are we testing HUGEPD* at this top level?  Because
-	 * often there will be no work to do at all, and we'd prefer
-	 * not to go all the way down to the bottom just to discover
-	 * that.
-	 *
-	 * Why all these "- 1"s?  Because 0 represents both the bottom
-	 * of the address space and the top of it (using -1 for the
-	 * top wouldn't help much: the masks would do the wrong thing).
-	 * The rule is that addr 0 and floor 0 refer to the bottom of
-	 * the address space, but end 0 and ceiling 0 refer to the top
-	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
-	 * that end 0 case should be mythical).
-	 *
-	 * Wherever addr is brought up or ceiling brought down, we
-	 * must be careful to reject "the opposite 0" before it
-	 * confuses the subsequent tests.  But what about where end is
-	 * brought down by HUGEPD_SIZE below? no, end can't go down to
-	 * 0 there.
-	 *
-	 * Whereas we round start (addr) and ceiling down, by different
-	 * masks at different levels, in order to test whether a table
-	 * now has no other vmas using it, so can be freed, we don't
-	 * bother to round floor or end up - the tests don't need that.
+	 * To make sense of this, you should probably go read the big
+	 * block comment at the top of the normal free_pgd_range(),
+	 * too.
 	 */
 
-	addr &= HUGEPD_MASK;
-	if (addr < floor) {
-		addr += HUGEPD_SIZE;
-		if (!addr)
-			return;
-	}
-	if (ceiling) {
-		ceiling &= HUGEPD_MASK;
-		if (!ceiling)
-			return;
-	}
-	if (end - 1 > ceiling - 1)
-		end -= HUGEPD_SIZE;
-	if (addr > end - 1)
-		return;
-
-	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
 	do {
-		BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
 		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
-	} while (pgd++, addr = next, addr != end);
-}
-
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pte)
-{
-	if (pte_present(*ptep)) {
-		/* We open-code pte_clear because we need to pass the right
-		 * argument to hpte_need_flush (huge / !huge). Might not be
-		 * necessary anymore if we make hpte_need_flush() get the
-		 * page size from the slices
-		 */
-		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
-	}
-	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-}
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep)
-{
-	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
-	return __pte(old);
+		pgd = pgd_offset(tlb->mm, addr);
+		if (!is_hugepd(pgd)) {
+			if (pgd_none_or_clear_bad(pgd))
+				continue;
+			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+		} else {
+#ifdef CONFIG_PPC_FSL_BOOK3E
+			/*
+			 * Increment next by the size of the huge mapping since
+			 * there may be more than one entry at the pgd level
+			 * for a single hugepage, but all of them point to the
+			 * same kmem cache that holds the hugepte.
+			 */
+			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
+			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+					  addr, next, floor, ceiling);
+		}
+	} while (addr = next, addr != end);
 }
 
 struct page *
@@ -351,23 +678,26 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
 	struct page *page;
+	unsigned shift;
+	unsigned long mask;
+	/*
+	 * Transparent hugepages are handled by generic code. We can skip them
+	 * here.
+	 */
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
-	if (get_slice_psize(mm, address) != mmu_huge_psize)
+	/* Verify it is a huge page else bail. */
+	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
 		return ERR_PTR(-EINVAL);
 
-	ptep = huge_pte_offset(mm, address);
+	mask = (1UL << shift) - 1;
 	page = pte_page(*ptep);
 	if (page)
-		page += (address % HPAGE_SIZE) / PAGE_SIZE;
+		page += (address & mask) / PAGE_SIZE;
 
 	return page;
 }
 
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
@@ -376,244 +706,384 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+				      unsigned long sz)
+{
+	unsigned long __boundary = (addr + sz) & ~(sz-1);
+	return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+	       unsigned long addr, unsigned long end,
+	       int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(*hugepd);
+	unsigned long next;
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		next = hugepte_addr_end(addr, end, sz);
+		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+			return 0;
+	} while (ptep++, addr = next, addr != end);
+
+	return 1;
+}
 
+#ifdef CONFIG_PPC_MM_SLICES
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
-	return slice_get_unmapped_area(addr, len, flags,
-				       mmu_huge_psize, 1, 0);
+	struct hstate *hstate = hstate_file(file);
+	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+
+	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
 }
+#endif
 
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-						  pte_t pte, int trap)
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 {
-	struct page *page;
-	int i;
+#ifdef CONFIG_PPC_MM_SLICES
+	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 
-	if (!pfn_valid(pte_pfn(pte)))
-		return rflags;
+	return 1UL << mmu_psize_to_shift(psize);
+#else
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
 
-	page = pte_page(pte);
+	return huge_page_size(hstate_vma(vma));
+#endif
+}
 
-	/* page is dirty */
-	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-		if (trap == 0x400) {
-			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
-				__flush_dcache_icache(page_address(page+i));
-			set_bit(PG_arch_1, &page->flags);
-		} else {
-			rflags |= HPTE_R_N;
-		}
-	}
-	return rflags;
+static inline bool is_power_of_4(unsigned long x)
+{
+	if (is_power_of_2(x))
+		return (__ilog2(x) % 2) ? false : true;
+	return false;
 }
 
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local,
-		   unsigned long trap)
+static int __init add_huge_page_size(unsigned long long size)
 {
-	pte_t *ptep;
-	unsigned long old_pte, new_pte;
-	unsigned long va, rflags, pa;
-	long slot;
-	int err = 1;
-	int ssize = user_segment_size(ea);
+	int shift = __ffs(size);
+	int mmu_psize;
 
-	ptep = huge_pte_offset(mm, ea);
+	/* Check that it is a page size supported by the hardware and
+	 * that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if ((size < PAGE_SIZE) || !is_power_of_4(size))
+		return -EINVAL;
+#else
+	if (!is_power_of_2(size)
+	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+		return -EINVAL;
+#endif
 
-	/* Search the Linux page table for a match with va */
-	va = hpt_va(ea, vsid, ssize);
+	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+		return -EINVAL;
 
-	/*
-	 * If no pte found or not present, send the problem up to
-	 * do_page_fault
+#ifdef CONFIG_SPU_FS_64K_LS
+	/* Disable support for 64K huge pages when 64K SPU local store
+	 * support is enabled as the current implementation conflicts.
 	 */
-	if (unlikely(!ptep || pte_none(*ptep)))
-		goto out;
+	if (shift == PAGE_SHIFT_64K)
+		return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
 
-	/* 
-	 * Check the user's access rights to the page.  If access should be
-	 * prevented then send the problem up to do_page_fault.
-	 */
-	if (unlikely(access & ~pte_val(*ptep)))
-		goto out;
-	/*
-	 * At this point, we have a pte (old_pte) which can be used to build
-	 * or update an HPTE. There are 2 cases:
-	 *
-	 * 1. There is a valid (present) pte with no associated HPTE (this is 
-	 *	the most common case)
-	 * 2. There is a valid (present) pte with an associated HPTE. The
-	 *	current values of the pp bits in the HPTE prevent access
-	 *	because we are doing software DIRTY bit management and the
-	 *	page is currently not DIRTY. 
-	 */
+	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
+	/* Return if huge page size has already been setup */
+	if (size_to_hstate(size))
+		return 0;
 
-	do {
-		old_pte = pte_val(*ptep);
-		if (old_pte & _PAGE_BUSY)
-			goto out;
-		new_pte = old_pte | _PAGE_BUSY |
-			_PAGE_ACCESSED | _PAGE_HASHPTE;
-	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					 old_pte, new_pte));
-
-	rflags = 0x2 | (!(new_pte & _PAGE_RW));
- 	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-		/* No CPU has hugepages but lacks no execute, so we
-		 * don't need to worry about that case */
-		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap);
-
-	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(old_pte & _PAGE_HASHPTE)) {
-		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot;
-
-		hash = hpt_hash(va, HPAGE_SHIFT, ssize);
-		if (old_pte & _PAGE_F_SECOND)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> 12;
-
-		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
-					 ssize, local) == -1)
-			old_pte &= ~_PAGE_HPTEFLAGS;
-	}
+	hugetlb_add_hstate(shift - PAGE_SHIFT);
 
-	if (likely(!(old_pte & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
-		unsigned long hpte_group;
-
-		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-repeat:
-		hpte_group = ((hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
-		/* clear HPTE slot informations in new PTE */
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-
-		/* Add in WIMG bits */
-		/* XXX We should store these in the pte */
-		/* --BenH: I think they are ... */
-		rflags |= _PAGE_COHERENT;
-
-		/* Insert into the hash table, primary slot */
-		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-					  mmu_huge_psize, ssize);
-
-		/* Primary is full, try the secondary */
-		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL; 
-			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
-						  HPTE_V_SECONDARY,
-						  mmu_huge_psize, ssize);
-			if (slot == -1) {
-				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP)&~0x7UL;
-
-				ppc_md.hpte_remove(hpte_group);
-				goto repeat;
-                        }
-		}
+	return 0;
+}
+
+static int __init hugepage_setup_sz(char *str)
+{
+	unsigned long long size;
+
+	size = memparse(str, &str);
+
+	if (add_huge_page_size(size) != 0)
+		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+
+	return 1;
+}
+__setup("hugepagesz=", hugepage_setup_sz);
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
 
-		if (unlikely(slot == -2))
-			panic("hash_huge_page: pte_insert failed\n");
+		if (!mmu_psize_defs[psize].shift)
+			continue;
 
-		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+		shift = mmu_psize_to_shift(psize);
+
+		/* Don't treat normal page sizes as huge... */
+		if (shift != PAGE_SHIFT)
+			if (add_huge_page_size(1ULL << shift) < 0)
+				continue;
 	}
 
 	/*
-	 * No need to use ldarx/stdcx here
+	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
+	 * size information encoded in them, so align them to allow this
 	 */
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
+					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
+	if (hugepte_cache == NULL)
+		panic("%s: Unable to create kmem cache for hugeptes\n",
+		      __func__);
+
+	/* Default hpage size = 4M */
+	if (mmu_psize_defs[MMU_PAGE_4M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+	else
+		panic("%s: Unable to set default huge page size\n", __func__);
 
-	err = 0;
 
- out:
-	return err;
+	return 0;
 }
-
-void set_huge_psize(int psize)
-{
-	/* Check that it is a page size supported by the hardware and
-	 * that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
-		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-			mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
-		HPAGE_SHIFT = mmu_psize_defs[psize].shift;
-		mmu_huge_psize = psize;
-#ifdef CONFIG_PPC_64K_PAGES
-		hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
 #else
-		if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
-			hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
+static int __init hugetlbpage_init(void)
+{
+	int psize;
+
+	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+		return -ENODEV;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+		unsigned pdshift;
+
+		if (!mmu_psize_defs[psize].shift)
+			continue;
+
+		shift = mmu_psize_to_shift(psize);
+
+		if (add_huge_page_size(1ULL << shift) < 0)
+			continue;
+
+		if (shift < PMD_SHIFT)
+			pdshift = PMD_SHIFT;
+		else if (shift < PUD_SHIFT)
+			pdshift = PUD_SHIFT;
 		else
-			hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
-#endif
+			pdshift = PGDIR_SHIFT;
+		/*
+		 * if we have pdshift and shift value same, we don't
+		 * use pgt cache for hugepd.
+		 */
+		if (pdshift != shift) {
+			pgtable_cache_add(pdshift - shift, NULL);
+			if (!PGT_CACHE(pdshift - shift))
+				panic("hugetlbpage_init(): could not create "
+				      "pgtable cache for %d bit pagesize\n", shift);
+		}
+	}
 
-	} else
-		HPAGE_SHIFT = 0;
+	/* Set default large page size. Currently, we pick 16M or 1M
+	 * depending on what is available
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+
+	return 0;
 }
+#endif
+module_init(hugetlbpage_init);
 
-static int __init hugepage_setup_sz(char *str)
+void flush_dcache_icache_hugepage(struct page *page)
 {
-	unsigned long long size;
-	int mmu_psize = -1;
-	int shift;
+	int i;
+	void *start;
 
-	size = memparse(str, &str);
+	BUG_ON(!PageCompound(page));
 
-	shift = __ffs(size);
-	switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-	case HPAGE_SHIFT_64K:
-		mmu_psize = MMU_PAGE_64K;
-		break;
-#endif
-	case HPAGE_SHIFT_16M:
-		mmu_psize = MMU_PAGE_16M;
-		break;
+	for (i = 0; i < (1UL << compound_order(page)); i++) {
+		if (!PageHighMem(page)) {
+			__flush_dcache_icache(page_address(page+i));
+		} else {
+			start = kmap_atomic(page+i);
+			__flush_dcache_icache(start);
+			kunmap_atomic(start);
+		}
 	}
+}
 
-	if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
-		set_huge_psize(mmu_psize);
-	else
-		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+#endif /* CONFIG_HUGETLB_PAGE */
 
-	return 1;
-}
-__setup("hugepagesz=", hugepage_setup_sz);
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the the page and take a ref on it.
+ */
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
-	memset(addr, 0, kmem_cache_size(cache));
+	pgd_t pgd, *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t *ret_pte;
+	hugepd_t *hpdp = NULL;
+	unsigned pdshift = PGDIR_SHIFT;
+
+	if (shift)
+		*shift = 0;
+
+	pgdp = pgdir + pgd_index(ea);
+	pgd  = ACCESS_ONCE(*pgdp);
+	/*
+	 * Always operate on the local stack value. This make sure the
+	 * value don't get updated by a parallel THP split/collapse,
+	 * page fault or a page unmap. The return pte_t * is still not
+	 * stable. So should be checked there for above conditions.
+	 */
+	if (pgd_none(pgd))
+		return NULL;
+	else if (pgd_huge(pgd)) {
+		ret_pte = (pte_t *) pgdp;
+		goto out;
+	} else if (is_hugepd(&pgd))
+		hpdp = (hugepd_t *)&pgd;
+	else {
+		/*
+		 * Even if we end up with an unmap, the pgtable will not
+		 * be freed, because we do an rcu free and here we are
+		 * irq disabled
+		 */
+		pdshift = PUD_SHIFT;
+		pudp = pud_offset(&pgd, ea);
+		pud  = ACCESS_ONCE(*pudp);
+
+		if (pud_none(pud))
+			return NULL;
+		else if (pud_huge(pud)) {
+			ret_pte = (pte_t *) pudp;
+			goto out;
+		} else if (is_hugepd(&pud))
+			hpdp = (hugepd_t *)&pud;
+		else {
+			pdshift = PMD_SHIFT;
+			pmdp = pmd_offset(&pud, ea);
+			pmd  = ACCESS_ONCE(*pmdp);
+			/*
+			 * A hugepage collapse is captured by pmd_none, because
+			 * it mark the pmd none and do a hpte invalidate.
+			 *
+			 * A hugepage split is captured by pmd_trans_splitting
+			 * because we mark the pmd trans splitting and do a
+			 * hpte invalidate
+			 *
+			 */
+			if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+				return NULL;
+
+			if (pmd_huge(pmd) || pmd_large(pmd)) {
+				ret_pte = (pte_t *) pmdp;
+				goto out;
+			} else if (is_hugepd(&pmd))
+				hpdp = (hugepd_t *)&pmd;
+			else
+				return pte_offset_kernel(&pmd, ea);
+		}
+	}
+	if (!hpdp)
+		return NULL;
+
+	ret_pte = hugepte_offset(hpdp, ea, pdshift);
+	pdshift = hugepd_shift(*hpdp);
+out:
+	if (shift)
+		*shift = pdshift;
+	return ret_pte;
 }
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
 
-static int __init hugetlbpage_init(void)
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
 {
-	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
-		return -ENODEV;
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page, *tail;
+	pte_t pte;
+	int refs;
 
-	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
-					       HUGEPTE_TABLE_SIZE,
-					       HUGEPTE_TABLE_SIZE,
-					       0,
-					       zero_ctor);
-	if (! huge_pgtable_cache)
-		panic("hugetlbpage_init(): could not create hugepte cache\n");
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
 
-	return 0;
-}
+	pte = ACCESS_ONCE(*ptep);
+	mask = _PAGE_PRESENT | _PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
 
-module_init(hugetlbpage_init);
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/*
+	 * check for splitting here
+	 */
+	if (pmd_trans_splitting(pte_pmd(pte)))
+		return 0;
+#endif
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+
+	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	/*
+	 * Any tail page need their mapcount reference taken before we
+	 * return.
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
new file mode 100644
index 00000000000..915412e4d5b
--- /dev/null
+++ b/arch/powerpc/mm/icswx.c
@@ -0,0 +1,292 @@
+/*
+ *  ICSWX and ACOP Management
+ *
+ *  Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include "icswx.h"
+
+/*
+ * The processor and its L2 cache cause the icswx instruction to
+ * generate a COP_REQ transaction on PowerBus. The transaction has no
+ * address, and the processor does not perform an MMU access to
+ * authenticate the transaction. The command portion of the PowerBus
+ * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor
+ * Process ID (PID), which the coprocessor compares to the authorized
+ * LPID and PID held in the coprocessor, to determine if the process
+ * is authorized to generate the transaction.  The data of the COP_REQ
+ * transaction is 128-byte or less in size and is placed in cacheable
+ * memory on a 128-byte cache line boundary.
+ *
+ * The task to use a coprocessor should use use_cop() to mark the use
+ * of the Coprocessor Type (CT) and context switching. On a server
+ * class processor, the PID register is used only for coprocessor
+ * management + * and so a coprocessor PID is allocated before
+ * executing icswx + * instruction. Drop_cop() is used to free the
+ * coprocessor PID.
+ *
+ * Example:
+ * Host Fabric Interface (HFI) is a PowerPC network coprocessor.
+ * Each HFI have multiple windows. Each HFI window serves as a
+ * network device sending to and receiving from HFI network.
+ * HFI immediate send function uses icswx instruction. The immediate
+ * send function allows small (single cache-line) packets be sent
+ * without using the regular HFI send FIFO and doorbell, which are
+ * much slower than immediate send.
+ *
+ * For each task intending to use HFI immediate send, the HFI driver
+ * calls use_cop() to obtain a coprocessor PID for the task.
+ * The HFI driver then allocate a free HFI window and save the
+ * coprocessor PID to the HFI window to allow the task to use the
+ * HFI window.
+ *
+ * The HFI driver repeatedly creates immediate send packets and
+ * issues icswx instruction to send data through the HFI window.
+ * The HFI compares the coprocessor PID in the CPU PID register
+ * to the PID held in the HFI window to determine if the transaction
+ * is allowed.
+ *
+ * When the task to release the HFI window, the HFI driver calls
+ * drop_cop() to release the coprocessor PID.
+ */
+
+void switch_cop(struct mm_struct *next)
+{
+#ifdef CONFIG_PPC_ICSWX_PID
+	mtspr(SPRN_PID, next->context.cop_pid);
+#endif
+	mtspr(SPRN_ACOP, next->context.acop);
+}
+
+/**
+ * Start using a coprocessor.
+ * @acop: mask of coprocessor to be used.
+ * @mm: The mm the coprocessor to associate with. Most likely current mm.
+ *
+ * Return a positive PID if successful. Negative errno otherwise.
+ * The returned PID will be fed to the coprocessor to determine if an
+ * icswx transaction is authenticated.
+ */
+int use_cop(unsigned long acop, struct mm_struct *mm)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_ICSWX))
+		return -ENODEV;
+
+	if (!mm || !acop)
+		return -EINVAL;
+
+	/* The page_table_lock ensures mm_users won't change under us */
+	spin_lock(&mm->page_table_lock);
+	spin_lock(mm->context.cop_lockp);
+
+	ret = get_cop_pid(mm);
+	if (ret < 0)
+		goto out;
+
+	/* update acop */
+	mm->context.acop |= acop;
+
+	sync_cop(mm);
+
+	/*
+	 * If this is a threaded process then there might be other threads
+	 * running. We need to send an IPI to force them to pick up any
+	 * change in PID and ACOP.
+	 */
+	if (atomic_read(&mm->mm_users) > 1)
+		smp_call_function(sync_cop, mm, 1);
+
+out:
+	spin_unlock(mm->context.cop_lockp);
+	spin_unlock(&mm->page_table_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(use_cop);
+
+/**
+ * Stop using a coprocessor.
+ * @acop: mask of coprocessor to be stopped.
+ * @mm: The mm the coprocessor associated with.
+ */
+void drop_cop(unsigned long acop, struct mm_struct *mm)
+{
+	int free_pid;
+
+	if (!cpu_has_feature(CPU_FTR_ICSWX))
+		return;
+
+	if (WARN_ON_ONCE(!mm))
+		return;
+
+	/* The page_table_lock ensures mm_users won't change under us */
+	spin_lock(&mm->page_table_lock);
+	spin_lock(mm->context.cop_lockp);
+
+	mm->context.acop &= ~acop;
+
+	free_pid = disable_cop_pid(mm);
+	sync_cop(mm);
+
+	/*
+	 * If this is a threaded process then there might be other threads
+	 * running. We need to send an IPI to force them to pick up any
+	 * change in PID and ACOP.
+	 */
+	if (atomic_read(&mm->mm_users) > 1)
+		smp_call_function(sync_cop, mm, 1);
+
+	if (free_pid != COP_PID_NONE)
+		free_cop_pid(free_pid);
+
+	spin_unlock(mm->context.cop_lockp);
+	spin_unlock(&mm->page_table_lock);
+}
+EXPORT_SYMBOL_GPL(drop_cop);
+
+static int acop_use_cop(int ct)
+{
+	/* There is no alternate policy, yet */
+	return -1;
+}
+
+/*
+ * Get the instruction word at the NIP
+ */
+static u32 acop_get_inst(struct pt_regs *regs)
+{
+	u32 inst;
+	u32 __user *p;
+
+	p = (u32 __user *)regs->nip;
+	if (!access_ok(VERIFY_READ, p, sizeof(*p)))
+		return 0;
+
+	if (__get_user(inst, p))
+		return 0;
+
+	return inst;
+}
+
+/**
+ * @regs: regsiters at time of interrupt
+ * @address: storage address
+ * @error_code: Fault code, usually the DSISR or ESR depending on
+ *		processor type
+ *
+ * Return 0 if we are able to resolve the data storage fault that
+ * results from a CT miss in the ACOP register.
+ */
+int acop_handle_fault(struct pt_regs *regs, unsigned long address,
+		      unsigned long error_code)
+{
+	int ct;
+	u32 inst = 0;
+
+	if (!cpu_has_feature(CPU_FTR_ICSWX)) {
+		pr_info("No coprocessors available");
+		_exception(SIGILL, regs, ILL_ILLOPN, address);
+	}
+
+	if (!user_mode(regs)) {
+		/* this could happen if the HV denies the
+		 * kernel access, for now we just die */
+		die("ICSWX from kernel failed", regs, SIGSEGV);
+	}
+
+	/* Some implementations leave us a hint for the CT */
+	ct = ICSWX_GET_CT_HINT(error_code);
+	if (ct < 0) {
+		/* we have to peek at the instruction word to figure out CT */
+		u32 ccw;
+		u32 rs;
+
+		inst = acop_get_inst(regs);
+		if (inst == 0)
+			return -1;
+
+		rs = (inst >> (31 - 10)) & 0x1f;
+		ccw = regs->gpr[rs];
+		ct = (ccw >> 16) & 0x3f;
+	}
+
+	/*
+	 * We could be here because another thread has enabled acop
+	 * but the ACOP register has yet to be updated.
+	 *
+	 * This should have been taken care of by the IPI to sync all
+	 * the threads (see smp_call_function(sync_cop, mm, 1)), but
+	 * that could take forever if there are a significant amount
+	 * of threads.
+	 *
+	 * Given the number of threads on some of these systems,
+	 * perhaps this is the best way to sync ACOP rather than whack
+	 * every thread with an IPI.
+	 */
+	if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
+		sync_cop(current->active_mm);
+		return 0;
+	}
+
+	/* check for alternate policy */
+	if (!acop_use_cop(ct))
+		return 0;
+
+	/* at this point the CT is unknown to the system */
+	pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
+		current->comm, current->pid, ct);
+
+	/* get inst if we don't already have it */
+	if (inst == 0) {
+		inst = acop_get_inst(regs);
+		if (inst == 0)
+			return -1;
+	}
+
+	/* Check if the instruction is the "record form" */
+	if (inst & 1) {
+		/*
+		 * the instruction is "record" form so we can reject
+		 * using CR0
+		 */
+		regs->ccr &= ~(0xful << 28);
+		regs->ccr |= ICSWX_RC_NOT_FOUND << 28;
+
+		/* Move on to the next instruction */
+		regs->nip += 4;
+	} else {
+		/*
+		 * There is no architected mechanism to report a bad
+		 * CT so we could either SIGILL or report nothing.
+		 * Since the non-record version should only bu used
+		 * for "hints" or "don't care" we should probably do
+		 * nothing.  However, I could see how some people
+		 * might want an SIGILL so it here if you want it.
+		 */
+#ifdef CONFIG_PPC_ICSWX_USE_SIGILL
+		_exception(SIGILL, regs, ILL_ILLOPN, address);
+#else
+		regs->nip += 4;
+#endif
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(acop_handle_fault);
diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h
new file mode 100644
index 00000000000..6dedc08e62c
--- /dev/null
+++ b/arch/powerpc/mm/icswx.h
@@ -0,0 +1,68 @@
+#ifndef _ARCH_POWERPC_MM_ICSWX_H_
+#define _ARCH_POWERPC_MM_ICSWX_H_
+
+/*
+ *  ICSWX and ACOP Management
+ *
+ *  Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/mmu_context.h>
+
+/* also used to denote that PIDs are not used */
+#define COP_PID_NONE 0
+
+static inline void sync_cop(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (mm == current->active_mm)
+		switch_cop(current->active_mm);
+}
+
+#ifdef CONFIG_PPC_ICSWX_PID
+extern int get_cop_pid(struct mm_struct *mm);
+extern int disable_cop_pid(struct mm_struct *mm);
+extern void free_cop_pid(int free_pid);
+#else
+#define get_cop_pid(m) (COP_PID_NONE)
+#define disable_cop_pid(m) (COP_PID_NONE)
+#define free_cop_pid(p)
+#endif
+
+/*
+ * These are implementation bits for architected registers.  If this
+ * ever becomes architecture the should be moved to reg.h et. al.
+ */
+/* UCT is the same bit for Server and Embedded */
+#define ICSWX_DSI_UCT		0x00004000  /* Unavailable Coprocessor Type */
+
+#ifdef CONFIG_PPC_BOOK3E
+/* Embedded implementation gives us no hints as to what the CT is */
+#define ICSWX_GET_CT_HINT(x) (-1)
+#else
+/* Server implementation contains the CT value in the DSISR */
+#define ICSWX_DSISR_CTMASK	0x00003f00
+#define ICSWX_GET_CT_HINT(x)	(((x) & ICSWX_DSISR_CTMASK) >> 8)
+#endif
+
+#define ICSWX_RC_STARTED	0x8	/* The request has been started */
+#define ICSWX_RC_NOT_IDLE	0x4	/* No coprocessor found idle */
+#define ICSWX_RC_NOT_FOUND	0x2	/* No coprocessor found */
+#define ICSWX_RC_UNDEFINED	0x1	/* Reserved */
+
+extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
+			     unsigned long error_code);
+
+static inline u64 acop_copro_type_bit(unsigned int type)
+{
+	return 1ULL << (63 - type);
+}
+
+#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */
diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c
new file mode 100644
index 00000000000..91e30eb7d05
--- /dev/null
+++ b/arch/powerpc/mm/icswx_pid.c
@@ -0,0 +1,87 @@
+/*
+ *  ICSWX and ACOP/PID Management
+ *
+ *  Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include "icswx.h"
+
+#define COP_PID_MIN (COP_PID_NONE + 1)
+#define COP_PID_MAX (0xFFFF)
+
+static DEFINE_SPINLOCK(mmu_context_acop_lock);
+static DEFINE_IDA(cop_ida);
+
+static int new_cop_pid(struct ida *ida, int min_id, int max_id,
+		       spinlock_t *lock)
+{
+	int index;
+	int err;
+
+again:
+	if (!ida_pre_get(ida, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(lock);
+	err = ida_get_new_above(ida, min_id, &index);
+	spin_unlock(lock);
+
+	if (err == -EAGAIN)
+		goto again;
+	else if (err)
+		return err;
+
+	if (index > max_id) {
+		spin_lock(lock);
+		ida_remove(ida, index);
+		spin_unlock(lock);
+		return -ENOMEM;
+	}
+
+	return index;
+}
+
+int get_cop_pid(struct mm_struct *mm)
+{
+	int pid;
+
+	if (mm->context.cop_pid == COP_PID_NONE) {
+		pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX,
+				  &mmu_context_acop_lock);
+		if (pid >= 0)
+			mm->context.cop_pid = pid;
+	}
+	return mm->context.cop_pid;
+}
+
+int disable_cop_pid(struct mm_struct *mm)
+{
+	int free_pid = COP_PID_NONE;
+
+	if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) {
+		free_pid = mm->context.cop_pid;
+		mm->context.cop_pid = COP_PID_NONE;
+	}
+	return free_pid;
+}
+
+void free_cop_pid(int free_pid)
+{
+	spin_lock(&mmu_context_acop_lock);
+	ida_remove(&cop_ida, free_pid);
+	spin_unlock(&mmu_context_acop_lock);
+}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 977cb1ee5e7..cff59f1bec2 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -30,37 +30,48 @@
 #include <linux/highmem.h>
 #include <linux/initrd.h>
 #include <linux/pagemap.h>
+#include <linux/memblock.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
-#include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/smp.h>
 #include <asm/machdep.h>
 #include <asm/btext.h>
 #include <asm/tlb.h>
-#include <asm/lmb.h>
 #include <asm/sections.h>
+#include <asm/hugetlb.h>
 
 #include "mmu_decl.h"
 
 #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
-/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */
-#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE))
-#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
+#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START"
 #endif
 #endif
 #define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+phys_addr_t total_memory;
+phys_addr_t total_lowmem;
 
-unsigned long total_memory;
-unsigned long total_lowmem;
+phys_addr_t memstart_addr = (phys_addr_t)~0ull;
+EXPORT_SYMBOL(memstart_addr);
+phys_addr_t kernstart_addr;
+EXPORT_SYMBOL(kernstart_addr);
 
-unsigned long ppc_memstart;
-unsigned long ppc_memoffset = PAGE_OFFSET;
+#ifdef CONFIG_RELOCATABLE_PPC32
+/* Used in __va()/__pa() */
+long long virt_phys_offset;
+EXPORT_SYMBOL(virt_phys_offset);
+#endif
+
+phys_addr_t lowmem_end_addr;
 
 int boot_mapsize;
 #ifdef CONFIG_PPC_PMAC
@@ -68,21 +79,11 @@ unsigned long agp_special_page;
 EXPORT_SYMBOL(agp_special_page);
 #endif
 
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-pgprot_t kmap_prot;
-
-EXPORT_SYMBOL(kmap_prot);
-EXPORT_SYMBOL(kmap_pte);
-#endif
-
 void MMU_init(void);
 
 /* XXX should be in current.h  -- paulus */
 extern struct task_struct *current_set[NR_CPUS];
 
-extern int init_bootmem_done;
-
 /*
  * this tells the system to map all of ram with the segregs
  * (i.e. page tables) instead of the bats.
@@ -91,14 +92,13 @@ extern int init_bootmem_done;
 int __map_without_bats;
 int __map_without_ltlbs;
 
-/* max amount of low RAM to map in */
-unsigned long __max_low_memory = MAX_LOW_MEM;
-
 /*
- * limit of what is accessible with initial MMU setup -
- * 256MB usually, but only 16MB on 601.
+ * This tells the system to allow ioremapping memory marked as reserved.
  */
-unsigned long __initial_memory_limit = 0x10000000;
+int __allow_ioremap_reserved;
+
+/* max amount of low RAM to map in */
+unsigned long __max_low_memory = MAX_LOW_MEM;
 
 /*
  * Check for command-line options that affect what MMU_init will do.
@@ -129,24 +129,26 @@ void __init MMU_init(void)
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:enter", 0x111);
 
-	/* 601 can only access 16MB at the moment */
-	if (PVR_VER(mfspr(SPRN_PVR)) == 1)
-		__initial_memory_limit = 0x01000000;
-	/* 8xx can only access 8MB at the moment */
-	if (PVR_VER(mfspr(SPRN_PVR)) == 0x50)
-		__initial_memory_limit = 0x00800000;
-
 	/* parse args from command line */
 	MMU_setup();
 
-	if (lmb.memory.cnt > 1) {
-		lmb.memory.cnt = 1;
-		lmb_analyze();
+	/*
+	 * Reserve gigantic pages for hugetlb.  This MUST occur before
+	 * lowmem_end_addr is initialized below.
+	 */
+	reserve_hugetlb_gpages();
+
+	if (memblock.memory.cnt > 1) {
+#ifndef CONFIG_WII
+		memblock_enforce_memory_limit(memblock.memory.regions[0].size);
 		printk(KERN_WARNING "Only using first contiguous memory region");
+#else
+		wii_memory_fixups();
+#endif
 	}
 
-	total_memory = lmb_end_of_DRAM();
-	total_lowmem = total_memory;
+	total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr;
+	lowmem_end_addr = memstart_addr + total_lowmem;
 
 #ifdef CONFIG_FSL_BOOKE
 	/* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
@@ -157,10 +159,10 @@ void __init MMU_init(void)
 
 	if (total_lowmem > __max_low_memory) {
 		total_lowmem = __max_low_memory;
+		lowmem_end_addr = memstart_addr + total_lowmem;
 #ifndef CONFIG_HIGHMEM
 		total_memory = total_lowmem;
-		lmb_enforce_memory_limit(total_lowmem);
-		lmb_analyze();
+		memblock_enforce_memory_limit(total_lowmem);
 #endif /* CONFIG_HIGHMEM */
 	}
 
@@ -174,21 +176,12 @@ void __init MMU_init(void)
 		ppc_md.progress("MMU:mapin", 0x301);
 	mapin_ram();
 
-#ifdef CONFIG_HIGHMEM
-	ioremap_base = PKMAP_BASE;
-#else
-	ioremap_base = 0xfe000000UL;	/* for now, could be 0xfffff000 */
-#endif /* CONFIG_HIGHMEM */
-	ioremap_bot = ioremap_base;
+	/* Initialize early top-down ioremap allocator */
+	ioremap_bot = IOREMAP_TOP;
 
 	/* Map in I/O resources */
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:setio", 0x302);
-	if (ppc_md.setup_io_mappings)
-		ppc_md.setup_io_mappings();
-
-	/* Initialize the context management stuff */
-	mmu_context_init();
 
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:exit", 0x211);
@@ -197,101 +190,35 @@ void __init MMU_init(void)
 #ifdef CONFIG_BOOTX_TEXT
 	btext_unmap();
 #endif
+
+	/* Shortly after that, the entire linear mapping will be available */
+	memblock_set_current_limit(lowmem_end_addr);
 }
 
 /* This is only called until mem_init is done. */
 void __init *early_get_page(void)
 {
-	void *p;
-
-	if (init_bootmem_done) {
-		p = alloc_bootmem_pages(PAGE_SIZE);
-	} else {
-		p = __va(lmb_alloc_base(PAGE_SIZE, PAGE_SIZE,
-					__initial_memory_limit));
-	}
-	return p;
-}
-
-/* Free up now-unused memory */
-static void free_sec(unsigned long start, unsigned long end, const char *name)
-{
-	unsigned long cnt = 0;
-
-	while (start < end) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		cnt++;
-		start += PAGE_SIZE;
- 	}
-	if (cnt) {
-		printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name);
-		totalram_pages += cnt;
-	}
-}
-
-void free_initmem(void)
-{
-#define FREESEC(TYPE) \
-	free_sec((unsigned long)(&__ ## TYPE ## _begin), \
-		 (unsigned long)(&__ ## TYPE ## _end), \
-		 #TYPE);
-
-	printk ("Freeing unused kernel memory:");
-	FREESEC(init);
- 	printk("\n");
-	ppc_md.progress = NULL;
-#undef FREESEC
+	if (init_bootmem_done)
+		return alloc_bootmem_pages(PAGE_SIZE);
+	else
+		return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
 {
-	if (start < end)
-		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-	for (; start < end; start += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		totalram_pages++;
-	}
-}
+	/* We don't currently support the first MEMBLOCK not mapping 0
+	 * physical on those processors
+	 */
+	BUG_ON(first_memblock_base != 0);
+
+#ifdef CONFIG_PIN_TLB
+	/* 8xx can only access 24MB at the moment */
+	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
+#else
+	/* 8xx can only access 8MB at the moment */
+	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
 #endif
-
-#ifdef CONFIG_PROC_KCORE
-static struct kcore_list kcore_vmem;
-
-static int __init setup_kcore(void)
-{
-	int i;
-
-	for (i = 0; i < lmb.memory.cnt; i++) {
-		unsigned long base;
-		unsigned long size;
-		struct kcore_list *kcore_mem;
-
-		base = lmb.memory.region[i].base;
-		size = lmb.memory.region[i].size;
-
-		kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
-		if (!kcore_mem)
-			panic("%s: kmalloc failed\n", __FUNCTION__);
-
-		/* must stay under 32 bits */
-		if ( 0xfffffffful - (unsigned long)__va(base) < size) {
-			size = 0xfffffffful - (unsigned long)(__va(base));
-			printk(KERN_DEBUG "setup_kcore: restrict size=%lx\n",
-						size);
-		}
-
-		kclist_add(kcore_mem, __va(base), size);
-	}
-
-	kclist_add(&kcore_vmem, (void *)VMALLOC_START,
-		VMALLOC_END-VMALLOC_START);
-
-	return 0;
 }
-module_init(setup_kcore);
-#endif
+#endif /* CONFIG_8xx */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c0f5cff7703..e3734edffa6 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -19,6 +19,8 @@
  *
  */
 
+#undef DEBUG
+
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
@@ -38,11 +40,13 @@
 #include <linux/nodemask.h>
 #include <linux/module.h>
 #include <linux/poison.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
 #include <asm/prom.h>
-#include <asm/lmb.h>
 #include <asm/rtas.h>
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -57,13 +61,12 @@
 #include <asm/mmzone.h>
 #include <asm/cputable.h>
 #include <asm/sections.h>
-#include <asm/system.h>
 #include <asm/iommu.h>
-#include <asm/abs_addr.h>
 #include <asm/vdso.h>
 
 #include "mmu_decl.h"
 
+#ifdef CONFIG_PPC_STD_MMU_64
 #if PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
@@ -71,109 +74,82 @@
 #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
 #warning TASK_SIZE is smaller than it needs to be.
 #endif
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
-/* max amount of RAM to use */
-unsigned long __max_memory;
+phys_addr_t memstart_addr = ~0;
+EXPORT_SYMBOL_GPL(memstart_addr);
+phys_addr_t kernstart_addr;
+EXPORT_SYMBOL_GPL(kernstart_addr);
 
-void free_initmem(void)
+static void pgd_ctor(void *addr)
 {
-	unsigned long addr;
-
-	addr = (unsigned long)__init_begin;
-	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		free_page(addr);
-		totalram_pages++;
-	}
-	printk ("Freeing unused kernel memory: %luk freed\n",
-		((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
+	memset(addr, 0, PGD_TABLE_SIZE);
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+static void pmd_ctor(void *addr)
 {
-	if (start < end)
-		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-	for (; start < end; start += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		totalram_pages++;
-	}
-}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
+	memset(addr, 0, PMD_TABLE_SIZE);
 #endif
-
-#ifdef CONFIG_PROC_KCORE
-static struct kcore_list kcore_vmem;
-
-static int __init setup_kcore(void)
-{
-	int i;
-
-	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long base, size;
-		struct kcore_list *kcore_mem;
-
-		base = lmb.memory.region[i].base;
-		size = lmb.memory.region[i].size;
-
-		/* GFP_ATOMIC to avoid might_sleep warnings during boot */
-		kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
-		if (!kcore_mem)
-			panic("%s: kmalloc failed\n", __FUNCTION__);
-
-		kclist_add(kcore_mem, __va(base), size);
-	}
-
-	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
-
-	return 0;
 }
-module_init(setup_kcore);
-#endif
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables.  This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else.  Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 {
-	memset(addr, 0, kmem_cache_size(cache));
+	char *name;
+	unsigned long table_size = sizeof(void *) << shift;
+	unsigned long align = table_size;
+
+	/* When batching pgtable pointers for RCU freeing, we store
+	 * the index size in the low bits.  Table alignment must be
+	 * big enough to fit it.
+	 *
+	 * Likewise, hugeapge pagetable pointers contain a (different)
+	 * shift value in the low bits.  All tables must be aligned so
+	 * as to leave enough 0 bits in the address to contain it. */
+	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
+				     HUGEPD_SHIFT_MASK + 1);
+	struct kmem_cache *new;
+
+	/* It would be nice if this was a BUILD_BUG_ON(), but at the
+	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
+	 * constant expression, so so much for that. */
+	BUG_ON(!is_power_of_2(minalign));
+	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+	if (PGT_CACHE(shift))
+		return; /* Already have a cache of this size */
+
+	align = max_t(unsigned long, align, minalign);
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+	new = kmem_cache_create(name, table_size, align, 0, ctor);
+	pgtable_cache[shift - 1] = new;
+	pr_debug("Allocated pgtable cache for order %d\n", shift);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-	PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-	"pgd_cache", "pmd_cache",
-#else
-	"pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
- * can't put into the tables above, because HPAGE_SHIFT is not compile
- * time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
 
 void pgtable_cache_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
-		int size = pgtable_cache_size[i];
-		const char *name = pgtable_cache_name[i];
-
-		pr_debug("Allocating page table cache %s (#%d) "
-			"for size: %08x...\n", name, i, size);
-		pgtable_cache[i] = kmem_cache_create(name,
-						     size, size,
-						     SLAB_PANIC,
-						     zero_ctor);
-	}
+	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
+		panic("Couldn't allocate pgtable caches");
+	/* In all current configs, when the PUD index exists it's the
+	 * same size as either the pgd or pmd index.  Verify that the
+	 * initialization above has also created a PUD cache.  This
+	 * will need re-examiniation if we add new possibilities for
+	 * the pagetable layout. */
+	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -183,7 +159,7 @@ void pgtable_cache_init(void)
  * do this by hand as the proffered address may not be correctly aligned.
  * Subtraction of non-aligned pointers produces undefined results.
  */
-unsigned long __meminit vmemmap_section_start(unsigned long page)
+static unsigned long __meminit vmemmap_section_start(unsigned long page)
 {
 	unsigned long offset = page - ((unsigned long)(vmemmap));
 
@@ -196,7 +172,7 @@ unsigned long __meminit vmemmap_section_start(unsigned long page)
  * which overlaps this vmemmap page is initialised then this page is
  * initialised already.
  */
-int __meminit vmemmap_populated(unsigned long start, int page_size)
+static int __meminit vmemmap_populated(unsigned long start, int page_size)
 {
 	unsigned long end = start + page_size;
 
@@ -207,21 +183,99 @@ int __meminit vmemmap_populated(unsigned long start, int page_size)
 	return 0;
 }
 
-int __meminit vmemmap_populate(struct page *start_page,
-					unsigned long nr_pages, int node)
+/* On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+
+#ifdef CONFIG_PPC_BOOK3E
+static void __meminit vmemmap_create_mapping(unsigned long start,
+					     unsigned long page_size,
+					     unsigned long phys)
+{
+	/* Create a PTE encoding without page size */
+	unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+		_PAGE_KERNEL_RW;
+
+	/* PTEs only contain page size encodings up to 32M */
+	BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+
+	/* Encode the size in the PTE */
+	flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+
+	/* For each PTE for that area, map things. Note that we don't
+	 * increment phys because all PTEs are of the large size and
+	 * thus must have the low bits clear
+	 */
+	for (i = 0; i < page_size; i += PAGE_SIZE)
+		BUG_ON(map_kernel_page(start + i, phys, flags));
+}
+#else /* CONFIG_PPC_BOOK3E */
+static void __meminit vmemmap_create_mapping(unsigned long start,
+					     unsigned long page_size,
+					     unsigned long phys)
+{
+	int  mapped = htab_bolt_mapping(start, start + page_size, phys,
+					pgprot_val(PAGE_KERNEL),
+					mmu_vmemmap_psize,
+					mmu_kernel_ssize);
+	BUG_ON(mapped < 0);
+}
+#endif /* CONFIG_PPC_BOOK3E */
+
+struct vmemmap_backing *vmemmap_list;
+
+static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
 {
-	unsigned long mode_rw;
-	unsigned long start = (unsigned long)start_page;
-	unsigned long end = (unsigned long)(start_page + nr_pages);
-	unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift;
+	static struct vmemmap_backing *next;
+	static int num_left;
+
+	/* allocate a page when required and hand out chunks */
+	if (!next || !num_left) {
+		next = vmemmap_alloc_block(PAGE_SIZE, node);
+		if (unlikely(!next)) {
+			WARN_ON(1);
+			return NULL;
+		}
+		num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
+	}
 
-	mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+	num_left--;
+
+	return next++;
+}
+
+static __meminit void vmemmap_list_populate(unsigned long phys,
+					    unsigned long start,
+					    int node)
+{
+	struct vmemmap_backing *vmem_back;
+
+	vmem_back = vmemmap_list_alloc(node);
+	if (unlikely(!vmem_back)) {
+		WARN_ON(1);
+		return;
+	}
+
+	vmem_back->phys = phys;
+	vmem_back->virt_addr = start;
+	vmem_back->list = vmemmap_list;
+
+	vmemmap_list = vmem_back;
+}
+
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 
 	/* Align to the page size of the linear mapping. */
 	start = _ALIGN_DOWN(start, page_size);
 
+	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
+
 	for (; start < end; start += page_size) {
-		int mapped;
 		void *p;
 
 		if (vmemmap_populated(start, page_size))
@@ -231,15 +285,73 @@ int __meminit vmemmap_populate(struct page *start_page,
 		if (!p)
 			return -ENOMEM;
 
-		pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n",
-			start, p, __pa(p));
+		vmemmap_list_populate(__pa(p), start, node);
 
-		mapped = htab_bolt_mapping(start, start + page_size,
-					__pa(p), mode_rw, mmu_linear_psize,
-					mmu_kernel_ssize);
-		BUG_ON(mapped < 0);
+		pr_debug("      * %016lx..%016lx allocated at %p\n",
+			 start, start + page_size, p);
+
+		vmemmap_create_mapping(start, page_size, __pa(p));
 	}
 
 	return 0;
 }
-#endif
+
+void vmemmap_free(unsigned long start, unsigned long end)
+{
+}
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+}
+
+/*
+ * We do not have access to the sparsemem vmemmap, so we fallback to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * realmode_pfn_to_page functions can fail due to:
+ * 1) As real sparsemem blocks do not lay in RAM continously (they
+ * are in virtual address space which is not available in the real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+	struct vmemmap_backing *vmem_back;
+	struct page *page;
+	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+	unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+		if (pg_va < vmem_back->virt_addr)
+			continue;
+
+		/* Check that page struct is not split between real pages */
+		if ((pg_va + sizeof(struct page)) >
+				(vmem_back->virt_addr + page_size))
+			return NULL;
+
+		page = (struct page *) (vmem_back->phys + pg_va -
+				vmem_back->virt_addr);
+		return page;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+	return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
diff --git a/arch/powerpc/mm/lmb.c b/arch/powerpc/mm/lmb.c
deleted file mode 100644
index 4ce23bcf8a5..00000000000
--- a/arch/powerpc/mm/lmb.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Procedures for maintaining information about logical memory blocks.
- *
- * Peter Bergner, IBM Corp.	June 2001.
- * Copyright (C) 2001 Peter Bergner.
- * 
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <asm/types.h>
-#include <asm/page.h>
-#include <asm/prom.h>
-#include <asm/lmb.h>
-#ifdef CONFIG_PPC32
-#include "mmu_decl.h"		/* for __max_low_memory */
-#endif
-
-#undef DEBUG
-
-#ifdef DEBUG
-#include <asm/udbg.h>
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-#define LMB_ALLOC_ANYWHERE	0
-
-struct lmb lmb;
-
-void lmb_dump_all(void)
-{
-#ifdef DEBUG
-	unsigned long i;
-
-	DBG("lmb_dump_all:\n");
-	DBG("    memory.cnt		  = 0x%lx\n", lmb.memory.cnt);
-	DBG("    memory.size		  = 0x%lx\n", lmb.memory.size);
-	for (i=0; i < lmb.memory.cnt ;i++) {
-		DBG("    memory.region[0x%x].base       = 0x%lx\n",
-			    i, lmb.memory.region[i].base);
-		DBG("		      .size     = 0x%lx\n",
-			    lmb.memory.region[i].size);
-	}
-
-	DBG("\n    reserved.cnt	  = 0x%lx\n", lmb.reserved.cnt);
-	DBG("    reserved.size	  = 0x%lx\n", lmb.reserved.size);
-	for (i=0; i < lmb.reserved.cnt ;i++) {
-		DBG("    reserved.region[0x%x].base       = 0x%lx\n",
-			    i, lmb.reserved.region[i].base);
-		DBG("		      .size     = 0x%lx\n",
-			    lmb.reserved.region[i].size);
-	}
-#endif /* DEBUG */
-}
-
-static unsigned long __init lmb_addrs_overlap(unsigned long base1,
-		unsigned long size1, unsigned long base2, unsigned long size2)
-{
-	return ((base1 < (base2+size2)) && (base2 < (base1+size1)));
-}
-
-static long __init lmb_addrs_adjacent(unsigned long base1, unsigned long size1,
-		unsigned long base2, unsigned long size2)
-{
-	if (base2 == base1 + size1)
-		return 1;
-	else if (base1 == base2 + size2)
-		return -1;
-
-	return 0;
-}
-
-static long __init lmb_regions_adjacent(struct lmb_region *rgn,
-		unsigned long r1, unsigned long r2)
-{
-	unsigned long base1 = rgn->region[r1].base;
-	unsigned long size1 = rgn->region[r1].size;
-	unsigned long base2 = rgn->region[r2].base;
-	unsigned long size2 = rgn->region[r2].size;
-
-	return lmb_addrs_adjacent(base1, size1, base2, size2);
-}
-
-static void __init lmb_remove_region(struct lmb_region *rgn, unsigned long r)
-{
-	unsigned long i;
-
-	for (i = r; i < rgn->cnt - 1; i++) {
-		rgn->region[i].base = rgn->region[i + 1].base;
-		rgn->region[i].size = rgn->region[i + 1].size;
-	}
-	rgn->cnt--;
-}
-
-/* Assumption: base addr of region 1 < base addr of region 2 */
-static void __init lmb_coalesce_regions(struct lmb_region *rgn,
-		unsigned long r1, unsigned long r2)
-{
-	rgn->region[r1].size += rgn->region[r2].size;
-	lmb_remove_region(rgn, r2);
-}
-
-/* This routine called with relocation disabled. */
-void __init lmb_init(void)
-{
-	/* Create a dummy zero size LMB which will get coalesced away later.
-	 * This simplifies the lmb_add() code below...
-	 */
-	lmb.memory.region[0].base = 0;
-	lmb.memory.region[0].size = 0;
-	lmb.memory.cnt = 1;
-
-	/* Ditto. */
-	lmb.reserved.region[0].base = 0;
-	lmb.reserved.region[0].size = 0;
-	lmb.reserved.cnt = 1;
-}
-
-/* This routine may be called with relocation disabled. */
-void __init lmb_analyze(void)
-{
-	int i;
-
-	lmb.memory.size = 0;
-
-	for (i = 0; i < lmb.memory.cnt; i++)
-		lmb.memory.size += lmb.memory.region[i].size;
-}
-
-/* This routine called with relocation disabled. */
-static long __init lmb_add_region(struct lmb_region *rgn, unsigned long base,
-				  unsigned long size)
-{
-	unsigned long coalesced = 0;
-	long adjacent, i;
-
-	/* First try and coalesce this LMB with another. */
-	for (i=0; i < rgn->cnt; i++) {
-		unsigned long rgnbase = rgn->region[i].base;
-		unsigned long rgnsize = rgn->region[i].size;
-
-		if ((rgnbase == base) && (rgnsize == size))
-			/* Already have this region, so we're done */
-			return 0;
-
-		adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize);
-		if ( adjacent > 0 ) {
-			rgn->region[i].base -= size;
-			rgn->region[i].size += size;
-			coalesced++;
-			break;
-		}
-		else if ( adjacent < 0 ) {
-			rgn->region[i].size += size;
-			coalesced++;
-			break;
-		}
-	}
-
-	if ((i < rgn->cnt-1) && lmb_regions_adjacent(rgn, i, i+1) ) {
-		lmb_coalesce_regions(rgn, i, i+1);
-		coalesced++;
-	}
-
-	if (coalesced)
-		return coalesced;
-	if (rgn->cnt >= MAX_LMB_REGIONS)
-		return -1;
-
-	/* Couldn't coalesce the LMB, so add it to the sorted table. */
-	for (i = rgn->cnt-1; i >= 0; i--) {
-		if (base < rgn->region[i].base) {
-			rgn->region[i+1].base = rgn->region[i].base;
-			rgn->region[i+1].size = rgn->region[i].size;
-		} else {
-			rgn->region[i+1].base = base;
-			rgn->region[i+1].size = size;
-			break;
-		}
-	}
-	rgn->cnt++;
-
-	return 0;
-}
-
-/* This routine may be called with relocation disabled. */
-long __init lmb_add(unsigned long base, unsigned long size)
-{
-	struct lmb_region *_rgn = &(lmb.memory);
-
-	/* On pSeries LPAR systems, the first LMB is our RMO region. */
-	if (base == 0)
-		lmb.rmo_size = size;
-
-	return lmb_add_region(_rgn, base, size);
-
-}
-
-long __init lmb_reserve(unsigned long base, unsigned long size)
-{
-	struct lmb_region *_rgn = &(lmb.reserved);
-
-	BUG_ON(0 == size);
-
-	return lmb_add_region(_rgn, base, size);
-}
-
-long __init lmb_overlaps_region(struct lmb_region *rgn, unsigned long base,
-				unsigned long size)
-{
-	unsigned long i;
-
-	for (i=0; i < rgn->cnt; i++) {
-		unsigned long rgnbase = rgn->region[i].base;
-		unsigned long rgnsize = rgn->region[i].size;
-		if ( lmb_addrs_overlap(base,size,rgnbase,rgnsize) ) {
-			break;
-		}
-	}
-
-	return (i < rgn->cnt) ? i : -1;
-}
-
-unsigned long __init lmb_alloc(unsigned long size, unsigned long align)
-{
-	return lmb_alloc_base(size, align, LMB_ALLOC_ANYWHERE);
-}
-
-unsigned long __init lmb_alloc_base(unsigned long size, unsigned long align,
-				    unsigned long max_addr)
-{
-	unsigned long alloc;
-
-	alloc = __lmb_alloc_base(size, align, max_addr);
-
-	if (alloc == 0)
-		panic("ERROR: Failed to allocate 0x%lx bytes below 0x%lx.\n",
-				size, max_addr);
-
-	return alloc;
-}
-
-unsigned long __init __lmb_alloc_base(unsigned long size, unsigned long align,
-				    unsigned long max_addr)
-{
-	long i, j;
-	unsigned long base = 0;
-
-	BUG_ON(0 == size);
-
-#ifdef CONFIG_PPC32
-	/* On 32-bit, make sure we allocate lowmem */
-	if (max_addr == LMB_ALLOC_ANYWHERE)
-		max_addr = __max_low_memory;
-#endif
-	for (i = lmb.memory.cnt-1; i >= 0; i--) {
-		unsigned long lmbbase = lmb.memory.region[i].base;
-		unsigned long lmbsize = lmb.memory.region[i].size;
-
-		if (max_addr == LMB_ALLOC_ANYWHERE)
-			base = _ALIGN_DOWN(lmbbase + lmbsize - size, align);
-		else if (lmbbase < max_addr) {
-			base = min(lmbbase + lmbsize, max_addr);
-			base = _ALIGN_DOWN(base - size, align);
-		} else
-			continue;
-
-		while ((lmbbase <= base) &&
-		       ((j = lmb_overlaps_region(&lmb.reserved, base, size)) >= 0) )
-			base = _ALIGN_DOWN(lmb.reserved.region[j].base - size,
-					   align);
-
-		if ((base != 0) && (lmbbase <= base))
-			break;
-	}
-
-	if (i < 0)
-		return 0;
-
-	lmb_add_region(&lmb.reserved, base, size);
-
-	return base;
-}
-
-/* You must call lmb_analyze() before this. */
-unsigned long __init lmb_phys_mem_size(void)
-{
-	return lmb.memory.size;
-}
-
-unsigned long __init lmb_end_of_DRAM(void)
-{
-	int idx = lmb.memory.cnt - 1;
-
-	return (lmb.memory.region[idx].base + lmb.memory.region[idx].size);
-}
-
-/* You must call lmb_analyze() after this. */
-void __init lmb_enforce_memory_limit(unsigned long memory_limit)
-{
-	unsigned long i, limit;
-	struct lmb_property *p;
-
-	if (! memory_limit)
-		return;
-
-	/* Truncate the lmb regions to satisfy the memory limit. */
-	limit = memory_limit;
-	for (i = 0; i < lmb.memory.cnt; i++) {
-		if (limit > lmb.memory.region[i].size) {
-			limit -= lmb.memory.region[i].size;
-			continue;
-		}
-
-		lmb.memory.region[i].size = limit;
-		lmb.memory.cnt = i + 1;
-		break;
-	}
-
-	if (lmb.memory.region[0].size < lmb.rmo_size)
-		lmb.rmo_size = lmb.memory.region[0].size;
-
-	/* And truncate any reserves above the limit also. */
-	for (i = 0; i < lmb.reserved.cnt; i++) {
-		p = &lmb.reserved.region[i];
-
-		if (p->base > memory_limit)
-			p->size = 0;
-		else if ((p->base + p->size) > memory_limit)
-			p->size = memory_limit - p->base;
-
-		if (p->size == 0) {
-			lmb_remove_region(&lmb.reserved, i);
-			i--;
-		}
-	}
-}
-
-int __init lmb_is_reserved(unsigned long addr)
-{
-	int i;
-
-	for (i = 0; i < lmb.reserved.cnt; i++) {
-		unsigned long upper = lmb.reserved.region[i].base +
-				      lmb.reserved.region[i].size - 1;
-		if ((addr >= lmb.reserved.region[i].base) && (addr <= upper))
-			return 1;
-	}
-	return 0;
-}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index be5c506779a..2c8e90f5789 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -17,11 +17,12 @@
  *
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/gfp.h>
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
@@ -31,6 +32,9 @@
 #include <linux/initrd.h>
 #include <linux/pagemap.h>
 #include <linux/suspend.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -42,9 +46,12 @@
 #include <asm/machdep.h>
 #include <asm/btext.h>
 #include <asm/tlb.h>
-#include <asm/lmb.h>
 #include <asm/sections.h>
+#include <asm/sparsemem.h>
 #include <asm/vdso.h>
+#include <asm/fixmap.h>
+#include <asm/swiotlb.h>
+#include <asm/rtas.h>
 
 #include "mmu_decl.h"
 
@@ -55,27 +62,32 @@
 
 int init_bootmem_done;
 int mem_init_done;
-unsigned long memory_limit;
+unsigned long long memory_limit;
 
-int page_is_ram(unsigned long pfn)
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+EXPORT_SYMBOL(kmap_pte);
+pgprot_t kmap_prot;
+EXPORT_SYMBOL(kmap_prot);
+
+static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
-	unsigned long paddr = (pfn << PAGE_SHIFT);
+	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr);
+}
+#endif
 
+int page_is_ram(unsigned long pfn)
+{
 #ifndef CONFIG_PPC64	/* XXX for now */
-	return paddr < __pa(high_memory);
+	return pfn < max_pfn;
 #else
-	int i;
-	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long base;
-
-		base = lmb.memory.region[i].base;
+	unsigned long paddr = (pfn << PAGE_SHIFT);
+	struct memblock_region *reg;
 
-		if ((paddr >= base) &&
-			(paddr < (base + lmb.memory.region[i].size))) {
+	for_each_memblock(memory, reg)
+		if (paddr >= reg->base && paddr < (reg->base + reg->size))
 			return 1;
-		}
-	}
-
 	return 0;
 #endif
 }
@@ -87,23 +99,14 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 		return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
 
 	if (!page_is_ram(pfn))
-		vma_prot = __pgprot(pgprot_val(vma_prot)
-				    | _PAGE_GUARDED | _PAGE_NO_CACHE);
+		vma_prot = pgprot_noncached(vma_prot);
+
 	return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 
-void online_page(struct page *page)
-{
-	ClearPageReserved(page);
-	init_page_count(page);
-	__free_page(page);
-	totalram_pages++;
-	num_physpages++;
-}
-
 #ifdef CONFIG_NUMA
 int memory_add_physaddr_to_nid(u64 start)
 {
@@ -111,7 +114,7 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int __devinit arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
@@ -121,88 +124,60 @@ int __devinit arch_add_memory(int nid, u64 start, u64 size)
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
-	create_section_mapping(start, start + size);
+	if (create_section_mapping(start, start + size))
+		return -EINVAL;
 
 	/* this should work for most non-highmem platforms */
 	zone = pgdata->node_zones;
 
-	return __add_pages(zone, start_pfn, nr_pages);
+	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int remove_memory(u64 start, u64 size)
+int arch_remove_memory(u64 start, u64 size)
 {
-	unsigned long start_pfn, end_pfn;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
 	int ret;
 
-	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = start_pfn + (size >> PAGE_SHIFT);
-	ret = offline_pages(start_pfn, end_pfn, 120 * HZ);
-	if (ret)
-		goto out;
-	/* Arch-specific calls go here - next patch */
-out:
+	zone = page_zone(pfn_to_page(start_pfn));
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	if (!ret && (ppc_md.remove_memory))
+		ret = ppc_md.remove_memory(start, size);
+
 	return ret;
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
  * walk_memory_resource() needs to make sure there is no holes in a given
- * memory range. On PPC64, since this range comes from /sysfs, the range
- * is guaranteed to be valid, non-overlapping and can not contain any
- * holes. By the time we get here (memory add or remove), /proc/device-tree
- * is updated and correct. Only reason we need to check against device-tree
- * would be if we allow user-land to specify a memory range through a
- * system call/ioctl etc. instead of doing offline/online through /sysfs.
+ * memory range.  PPC64 does not maintain the memory layout in /proc/iomem.
+ * Instead it maintains it in memblock.memory structures.  Walk through the
+ * memory regions, find holes and callback for contiguous regions.
  */
 int
-walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
-			int (*func)(unsigned long, unsigned long, void *))
+walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+		void *arg, int (*func)(unsigned long, unsigned long, void *))
 {
-	return  (*func)(start_pfn, nr_pages, arg);
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-void show_mem(void)
-{
-	unsigned long total = 0, reserved = 0;
-	unsigned long shared = 0, cached = 0;
-	unsigned long highmem = 0;
-	struct page *page;
-	pg_data_t *pgdat;
-	unsigned long i;
-
-	printk("Mem-info:\n");
-	show_free_areas();
-	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_online_pgdat(pgdat) {
-		unsigned long flags;
-		pgdat_resize_lock(pgdat, &flags);
-		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-			page = pgdat_page_nr(pgdat, i);
-			total++;
-			if (PageHighMem(page))
-				highmem++;
-			if (PageReserved(page))
-				reserved++;
-			else if (PageSwapCache(page))
-				cached++;
-			else if (page_count(page))
-				shared += page_count(page) - 1;
-		}
-		pgdat_resize_unlock(pgdat, &flags);
+	struct memblock_region *reg;
+	unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long tstart, tend;
+	int ret = -1;
+
+	for_each_memblock(memory, reg) {
+		tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+		tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+		if (tstart >= tend)
+			continue;
+		ret = (*func)(tstart, tend - tstart, arg);
+		if (ret)
+			break;
 	}
-	printk("%ld pages of RAM\n", total);
-#ifdef CONFIG_HIGHMEM
-	printk("%ld pages of HIGHMEM\n", highmem);
-#endif
-	printk("%ld reserved pages\n", reserved);
-	printk("%ld pages shared\n", shared);
-	printk("%ld pages swap cached\n", cached);
+	return ret;
 }
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
 
 /*
  * Initialize the bootmem system and give it all the memory we
@@ -212,14 +187,16 @@ void show_mem(void)
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 void __init do_init_bootmem(void)
 {
-	unsigned long i;
 	unsigned long start, bootmap_pages;
 	unsigned long total_pages;
+	struct memblock_region *reg;
 	int boot_mapsize;
 
-	max_pfn = total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
+	total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT;
 #ifdef CONFIG_HIGHMEM
 	total_pages = total_lowmem >> PAGE_SHIFT;
+	max_low_pfn = lowmem_end_addr >> PAGE_SHIFT;
 #endif
 
 	/*
@@ -229,48 +206,38 @@ void __init do_init_bootmem(void)
 	 */
 	bootmap_pages = bootmem_bootmap_pages(total_pages);
 
-	start = lmb_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
+	start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
 
-	boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
+	min_low_pfn = MEMORY_START >> PAGE_SHIFT;
+	boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
 
-	/* Add active regions with valid PFNs */
-	for (i = 0; i < lmb.memory.cnt; i++) {
-		unsigned long start_pfn, end_pfn;
-		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
-		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
-		add_active_range(0, start_pfn, end_pfn);
-	}
+	/* Place all memblock_regions in the same node and merge contiguous
+	 * memblock_regions
+	 */
+	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
 
 	/* Add all physical memory to the bootmem map, mark each area
 	 * present.
 	 */
 #ifdef CONFIG_HIGHMEM
-	free_bootmem_with_active_regions(0, total_lowmem >> PAGE_SHIFT);
+	free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT);
 
 	/* reserve the sections we're already using */
-	for (i = 0; i < lmb.reserved.cnt; i++) {
-		unsigned long addr = lmb.reserved.region[i].base +
-				     lmb_size_bytes(&lmb.reserved, i) - 1;
-		if (addr < total_lowmem)
-			reserve_bootmem(lmb.reserved.region[i].base,
-					lmb_size_bytes(&lmb.reserved, i),
-					BOOTMEM_DEFAULT);
-		else if (lmb.reserved.region[i].base < total_lowmem) {
-			unsigned long adjusted_size = total_lowmem -
-				      lmb.reserved.region[i].base;
-			reserve_bootmem(lmb.reserved.region[i].base,
-					adjusted_size, BOOTMEM_DEFAULT);
+	for_each_memblock(reserved, reg) {
+		unsigned long top = reg->base + reg->size - 1;
+		if (top < lowmem_end_addr)
+			reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
+		else if (reg->base < lowmem_end_addr) {
+			unsigned long trunc_size = lowmem_end_addr - reg->base;
+			reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT);
 		}
 	}
 #else
 	free_bootmem_with_active_regions(0, max_pfn);
 
 	/* reserve the sections we're already using */
-	for (i = 0; i < lmb.reserved.cnt; i++)
-		reserve_bootmem(lmb.reserved.region[i].base,
-				lmb_size_bytes(&lmb.reserved, i),
-				BOOTMEM_DEFAULT);
-
+	for_each_memblock(reserved, reg)
+		reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
 #endif
 	/* XXX need to clip this if using highmem? */
 	sparse_memory_present_with_active_regions(0);
@@ -281,22 +248,15 @@ void __init do_init_bootmem(void)
 /* mark pages that don't exist as nosave */
 static int __init mark_nonram_nosave(void)
 {
-	unsigned long lmb_next_region_start_pfn,
-		      lmb_region_max_pfn;
-	int i;
-
-	for (i = 0; i < lmb.memory.cnt - 1; i++) {
-		lmb_region_max_pfn =
-			(lmb.memory.region[i].base >> PAGE_SHIFT) +
-			(lmb.memory.region[i].size >> PAGE_SHIFT);
-		lmb_next_region_start_pfn =
-			lmb.memory.region[i+1].base >> PAGE_SHIFT;
-
-		if (lmb_region_max_pfn < lmb_next_region_start_pfn)
-			register_nosave_region(lmb_region_max_pfn,
-					       lmb_next_region_start_pfn);
+	struct memblock_region *reg, *prev = NULL;
+
+	for_each_memblock(memory, reg) {
+		if (prev &&
+		    memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
+			register_nosave_region(memblock_region_memory_end_pfn(prev),
+					       memblock_region_memory_base_pfn(reg));
+		prev = reg;
 	}
-
 	return 0;
 }
 
@@ -305,28 +265,33 @@ static int __init mark_nonram_nosave(void)
  */
 void __init paging_init(void)
 {
-	unsigned long total_ram = lmb_phys_mem_size();
-	unsigned long top_of_ram = lmb_end_of_DRAM();
+	unsigned long long total_ram = memblock_phys_mem_size();
+	phys_addr_t top_of_ram = memblock_end_of_DRAM();
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 
+#ifdef CONFIG_PPC32
+	unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
+	unsigned long end = __fix_to_virt(FIX_HOLE);
+
+	for (; v < end; v += PAGE_SIZE)
+		map_page(v, 0, 0); /* XXX gross */
+#endif
+
 #ifdef CONFIG_HIGHMEM
 	map_page(PKMAP_BASE, 0, 0);	/* XXX gross */
-	pkmap_page_table = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k
-			(PKMAP_BASE), PKMAP_BASE), PKMAP_BASE), PKMAP_BASE);
-	map_page(KMAP_FIX_BEGIN, 0, 0);	/* XXX gross */
-	kmap_pte = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k
-			(KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), KMAP_FIX_BEGIN),
-			 KMAP_FIX_BEGIN);
+	pkmap_page_table = virt_to_kpte(PKMAP_BASE);
+
+	kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
 	kmap_prot = PAGE_KERNEL;
 #endif /* CONFIG_HIGHMEM */
 
-	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
-	       top_of_ram, total_ram);
+	printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n",
+	       (unsigned long long)top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
-	       (top_of_ram - total_ram) >> 20);
+	       (long int)((top_of_ram - total_ram) >> 20));
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
 	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
 #else
 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
@@ -337,80 +302,88 @@ void __init paging_init(void)
 }
 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */
 
-void __init mem_init(void)
+static void __init register_page_bootmem_info(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	int nid;
-#endif
-	pg_data_t *pgdat;
-	unsigned long i;
-	struct page *page;
-	unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+	int i;
 
-	num_physpages = lmb.memory.size >> PAGE_SHIFT;
-	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+	for_each_online_node(i)
+		register_page_bootmem_info_node(NODE_DATA(i));
+}
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-        for_each_online_node(nid) {
-		if (NODE_DATA(nid)->node_spanned_pages != 0) {
-			printk("freeing bootmem node %d\n", nid);
-			totalram_pages +=
-				free_all_bootmem_node(NODE_DATA(nid));
-		}
-	}
-#else
-	max_mapnr = max_pfn;
-	totalram_pages += free_all_bootmem();
+void __init mem_init(void)
+{
+	/*
+	 * book3s is limited to 16 page sizes due to encoding this in
+	 * a 4-bit field for slices.
+	 */
+	BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
+
+#ifdef CONFIG_SWIOTLB
+	swiotlb_init(0);
 #endif
-	for_each_online_pgdat(pgdat) {
-		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-			page = pgdat_page_nr(pgdat, i);
-			if (PageReserved(page))
-				reservedpages++;
-		}
-	}
 
-	codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
-	datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
-	initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
-	bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+	register_page_bootmem_info();
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+	set_max_mapnr(max_pfn);
+	free_all_bootmem();
 
 #ifdef CONFIG_HIGHMEM
 	{
 		unsigned long pfn, highmem_mapnr;
 
-		highmem_mapnr = total_lowmem >> PAGE_SHIFT;
+		highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
 		for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
+			phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
 			struct page *page = pfn_to_page(pfn);
-			if (lmb_is_reserved(pfn << PAGE_SHIFT))
-				continue;
-			ClearPageReserved(page);
-			init_page_count(page);
-			__free_page(page);
-			totalhigh_pages++;
-			reservedpages--;
+			if (!memblock_is_reserved(paddr))
+				free_highmem_page(page);
 		}
-		totalram_pages += totalhigh_pages;
-		printk(KERN_DEBUG "High memory: %luk\n",
-		       totalhigh_pages << (PAGE_SHIFT-10));
 	}
 #endif /* CONFIG_HIGHMEM */
 
-	printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
-	       "%luk reserved, %luk data, %luk bss, %luk init)\n",
-		(unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
-		num_physpages << (PAGE_SHIFT-10),
-		codesize >> 10,
-		reservedpages << (PAGE_SHIFT-10),
-		datasize >> 10,
-		bsssize >> 10,
-		initsize >> 10);
+#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP)
+	/*
+	 * If smp is enabled, next_tlbcam_idx is initialized in the cpu up
+	 * functions.... do it here for the non-smp case.
+	 */
+	per_cpu(next_tlbcam_idx, smp_processor_id()) =
+		(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
+#endif
+
+	mem_init_print_info(NULL);
+#ifdef CONFIG_PPC32
+	pr_info("Kernel virtual memory layout:\n");
+	pr_info("  * 0x%08lx..0x%08lx  : fixmap\n", FIXADDR_START, FIXADDR_TOP);
+#ifdef CONFIG_HIGHMEM
+	pr_info("  * 0x%08lx..0x%08lx  : highmem PTEs\n",
+		PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
+#endif /* CONFIG_HIGHMEM */
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	pr_info("  * 0x%08lx..0x%08lx  : consistent mem\n",
+		IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
+#endif /* CONFIG_NOT_COHERENT_CACHE */
+	pr_info("  * 0x%08lx..0x%08lx  : early ioremap\n",
+		ioremap_bot, IOREMAP_TOP);
+	pr_info("  * 0x%08lx..0x%08lx  : vmalloc & ioremap\n",
+		VMALLOC_START, VMALLOC_END);
+#endif /* CONFIG_PPC32 */
 
 	mem_init_done = 1;
 }
 
+void free_initmem(void)
+{
+	ppc_md.progress = ppc_printk_progress;
+	free_initmem_default(POISON_FREE_INITMEM);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
+}
+#endif
+
 /*
  * This is called when a page has been modified by the kernel.
  * It just marks the page as not i-cache clean.  We do the i-cache
@@ -428,24 +401,33 @@ EXPORT_SYMBOL(flush_dcache_page);
 
 void flush_dcache_icache_page(struct page *page)
 {
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageCompound(page)) {
+		flush_dcache_icache_hugepage(page);
+		return;
+	}
+#endif
 #ifdef CONFIG_BOOKE
-	void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
-	__flush_dcache_icache(start);
-	kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+	{
+		void *start = kmap_atomic(page);
+		__flush_dcache_icache(start);
+		kunmap_atomic(start);
+	}
 #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
 	/* On 8xx there is no need to kmap since highmem is not supported */
 	__flush_dcache_icache(page_address(page)); 
 #else
 	__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 #endif
-
 }
+EXPORT_SYMBOL(flush_dcache_icache_page);
+
 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
 {
 	clear_page(page);
 
 	/*
-	 * We shouldnt have to do this, but some versions of glibc
+	 * We shouldn't have to do this, but some versions of glibc
 	 * require it (ld.so assumes zero filled pages are icache clean)
 	 * - Anton
 	 */
@@ -496,46 +478,17 @@ EXPORT_SYMBOL(flush_icache_user_range);
  * This must always be called with the pte lock held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
-		      pte_t pte)
+		      pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
 	unsigned long access = 0, trap;
-#endif
-	unsigned long pfn = pte_pfn(pte);
-
-	/* handle i-cache coherency */
-	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
-	    !cpu_has_feature(CPU_FTR_NOEXECUTE) &&
-	    pfn_valid(pfn)) {
-		struct page *page = pfn_to_page(pfn);
-#ifdef CONFIG_8xx
-		/* On 8xx, cache control instructions (particularly
-		 * "dcbst" from flush_dcache_icache) fault as write
-		 * operation if there is an unpopulated TLB entry
-		 * for the address in question. To workaround that,
-		 * we invalidate the TLB here, thus avoiding dcbst
-		 * misbehaviour.
-		 */
-		_tlbie(address, 0 /* 8xx doesn't care about PID */);
-#endif
-		/* The _PAGE_USER test should really be _PAGE_EXEC, but
-		 * older glibc versions execute some code from no-exec
-		 * pages, which for now we are supporting.  If exec-only
-		 * pages are ever implemented, this will have to change.
-		 */
-		if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER)
-		    && !test_bit(PG_arch_1, &page->flags)) {
-			if (vma->vm_mm == current->active_mm) {
-				__flush_dcache_icache((void *) address);
-			} else
-				flush_dcache_icache_page(page);
-			set_bit(PG_arch_1, &page->flags);
-		}
-	}
 
-#ifdef CONFIG_PPC_STD_MMU
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
-	if (!pte_young(pte) || address >= TASK_SIZE)
+	if (!pte_young(*ptep) || address >= TASK_SIZE)
 		return;
 
 	/* We try to figure out if we are coming from an instruction
@@ -554,4 +507,58 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		return;
 	hash_preload(vma->vm_mm, address, access, trap);
 #endif /* CONFIG_PPC_STD_MMU */
+#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
+	&& defined(CONFIG_HUGETLB_PAGE)
+	if (is_vm_hugetlb_page(vma))
+		book3e_hugetlb_preload(vma, address, *ptep);
+#endif
+}
+
+/*
+ * System memory should not be in /proc/iomem but various tools expect it
+ * (eg kdump).
+ */
+static int __init add_system_ram_resources(void)
+{
+	struct memblock_region *reg;
+
+	for_each_memblock(memory, reg) {
+		struct resource *res;
+		unsigned long base = reg->base;
+		unsigned long size = reg->size;
+
+		res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+		WARN_ON(!res);
+
+		if (res) {
+			res->name = "System RAM";
+			res->start = base;
+			res->end = base + size - 1;
+			res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+			WARN_ON(request_resource(&iomem_resource, res) < 0);
+		}
+	}
+
+	return 0;
+}
+subsys_initcall(add_system_ram_resources);
+
+#ifdef CONFIG_STRICT_DEVMEM
+/*
+ * devmem_is_allowed(): check to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ * Access has to be given to non-kernel-ram areas as well, these contain the
+ * PCI mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pfn)
+{
+	if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+		return 0;
+	if (!page_is_ram(pfn))
+		return 1;
+	if (page_is_rtas_user_buf(pfn))
+		return 1;
+	return 0;
 }
+#endif /* CONFIG_STRICT_DEVMEM */
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 86010fc7d3b..cb8bdbe4972 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -24,43 +24,59 @@
 
 #include <linux/personality.h>
 #include <linux/mm.h>
+#include <linux/random.h>
 #include <linux/sched.h>
 
 /*
  * Top of mmap area (just below the process stack).
  *
- * Leave an at least ~128 MB hole.
+ * Leave at least a ~128 MB hole on 32bit applications.
+ *
+ * On 64bit applications we randomise the stack by 1GB so we need to
+ * space our mmap start address by a further 1GB, otherwise there is a
+ * chance the mmap area will end up closer to the stack than our ulimit
+ * requires.
  */
-#define MIN_GAP (128*1024*1024)
+#define MIN_GAP32 (128*1024*1024)
+#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
+#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
 #define MAX_GAP (TASK_SIZE/6*5)
 
-static inline unsigned long mmap_base(void)
+static inline int mmap_is_legacy(void)
 {
-	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
 
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+		return 1;
 
-	return TASK_SIZE - (gap & PAGE_MASK);
+	return sysctl_legacy_va_layout;
 }
 
-static inline int mmap_is_legacy(void)
+static unsigned long mmap_rnd(void)
 {
-	/*
-	 * Force standard allocation for 64 bit programs.
-	 */
-	if (!test_thread_flag(TIF_32BIT))
-		return 1;
+	unsigned long rnd = 0;
 
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
+	if (current->flags & PF_RANDOMIZE) {
+		/* 8MB for 32bit, 1GB for 64bit */
+		if (is_32bit_task())
+			rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+		else
+			rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+	}
+	return rnd << PAGE_SHIFT;
+}
 
-	if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
-		return 1;
+static inline unsigned long mmap_base(void)
+{
+	unsigned long gap = rlimit(RLIMIT_STACK);
 
-	return sysctl_legacy_va_layout;
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
 }
 
 /*
@@ -76,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c
deleted file mode 100644
index cc32ba41d90..00000000000
--- a/arch/powerpc/mm/mmu_context_32.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file contains the routines for handling the MMU on those
- * PowerPC implementations where the MMU substantially follows the
- * architecture specification.  This includes the 6xx, 7xx, 7xxx,
- * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
- *  -- paulus
- *
- *  Derived from arch/ppc/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-
-unsigned long next_mmu_context;
-unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-#ifdef FEW_CONTEXTS
-atomic_t nr_free_contexts;
-struct mm_struct *context_mm[LAST_CONTEXT+1];
-void steal_context(void);
-#endif /* FEW_CONTEXTS */
-
-/*
- * Initialize the context management stuff.
- */
-void __init
-mmu_context_init(void)
-{
-	/*
-	 * Some processors have too few contexts to reserve one for
-	 * init_mm, and require using context 0 for a normal task.
-	 * Other processors reserve the use of context zero for the kernel.
-	 * This code assumes FIRST_CONTEXT < 32.
-	 */
-	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_mmu_context = FIRST_CONTEXT;
-#ifdef FEW_CONTEXTS
-	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
-#endif /* FEW_CONTEXTS */
-}
-
-#ifdef FEW_CONTEXTS
-/*
- * Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP.  If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
- * This isn't an LRU system, it just frees up each context in
- * turn (sort-of pseudo-random replacement :).  This would be the
- * place to implement an LRU scheme if anyone was motivated to do it.
- *  -- paulus
- */
-void
-steal_context(void)
-{
-	struct mm_struct *mm;
-
-	/* free up context `next_mmu_context' */
-	/* if we shouldn't free context 0, don't... */
-	if (next_mmu_context < FIRST_CONTEXT)
-		next_mmu_context = FIRST_CONTEXT;
-	mm = context_mm[next_mmu_context];
-	flush_tlb_mm(mm);
-	destroy_context(mm);
-}
-#endif /* FEW_CONTEXTS */
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
deleted file mode 100644
index 1db38ba1f54..00000000000
--- a/arch/powerpc/mm/mmu_context_64.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- *  MMU context allocation for 64-bit kernels.
- *
- *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-
-#include <asm/mmu_context.h>
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDR(mmu_context_idr);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	int index;
-	int err;
-
-again:
-	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
-		return -ENOMEM;
-
-	spin_lock(&mmu_context_lock);
-	err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
-	spin_unlock(&mmu_context_lock);
-
-	if (err == -EAGAIN)
-		goto again;
-	else if (err)
-		return err;
-
-	if (index > MAX_CONTEXT) {
-		spin_lock(&mmu_context_lock);
-		idr_remove(&mmu_context_idr, index);
-		spin_unlock(&mmu_context_lock);
-		return -ENOMEM;
-	}
-
-	/* The old code would re-promote on fork, we don't do that
-	 * when using slices as it could cause problem promoting slices
-	 * that have been forced down to 4K
-	 */
-	if (slice_mm_new_context(mm))
-		slice_set_user_psize(mm, mmu_virtual_psize);
-	mm->context.id = index;
-
-	return 0;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
-	spin_lock(&mmu_context_lock);
-	idr_remove(&mmu_context_idr, mm->context.id);
-	spin_unlock(&mmu_context_lock);
-
-	mm->context.id = NO_CONTEXT;
-}
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
new file mode 100644
index 00000000000..78fef6726e1
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -0,0 +1,119 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification.  This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/export.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+/*
+ * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
+ * (virtual segment identifiers) for each context.  Although the
+ * hardware supports 24-bit VSIDs, and thus >1 million contexts,
+ * we only use 32,768 of them.  That is ample, since there can be
+ * at most around 30,000 tasks in the system anyway, and it means
+ * that we can use a bitmap to indicate which contexts are in use.
+ * Using a bitmap means that we entirely avoid all of the problems
+ * that we used to have when the context number overflowed,
+ * particularly on SMP systems.
+ *  -- paulus.
+ */
+#define NO_CONTEXT      	((unsigned long) -1)
+#define LAST_CONTEXT    	32767
+#define FIRST_CONTEXT    	1
+
+/*
+ * This function defines the mapping from contexts to VSIDs (virtual
+ * segment IDs).  We use a skew on both the context and the high 4 bits
+ * of the 32-bit virtual address (the "effective segment ID") in order
+ * to spread out the entries in the MMU hash table.  Note, if this
+ * function is changed then arch/ppc/mm/hashtable.S will have to be
+ * changed to correspond.
+ *
+ *
+ * CTX_TO_VSID(ctx, va)	(((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
+ *				 & 0xffffff)
+ */
+
+static unsigned long next_mmu_context;
+static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+
+unsigned long __init_new_context(void)
+{
+	unsigned long ctx = next_mmu_context;
+
+	while (test_and_set_bit(ctx, context_map)) {
+		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
+		if (ctx > LAST_CONTEXT)
+			ctx = 0;
+	}
+	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+	mm->context.id = __init_new_context();
+
+	return 0;
+}
+
+/*
+ * Free a context ID. Make sure to call this with preempt disabled!
+ */
+void __destroy_context(unsigned long ctx)
+{
+	clear_bit(ctx, context_map);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	preempt_disable();
+	if (mm->context.id != NO_CONTEXT) {
+		__destroy_context(mm->context.id);
+		mm->context.id = NO_CONTEXT;
+	}
+	preempt_enable();
+}
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/* Reserve context 0 for kernel use */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_mmu_context = FIRST_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
new file mode 100644
index 00000000000..178876aef40
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -0,0 +1,146 @@
+/*
+ *  MMU context allocation for 64-bit kernels.
+ *
+ *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+#include "icswx.h"
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDA(mmu_context_ida);
+
+int __init_new_context(void)
+{
+	int index;
+	int err;
+
+again:
+	if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(&mmu_context_lock);
+	err = ida_get_new_above(&mmu_context_ida, 1, &index);
+	spin_unlock(&mmu_context_lock);
+
+	if (err == -EAGAIN)
+		goto again;
+	else if (err)
+		return err;
+
+	if (index > MAX_USER_CONTEXT) {
+		spin_lock(&mmu_context_lock);
+		ida_remove(&mmu_context_ida, index);
+		spin_unlock(&mmu_context_lock);
+		return -ENOMEM;
+	}
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	int index;
+
+	index = __init_new_context();
+	if (index < 0)
+		return index;
+
+	/* The old code would re-promote on fork, we don't do that
+	 * when using slices as it could cause problem promoting slices
+	 * that have been forced down to 4K
+	 */
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+	subpage_prot_init_new_context(mm);
+	mm->context.id = index;
+#ifdef CONFIG_PPC_ICSWX
+	mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+	if (!mm->context.cop_lockp) {
+		__destroy_context(index);
+		subpage_prot_free(mm);
+		mm->context.id = MMU_NO_CONTEXT;
+		return -ENOMEM;
+	}
+	spin_lock_init(mm->context.cop_lockp);
+#endif /* CONFIG_PPC_ICSWX */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	mm->context.pte_frag = NULL;
+#endif
+	return 0;
+}
+
+void __destroy_context(int context_id)
+{
+	spin_lock(&mmu_context_lock);
+	ida_remove(&mmu_context_ida, context_id);
+	spin_unlock(&mmu_context_lock);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+#ifdef CONFIG_PPC_64K_PAGES
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+	int count;
+	void *pte_frag;
+	struct page *page;
+
+	pte_frag = mm->context.pte_frag;
+	if (!pte_frag)
+		return;
+
+	page = virt_to_page(pte_frag);
+	/* drop all the pending references */
+	count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+	/* We allow PTE_FRAG_NR fragments from a PTE page */
+	count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
+	if (!count) {
+		pgtable_page_dtor(page);
+		free_hot_cold_page(page, 0);
+	}
+}
+
+#else
+static inline void destroy_pagetable_page(struct mm_struct *mm)
+{
+	return;
+}
+#endif
+
+
+void destroy_context(struct mm_struct *mm)
+{
+
+#ifdef CONFIG_PPC_ICSWX
+	drop_cop(mm->context.acop, mm);
+	kfree(mm->context.cop_lockp);
+	mm->context.cop_lockp = NULL;
+#endif /* CONFIG_PPC_ICSWX */
+
+	destroy_pagetable_page(mm);
+	__destroy_context(mm->context.id);
+	subpage_prot_free(mm);
+	mm->context.id = MMU_NO_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
new file mode 100644
index 00000000000..928ebe79668
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -0,0 +1,449 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU is not using the hash
+ * table, such as 8xx, 4xx, BookE's etc...
+ *
+ * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                IBM Corp.
+ *
+ *  Derived from previous arch/powerpc/mm/mmu_context.c
+ *  and arch/powerpc/include/asm/mmu_context.h
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ * TODO:
+ *
+ *   - The global context lock will not scale very well
+ *   - The maps should be dynamically allocated to allow for processors
+ *     that support more PID bits at runtime
+ *   - Implement flush_tlb_mm() by making the context stale and picking
+ *     a new one
+ *   - More aggressively clear stale map bits and maybe find some way to
+ *     also clear mm->cpu_vm_mask bits when processes are migrated
+ */
+
+//#define DEBUG_MAP_CONSISTENCY
+//#define DEBUG_CLAMP_LAST_CONTEXT   31
+//#define DEBUG_HARDER
+
+/* We don't use DEBUG because it tends to be compiled in always nowadays
+ * and this would generate way too much output
+ */
+#ifdef DEBUG_HARDER
+#define pr_hard(args...)	printk(KERN_DEBUG args)
+#define pr_hardcont(args...)	printk(KERN_CONT args)
+#else
+#define pr_hard(args...)	do { } while(0)
+#define pr_hardcont(args...)	do { } while(0)
+#endif
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+static unsigned int first_context, last_context;
+static unsigned int next_context, nr_free_contexts;
+static unsigned long *context_map;
+static unsigned long *stale_map[NR_CPUS];
+static struct mm_struct **context_mm;
+static DEFINE_RAW_SPINLOCK(context_lock);
+
+#define CTX_MAP_SIZE	\
+	(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+
+
+/* Steal a context from a task that has one at the moment.
+ *
+ * This is used when we are running out of available PID numbers
+ * on the processors.
+ *
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :).  This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ *  -- paulus
+ *
+ * For context stealing, we use a slightly different approach for
+ * SMP and UP. Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ *  -- benh
+ */
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
+{
+	struct mm_struct *mm;
+	unsigned int cpu, max, i;
+
+	max = last_context - first_context;
+
+	/* Attempt to free next_context first and then loop until we manage */
+	while (max--) {
+		/* Pick up the victim mm */
+		mm = context_mm[id];
+
+		/* We have a candidate victim, check if it's active, on SMP
+		 * we cannot steal active contexts
+		 */
+		if (mm->context.active) {
+			id++;
+			if (id > last_context)
+				id = first_context;
+			continue;
+		}
+		pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+		/* Mark this mm has having no context anymore */
+		mm->context.id = MMU_NO_CONTEXT;
+
+		/* Mark it stale on all CPUs that used this mm. For threaded
+		 * implementations, we set it on all threads on each core
+		 * represented in the mask. A future implementation will use
+		 * a core map instead but this will do for now.
+		 */
+		for_each_cpu(cpu, mm_cpumask(mm)) {
+			for (i = cpu_first_thread_sibling(cpu);
+			     i <= cpu_last_thread_sibling(cpu); i++) {
+				if (stale_map[i])
+					__set_bit(id, stale_map[i]);
+			}
+			cpu = i - 1;
+		}
+		return id;
+	}
+
+	/* This will happen if you have more CPUs than available contexts,
+	 * all we can do here is wait a bit and try again
+	 */
+	raw_spin_unlock(&context_lock);
+	cpu_relax();
+	raw_spin_lock(&context_lock);
+
+	/* This will cause the caller to try again */
+	return MMU_NO_CONTEXT;
+}
+#endif  /* CONFIG_SMP */
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
+ */
+static unsigned int steal_context_up(unsigned int id)
+{
+	struct mm_struct *mm;
+	int cpu = smp_processor_id();
+
+	/* Pick up the victim mm */
+	mm = context_mm[id];
+
+	pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+	/* Flush the TLB for that context */
+	local_flush_tlb_mm(mm);
+
+	/* Mark this mm has having no context anymore */
+	mm->context.id = MMU_NO_CONTEXT;
+
+	/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+	__clear_bit(id, stale_map[cpu]);
+
+	return id;
+}
+
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+	unsigned int id, nrf, nact;
+
+	nrf = nact = 0;
+	for (id = first_context; id <= last_context; id++) {
+		int used = test_bit(id, context_map);
+		if (!used)
+			nrf++;
+		if (used != (context_mm[id] != NULL))
+			pr_err("MMU: Context %d is %s and MM is %p !\n",
+			       id, used ? "used" : "free", context_mm[id]);
+		if (context_mm[id] != NULL)
+			nact += context_mm[id]->context.active;
+	}
+	if (nrf != nr_free_contexts) {
+		pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
+		       nr_free_contexts, nrf);
+		nr_free_contexts = nrf;
+	}
+	if (nact > num_online_cpus())
+		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
+		       nact, num_online_cpus());
+	if (first_context > 0 && !test_bit(0, context_map))
+		pr_err("MMU: Context 0 has been freed !!!\n");
+}
+#else
+static void context_check_map(void) { }
+#endif
+
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+	unsigned int i, id, cpu = smp_processor_id();
+	unsigned long *map;
+
+	/* No lockless fast path .. yet */
+	raw_spin_lock(&context_lock);
+
+	pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
+		cpu, next, next->context.active, next->context.id);
+
+#ifdef CONFIG_SMP
+	/* Mark us active and the previous one not anymore */
+	next->context.active++;
+	if (prev) {
+		pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
+		WARN_ON(prev->context.active < 1);
+		prev->context.active--;
+	}
+
+ again:
+#endif /* CONFIG_SMP */
+
+	/* If we already have a valid assigned context, skip all that */
+	id = next->context.id;
+	if (likely(id != MMU_NO_CONTEXT)) {
+#ifdef DEBUG_MAP_CONSISTENCY
+		if (context_mm[id] != next)
+			pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
+			       next, id, id, context_mm[id]);
+#endif
+		goto ctxt_ok;
+	}
+
+	/* We really don't have a context, let's try to acquire one */
+	id = next_context;
+	if (id > last_context)
+		id = first_context;
+	map = context_map;
+
+	/* No more free contexts, let's try to steal one */
+	if (nr_free_contexts == 0) {
+#ifdef CONFIG_SMP
+		if (num_online_cpus() > 1) {
+			id = steal_context_smp(id);
+			if (id == MMU_NO_CONTEXT)
+				goto again;
+			goto stolen;
+		}
+#endif /* CONFIG_SMP */
+		id = steal_context_up(id);
+		goto stolen;
+	}
+	nr_free_contexts--;
+
+	/* We know there's at least one free context, try to find it */
+	while (__test_and_set_bit(id, map)) {
+		id = find_next_zero_bit(map, last_context+1, id);
+		if (id > last_context)
+			id = first_context;
+	}
+ stolen:
+	next_context = id + 1;
+	context_mm[id] = next;
+	next->context.id = id;
+	pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
+
+	context_check_map();
+ ctxt_ok:
+
+	/* If that context got marked stale on this CPU, then flush the
+	 * local TLB for it and unmark it before we use it
+	 */
+	if (test_bit(id, stale_map[cpu])) {
+		pr_hardcont(" | stale flush %d [%d..%d]",
+			    id, cpu_first_thread_sibling(cpu),
+			    cpu_last_thread_sibling(cpu));
+
+		local_flush_tlb_mm(next);
+
+		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+		for (i = cpu_first_thread_sibling(cpu);
+		     i <= cpu_last_thread_sibling(cpu); i++) {
+			if (stale_map[i])
+				__clear_bit(id, stale_map[i]);
+		}
+	}
+
+	/* Flick the MMU and release lock */
+	pr_hardcont(" -> %d\n", id);
+	set_context(id, next->pgd);
+	raw_spin_unlock(&context_lock);
+}
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+	pr_hard("initing context for mm @%p\n", mm);
+
+	mm->context.id = MMU_NO_CONTEXT;
+	mm->context.active = 0;
+
+#ifdef CONFIG_PPC_MM_SLICES
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+#endif
+
+	return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	unsigned long flags;
+	unsigned int id;
+
+	if (mm->context.id == MMU_NO_CONTEXT)
+		return;
+
+	WARN_ON(mm->context.active != 0);
+
+	raw_spin_lock_irqsave(&context_lock, flags);
+	id = mm->context.id;
+	if (id != MMU_NO_CONTEXT) {
+		__clear_bit(id, context_map);
+		mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+		mm->context.active = 0;
+#endif
+		context_mm[id] = NULL;
+		nr_free_contexts++;
+	}
+	raw_spin_unlock_irqrestore(&context_lock, flags);
+}
+
+#ifdef CONFIG_SMP
+
+static int mmu_context_cpu_notify(struct notifier_block *self,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+
+	/* We don't touch CPU 0 map, it's allocated at aboot and kept
+	 * around forever
+	 */
+	if (cpu == boot_cpuid)
+		return NOTIFY_OK;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
+		stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
+		kfree(stale_map[cpu]);
+		stale_map[cpu] = NULL;
+
+		/* We also clear the cpu_vm_mask bits of CPUs going away */
+		clear_tasks_mm_cpumask(cpu);
+	break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block mmu_context_cpu_nb = {
+	.notifier_call	= mmu_context_cpu_notify,
+};
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/* Mark init_mm as being active on all possible CPUs since
+	 * we'll get called with prev == init_mm the first time
+	 * we schedule on a given CPU
+	 */
+	init_mm.context.active = NR_CPUS;
+
+	/*
+	 *   The MPC8xx has only 16 contexts.  We rotate through them on each
+	 * task switch.  A better way would be to keep track of tasks that
+	 * own contexts, and implement an LRU usage.  That way very active
+	 * tasks don't always have to pay the TLB reload overhead.  The
+	 * kernel pages are mapped shared, so the kernel can run on behalf
+	 * of any task that makes a kernel entry.  Shared does not mean they
+	 * are not protected, just that the ASID comparison is not performed.
+	 *      -- Dan
+	 *
+	 * The IBM4xx has 256 contexts, so we can just rotate through these
+	 * as a way of "switching" contexts.  If the TID of the TLB is zero,
+	 * the PID/TID comparison is disabled, so we can use a TID of zero
+	 * to represent all kernel pages as shared among all contexts.
+	 * 	-- Dan
+	 *
+	 * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We
+	 * should normally never have to steal though the facility is
+	 * present if needed.
+	 *      -- BenH
+	 */
+	if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
+		first_context = 0;
+		last_context = 15;
+	} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
+		first_context = 1;
+		last_context = 65535;
+	} else {
+		first_context = 1;
+		last_context = 255;
+	}
+
+#ifdef DEBUG_CLAMP_LAST_CONTEXT
+	last_context = DEBUG_CLAMP_LAST_CONTEXT;
+#endif
+	/*
+	 * Allocate the maps used by context management
+	 */
+	context_map = alloc_bootmem(CTX_MAP_SIZE);
+	context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
+#ifndef CONFIG_SMP
+	stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
+#else
+	stale_map[boot_cpuid] = alloc_bootmem(CTX_MAP_SIZE);
+
+	register_cpu_notifier(&mmu_context_cpu_nb);
+#endif
+
+	printk(KERN_INFO
+	       "MMU: Allocated %zu bytes of context maps for %d contexts\n",
+	       2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
+	       last_context - first_context + 1);
+
+	/*
+	 * Some processors have too few contexts to reserve one for
+	 * init_mm, and require using context 0 for a normal task.
+	 * Other processors reserve the use of context zero for the kernel.
+	 * This code assumes first_context < 32.
+	 */
+	context_map[0] = (1 << first_context) - 1;
+	next_context = first_context;
+	nr_free_contexts = last_context - first_context + 1;
+}
+
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index ebfd13dc9d1..9615d82919b 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -22,20 +22,84 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_PPC_MMU_NOHASH
+
+/*
+ * On 40x and 8xx, we directly inline tlbia and tlbivax
+ */
+#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+static inline void _tlbil_all(void)
+{
+	asm volatile ("sync; tlbia; isync" : : : "memory");
+}
+static inline void _tlbil_pid(unsigned int pid)
+{
+	asm volatile ("sync; tlbia; isync" : : : "memory");
+}
+#define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
+
+#else /* CONFIG_40x || CONFIG_8xx */
+extern void _tlbil_all(void);
+extern void _tlbil_pid(unsigned int pid);
+#ifdef CONFIG_PPC_BOOK3E
+extern void _tlbil_pid_noind(unsigned int pid);
+#else
+#define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
+#endif
+#endif /* !(CONFIG_40x || CONFIG_8xx) */
+
+/*
+ * On 8xx, we directly inline tlbie, on others, it's extern
+ */
+#ifdef CONFIG_8xx
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+			     unsigned int tsize, unsigned int ind)
+{
+	asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
+}
+#elif defined(CONFIG_PPC_BOOK3E)
+extern void _tlbil_va(unsigned long address, unsigned int pid,
+		      unsigned int tsize, unsigned int ind);
+#else
+extern void __tlbil_va(unsigned long address, unsigned int pid);
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+			     unsigned int tsize, unsigned int ind)
+{
+	__tlbil_va(address, pid);
+}
+#endif /* CONIFG_8xx */
+
+#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
+extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
+			   unsigned int tsize, unsigned int ind);
+#else
+static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
+				   unsigned int tsize, unsigned int ind)
+{
+	BUG();
+}
+#endif
+
+#else /* CONFIG_PPC_MMU_NOHASH */
+
 extern void hash_preload(struct mm_struct *mm, unsigned long ea,
 			 unsigned long access, unsigned long trap);
 
 
+extern void _tlbie(unsigned long address);
+extern void _tlbia(void);
+
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
 #ifdef CONFIG_PPC32
+
 extern void mapin_ram(void);
 extern int map_page(unsigned long va, phys_addr_t pa, int flags);
-extern void setbat(int index, unsigned long virt, unsigned long phys,
+extern void setbat(int index, unsigned long virt, phys_addr_t phys,
 		   unsigned int size, int flags);
-extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
-		      unsigned int size, int flags, unsigned int pid);
-extern void invalidate_tlbcam_entry(int index);
 
 extern int __map_without_bats;
+extern int __allow_ioremap_reserved;
 extern unsigned long ioremap_base;
 extern unsigned int rtas_data, rtas_size;
 
@@ -43,50 +107,61 @@ struct hash_pte;
 extern struct hash_pte *Hash, *Hash_end;
 extern unsigned long Hash_size, Hash_mask;
 
-extern unsigned int num_tlbcam_entries;
-#endif
+#endif /* CONFIG_PPC32 */
+
+#ifdef CONFIG_PPC64
+extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
+#endif /* CONFIG_PPC64 */
 
 extern unsigned long ioremap_bot;
 extern unsigned long __max_low_memory;
-extern unsigned long __initial_memory_limit;
-extern unsigned long total_memory;
-extern unsigned long total_lowmem;
+extern phys_addr_t __initial_memory_limit_addr;
+extern phys_addr_t total_memory;
+extern phys_addr_t total_lowmem;
+extern phys_addr_t memstart_addr;
+extern phys_addr_t lowmem_end_addr;
+
+#ifdef CONFIG_WII
+extern unsigned long wii_hole_start;
+extern unsigned long wii_hole_size;
+
+extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
+extern void wii_memory_fixups(void);
+#endif
 
 /* ...and now those things that may be slightly different between processor
  * architectures.  -- Dan
  */
 #if defined(CONFIG_8xx)
-#define flush_HPTE(X, va, pg)	_tlbie(va, 0 /* 8xx doesn't care about PID */)
 #define MMU_init_hw()		do { } while(0)
-#define mmu_mapin_ram()		(0UL)
+#define mmu_mapin_ram(top)	(0UL)
 
 #elif defined(CONFIG_4xx)
-#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
 extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(void);
+extern unsigned long mmu_mapin_ram(unsigned long top);
 
-#elif defined(CONFIG_FSL_BOOKE)
-#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
+#elif defined(CONFIG_PPC_FSL_BOOK3E)
+extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx);
+extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+				 phys_addr_t phys);
+#ifdef CONFIG_PPC32
 extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(void);
+extern unsigned long mmu_mapin_ram(unsigned long top);
 extern void adjust_total_lowmem(void);
+extern int switch_to_as1(void);
+extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
+#endif
+extern void loadcam_entry(unsigned int index);
 
+struct tlbcam {
+	u32	MAS0;
+	u32	MAS1;
+	unsigned long	MAS2;
+	u32	MAS3;
+	u32	MAS7;
+};
 #elif defined(CONFIG_PPC32)
 /* anything 32-bit except 4xx or 8xx */
 extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(void);
-
-/* Be careful....this needs to be updated if we ever encounter 603 SMPs,
- * which includes all new 82xx processors.  We need tlbie/tlbsync here
- * in that case (I think). -- Dan.
- */
-static inline void flush_HPTE(unsigned context, unsigned long va,
-			      unsigned long pdval)
-{
-	if ((Hash != 0) &&
-	    cpu_has_feature(CPU_FTR_HPTE_TABLE))
-		flush_hash_pages(0, va, pdval, 1);
-	else
-		_tlbie(va);
-}
+extern unsigned long mmu_mapin_ram(unsigned long top);
 #endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a300d254aac..3b181b22cd4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -13,14 +13,31 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/pfn.h>
+#include <linux/cpuset.h>
+#include <linux/node.h>
+#include <linux/stop_machine.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <asm/cputhreads.h>
 #include <asm/sparsemem.h>
-#include <asm/lmb.h>
-#include <asm/system.h>
+#include <asm/prom.h>
 #include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/topology.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+#include <asm/hvcall.h>
+#include <asm/setup.h>
+#include <asm/vdso.h>
 
 static int numa_enabled = 1;
 
@@ -30,18 +47,45 @@ static int numa_debug;
 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
 
 int numa_cpu_lookup_table[NR_CPUS];
-cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 struct pglist_data *node_data[MAX_NUMNODES];
 
 EXPORT_SYMBOL(numa_cpu_lookup_table);
-EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
 
-static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const __be32 *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: cpumask_of_node() is not valid until after this is done.
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES)
+		setup_nr_node_ids();
+
+	/* allocate the map */
+	for (node = 0; node < nr_node_ids; node++)
+		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+
+	/* cpumask_of_node() will now work */
+	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+}
+
+static int __init fake_numa_create_new_node(unsigned long end_pfn,
 						unsigned int *nid)
 {
 	unsigned long long mem;
@@ -88,92 +132,163 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
 	return 0;
 }
 
-static void __cpuinit map_cpu_to_node(int cpu, int node)
+/*
+ * get_node_active_region - Return active region containing pfn
+ * Active range returned is empty if none found.
+ * @pfn: The page to return the region for
+ * @node_ar: Returned set to the active region containing @pfn
+ */
+static void __init get_node_active_region(unsigned long pfn,
+					  struct node_active_region *node_ar)
+{
+	unsigned long start_pfn, end_pfn;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		if (pfn >= start_pfn && pfn < end_pfn) {
+			node_ar->nid = nid;
+			node_ar->start_pfn = start_pfn;
+			node_ar->end_pfn = end_pfn;
+			break;
+		}
+	}
+}
+
+static void reset_numa_cpu_lookup_table(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		numa_cpu_lookup_table[cpu] = -1;
+}
+
+static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
+}
+
+static void map_cpu_to_node(int cpu, int node)
+{
+	update_numa_cpu_lookup_table(cpu, node);
 
 	dbg("adding cpu %d to node %d\n", cpu, node);
 
-	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
-		cpu_set(cpu, numa_cpumask_lookup_table[node]);
+	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
+		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
 static void unmap_cpu_from_node(unsigned long cpu)
 {
 	int node = numa_cpu_lookup_table[cpu];
 
 	dbg("removing cpu %lu from node %d\n", cpu, node);
 
-	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
-		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
 	} else {
 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 		       cpu, node);
 	}
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
+
+/* must hold reference to node during call */
+static const __be32 *of_get_associativity(struct device_node *dev)
+{
+	return of_get_property(dev, "ibm,associativity", NULL);
+}
 
-static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
+/*
+ * Returns the property linux,drconf-usable-memory if
+ * it exists (the property exists only in kexec/kdump kernels,
+ * added by kexec-tools)
+ */
+static const __be32 *of_get_usable_memory(struct device_node *memory)
 {
-	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
-	struct device_node *cpu_node = NULL;
-	const unsigned int *interrupt_server, *reg;
-	int len;
+	const __be32 *prop;
+	u32 len;
+	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
+	if (!prop || len < sizeof(unsigned int))
+		return NULL;
+	return prop;
+}
 
-	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
-		/* Try interrupt server first */
-		interrupt_server = of_get_property(cpu_node,
-					"ibm,ppc-interrupt-server#s", &len);
+int __node_distance(int a, int b)
+{
+	int i;
+	int distance = LOCAL_DISTANCE;
 
-		len = len / sizeof(u32);
+	if (!form1_affinity)
+		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
-		if (interrupt_server && (len > 0)) {
-			while (len--) {
-				if (interrupt_server[len] == hw_cpuid)
-					return cpu_node;
-			}
-		} else {
-			reg = of_get_property(cpu_node, "reg", &len);
-			if (reg && (len > 0) && (reg[0] == hw_cpuid))
-				return cpu_node;
-		}
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+			break;
+
+		/* Double the distance for each NUMA level */
+		distance *= 2;
 	}
 
-	return NULL;
+	return distance;
 }
+EXPORT_SYMBOL(__node_distance);
 
-/* must hold reference to node during call */
-static const int *of_get_associativity(struct device_node *dev)
+static void initialize_distance_lookup_table(int nid,
+		const __be32 *associativity)
 {
-	return of_get_property(dev, "ibm,associativity", NULL);
+	int i;
+
+	if (!form1_affinity)
+		return;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		const __be32 *entry;
+
+		entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+		distance_lookup_table[nid][i] = of_read_number(entry, 1);
+	}
 }
 
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
-static int of_node_to_nid_single(struct device_node *device)
+static int associativity_to_nid(const __be32 *associativity)
 {
 	int nid = -1;
-	const unsigned int *tmp;
 
 	if (min_common_depth == -1)
 		goto out;
 
-	tmp = of_get_associativity(device);
-	if (!tmp)
-		goto out;
-
-	if (tmp[0] >= min_common_depth)
-		nid = tmp[min_common_depth];
+	if (of_read_number(associativity, 1) >= min_common_depth)
+		nid = of_read_number(&associativity[min_common_depth], 1);
 
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= MAX_NUMNODES)
 		nid = -1;
+
+	if (nid > 0 &&
+	    of_read_number(associativity, 1) >= distance_ref_points_depth)
+		initialize_distance_lookup_table(nid, associativity);
+
 out:
 	return nid;
 }
 
+/* Returns the nid associated with the given device tree node,
+ * or -1 if not found.
+ */
+static int of_node_to_nid_single(struct device_node *device)
+{
+	int nid = -1;
+	const __be32 *tmp;
+
+	tmp = of_get_associativity(device);
+	if (tmp)
+		nid = associativity_to_nid(tmp);
+	return nid;
+}
+
 /* Walk the device tree upwards, looking for an associativity id */
 int of_node_to_nid(struct device_node *device)
 {
@@ -196,50 +311,75 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL_GPL(of_node_to_nid);
 
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine.  This resource then has different associativity
- * characteristics relative to its multiple connections.  We ignore
- * this for now.  We also assume that all cpu and memory sets have
- * their distances represented at a common level.  This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
 static int __init find_min_common_depth(void)
 {
 	int depth;
-	const unsigned int *ref_points;
-	struct device_node *rtas_root;
-	unsigned int len;
-
-	rtas_root = of_find_node_by_path("/rtas");
+	struct device_node *root;
 
-	if (!rtas_root)
-		return -1;
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		root = of_find_node_by_path("/ibm,opal");
+	else
+		root = of_find_node_by_path("/rtas");
+	if (!root)
+		root = of_find_node_by_path("/");
 
 	/*
-	 * this property is 2 32-bit integers, each representing a level of
-	 * depth in the associativity nodes.  The first is for an SMP
-	 * configuration (should be all 0's) and the second is for a normal
-	 * NUMA configuration.
+	 * This property is a set of 32-bit integers, each representing
+	 * an index into the ibm,associativity nodes.
+	 *
+	 * With form 0 affinity the first integer is for an SMP configuration
+	 * (should be all 0's) and the second is for a normal NUMA
+	 * configuration. We have only one level of NUMA.
+	 *
+	 * With form 1 affinity the first integer is the most significant
+	 * NUMA boundary and the following are progressively less significant
+	 * boundaries. There can be more than one level of NUMA.
 	 */
-	ref_points = of_get_property(rtas_root,
-			"ibm,associativity-reference-points", &len);
+	distance_ref_points = of_get_property(root,
+					"ibm,associativity-reference-points",
+					&distance_ref_points_depth);
 
-	if ((len >= 1) && ref_points) {
-		depth = ref_points[1];
-	} else {
+	if (!distance_ref_points) {
 		dbg("NUMA: ibm,associativity-reference-points not found.\n");
-		depth = -1;
+		goto err;
+	}
+
+	distance_ref_points_depth /= sizeof(int);
+
+	if (firmware_has_feature(FW_FEATURE_OPAL) ||
+	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
+		dbg("Using form 1 affinity\n");
+		form1_affinity = 1;
+	}
+
+	if (form1_affinity) {
+		depth = of_read_number(distance_ref_points, 1);
+	} else {
+		if (distance_ref_points_depth < 2) {
+			printk(KERN_WARNING "NUMA: "
+				"short ibm,associativity-reference-points\n");
+			goto err;
+		}
+
+		depth = of_read_number(&distance_ref_points[1], 1);
 	}
-	of_node_put(rtas_root);
 
+	/*
+	 * Warn and cap if the hardware supports more than
+	 * MAX_DISTANCE_REF_POINTS domains.
+	 */
+	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+		printk(KERN_WARNING "NUMA: distance array capped at "
+			"%d entries\n", MAX_DISTANCE_REF_POINTS);
+		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+	}
+
+	of_node_put(root);
 	return depth;
+
+err:
+	of_node_put(root);
+	return -1;
 }
 
 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
@@ -255,35 +395,174 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
 	of_node_put(memory);
 }
 
-static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
+static unsigned long read_n_cells(int n, const __be32 **buf)
 {
 	unsigned long result = 0;
 
 	while (n--) {
-		result = (result << 32) | **buf;
+		result = (result << 32) | of_read_number(*buf, 1);
 		(*buf)++;
 	}
 	return result;
 }
 
 /*
+ * Read the next memblock list entry from the ibm,dynamic-memory property
+ * and return the information in the provided of_drconf_cell structure.
+ */
+static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
+{
+	const __be32 *cp;
+
+	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
+
+	cp = *cellp;
+	drmem->drc_index = of_read_number(cp, 1);
+	drmem->reserved = of_read_number(&cp[1], 1);
+	drmem->aa_index = of_read_number(&cp[2], 1);
+	drmem->flags = of_read_number(&cp[3], 1);
+
+	*cellp = cp + 4;
+}
+
+/*
+ * Retrieve and validate the ibm,dynamic-memory property of the device tree.
+ *
+ * The layout of the ibm,dynamic-memory property is a number N of memblock
+ * list entries followed by N memblock list entries.  Each memblock list entry
+ * contains information as laid out in the of_drconf_cell struct above.
+ */
+static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
+{
+	const __be32 *prop;
+	u32 len, entries;
+
+	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
+	if (!prop || len < sizeof(unsigned int))
+		return 0;
+
+	entries = of_read_number(prop++, 1);
+
+	/* Now that we know the number of entries, revalidate the size
+	 * of the property read in to ensure we have everything
+	 */
+	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
+		return 0;
+
+	*dm = prop;
+	return entries;
+}
+
+/*
+ * Retrieve and validate the ibm,lmb-size property for drconf memory
+ * from the device tree.
+ */
+static u64 of_get_lmb_size(struct device_node *memory)
+{
+	const __be32 *prop;
+	u32 len;
+
+	prop = of_get_property(memory, "ibm,lmb-size", &len);
+	if (!prop || len < sizeof(unsigned int))
+		return 0;
+
+	return read_n_cells(n_mem_size_cells, &prop);
+}
+
+struct assoc_arrays {
+	u32	n_arrays;
+	u32	array_sz;
+	const __be32 *arrays;
+};
+
+/*
+ * Retrieve and validate the list of associativity arrays for drconf
+ * memory from the ibm,associativity-lookup-arrays property of the
+ * device tree..
+ *
+ * The layout of the ibm,associativity-lookup-arrays property is a number N
+ * indicating the number of associativity arrays, followed by a number M
+ * indicating the size of each associativity array, followed by a list
+ * of N associativity arrays.
+ */
+static int of_get_assoc_arrays(struct device_node *memory,
+			       struct assoc_arrays *aa)
+{
+	const __be32 *prop;
+	u32 len;
+
+	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
+	if (!prop || len < 2 * sizeof(unsigned int))
+		return -1;
+
+	aa->n_arrays = of_read_number(prop++, 1);
+	aa->array_sz = of_read_number(prop++, 1);
+
+	/* Now that we know the number of arrays and size of each array,
+	 * revalidate the size of the property read in.
+	 */
+	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
+		return -1;
+
+	aa->arrays = prop;
+	return 0;
+}
+
+/*
+ * This is like of_node_to_nid_single() for memory represented in the
+ * ibm,dynamic-reconfiguration-memory node.
+ */
+static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
+				   struct assoc_arrays *aa)
+{
+	int default_nid = 0;
+	int nid = default_nid;
+	int index;
+
+	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
+	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
+	    drmem->aa_index < aa->n_arrays) {
+		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
+		nid = of_read_number(&aa->arrays[index], 1);
+
+		if (nid == 0xffff || nid >= MAX_NUMNODES)
+			nid = default_nid;
+	}
+
+	return nid;
+}
+
+/*
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
 {
-	int nid = 0;
-	struct device_node *cpu = find_cpu_node(lcpu);
+	int nid;
+	struct device_node *cpu;
+
+	/*
+	 * If a valid cpu-to-node mapping is already available, use it
+	 * directly instead of querying the firmware, since it represents
+	 * the most recent mapping notified to us by the platform (eg: VPHN).
+	 */
+	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+		map_cpu_to_node(lcpu, nid);
+		return nid;
+	}
+
+	cpu = of_get_cpu_node(lcpu, NULL);
 
 	if (!cpu) {
 		WARN_ON(1);
+		nid = 0;
 		goto out;
 	}
 
 	nid = of_node_to_nid_single(cpu);
 
 	if (nid < 0 || !node_online(nid))
-		nid = any_online_node(NODE_MASK_ALL);
+		nid = first_online_node;
 out:
 	map_cpu_to_node(lcpu, nid);
 
@@ -292,17 +571,38 @@ out:
 	return nid;
 }
 
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
-			     unsigned long action,
+static void verify_cpu_node_mapping(int cpu, int node)
+{
+	int base, sibling, i;
+
+	/* Verify that all the threads in the core belong to the same node */
+	base = cpu_first_thread_sibling(cpu);
+
+	for (i = 0; i < threads_per_core; i++) {
+		sibling = base + i;
+
+		if (sibling == cpu || cpu_is_offline(sibling))
+			continue;
+
+		if (cpu_to_node(sibling) != node) {
+			WARN(1, "CPU thread siblings %d and %d don't belong"
+				" to the same node!\n", cpu, sibling);
+			break;
+		}
+	}
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	unsigned long lcpu = (unsigned long)hcpu;
-	int ret = NOTIFY_DONE;
+	int ret = NOTIFY_DONE, nid;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		numa_setup_cpu(lcpu);
+		nid = numa_setup_cpu(lcpu);
+		verify_cpu_node_mapping((int)lcpu, nid);
 		ret = NOTIFY_OK;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -324,27 +624,40 @@ static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
  * Returns the size the region should have to enforce the memory limit.
  * This will either be the original value of size, a truncated value,
  * or zero. If the returned value of size is 0 the region should be
- * discarded as it lies wholy above the memory limit.
+ * discarded as it lies wholly above the memory limit.
  */
 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 						      unsigned long size)
 {
 	/*
-	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
+	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
 	 * we've already adjusted it for the limit and it takes care of
-	 * having memory holes below the limit.
+	 * having memory holes below the limit.  Also, in the case of
+	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
 	 */
 
-	if (! memory_limit)
-		return size;
-
-	if (start + size <= lmb_end_of_DRAM())
+	if (start + size <= memblock_end_of_DRAM())
 		return size;
 
-	if (start >= lmb_end_of_DRAM())
+	if (start >= memblock_end_of_DRAM())
 		return 0;
 
-	return lmb_end_of_DRAM() - start;
+	return memblock_end_of_DRAM() - start;
+}
+
+/*
+ * Reads the counter for a given entry in
+ * linux,drconf-usable-memory property
+ */
+static inline int __init read_usm_ranges(const __be32 **usm)
+{
+	/*
+	 * For each lmb in ibm,dynamic-memory a corresponding
+	 * entry in linux,drconf-usable-memory property contains
+	 * a counter followed by that many (base, size) duple.
+	 * read the counter from linux,drconf-usable-memory
+	 */
+	return read_n_cells(n_mem_size_cells, usm);
 }
 
 /*
@@ -353,64 +666,70 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start,
  */
 static void __init parse_drconf_memory(struct device_node *memory)
 {
-	const unsigned int *lm, *dm, *aa;
-	unsigned int ls, ld, la;
-	unsigned int n, aam, aalen;
-	unsigned long lmb_size, size, start;
-	int nid, default_nid = 0;
-	unsigned int ai, flags;
-
-	lm = of_get_property(memory, "ibm,lmb-size", &ls);
-	dm = of_get_property(memory, "ibm,dynamic-memory", &ld);
-	aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la);
-	if (!lm || !dm || !aa ||
-	    ls < sizeof(unsigned int) || ld < sizeof(unsigned int) ||
-	    la < 2 * sizeof(unsigned int))
+	const __be32 *uninitialized_var(dm), *usm;
+	unsigned int n, rc, ranges, is_kexec_kdump = 0;
+	unsigned long lmb_size, base, size, sz;
+	int nid;
+	struct assoc_arrays aa = { .arrays = NULL };
+
+	n = of_get_drconf_memory(memory, &dm);
+	if (!n)
+		return;
+
+	lmb_size = of_get_lmb_size(memory);
+	if (!lmb_size)
 		return;
 
-	lmb_size = read_n_cells(n_mem_size_cells, &lm);
-	n = *dm++;		/* number of LMBs */
-	aam = *aa++;		/* number of associativity lists */
-	aalen = *aa++;		/* length of each associativity list */
-	if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) ||
-	    la < (aam * aalen + 2) * sizeof(unsigned int))
+	rc = of_get_assoc_arrays(memory, &aa);
+	if (rc)
 		return;
 
+	/* check if this is a kexec/kdump kernel */
+	usm = of_get_usable_memory(memory);
+	if (usm != NULL)
+		is_kexec_kdump = 1;
+
 	for (; n != 0; --n) {
-		start = read_n_cells(n_mem_addr_cells, &dm);
-		ai = dm[2];
-		flags = dm[3];
-		dm += 4;
-		/* 0x80 == reserved, 0x8 = assigned to us */
-		if ((flags & 0x80) || !(flags & 0x8))
-			continue;
-		nid = default_nid;
-		/* flags & 0x40 means associativity index is invalid */
-		if (min_common_depth > 0 && min_common_depth <= aalen &&
-		    (flags & 0x40) == 0 && ai < aam) {
-			/* this is like of_node_to_nid_single */
-			nid = aa[ai * aalen + min_common_depth - 1];
-			if (nid == 0xffff || nid >= MAX_NUMNODES)
-				nid = default_nid;
-		}
+		struct of_drconf_cell drmem;
 
-		fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT),
-						&nid);
-		node_set_online(nid);
+		read_drconf_cell(&drmem, &dm);
 
-		size = numa_enforce_memory_limit(start, lmb_size);
-		if (!size)
+		/* skip this block if the reserved bit is set in flags (0x80)
+		   or if the block is not assigned to this partition (0x8) */
+		if ((drmem.flags & DRCONF_MEM_RESERVED)
+		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
 			continue;
 
-		add_active_range(nid, start >> PAGE_SHIFT,
-				 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
+		base = drmem.base_addr;
+		size = lmb_size;
+		ranges = 1;
+
+		if (is_kexec_kdump) {
+			ranges = read_usm_ranges(&usm);
+			if (!ranges) /* there are no (base, size) duple */
+				continue;
+		}
+		do {
+			if (is_kexec_kdump) {
+				base = read_n_cells(n_mem_addr_cells, &usm);
+				size = read_n_cells(n_mem_size_cells, &usm);
+			}
+			nid = of_drconf_to_nid_single(&drmem, &aa);
+			fake_numa_create_new_node(
+				((base + size) >> PAGE_SHIFT),
+					   &nid);
+			node_set_online(nid);
+			sz = numa_enforce_memory_limit(base, size);
+			if (sz)
+				memblock_set_node(base, sz,
+						  &memblock.memory, nid);
+		} while (--ranges);
 	}
 }
 
 static int __init parse_numa_properties(void)
 {
-	struct device_node *cpu = NULL;
-	struct device_node *memory = NULL;
+	struct device_node *memory;
 	int default_nid = 0;
 	unsigned long i;
 
@@ -432,9 +751,10 @@ static int __init parse_numa_properties(void)
 	 * each node to be onlined must have NODE_DATA etc backing it.
 	 */
 	for_each_present_cpu(i) {
+		struct device_node *cpu;
 		int nid;
 
-		cpu = find_cpu_node(i);
+		cpu = of_get_cpu_node(i, NULL);
 		BUG_ON(!cpu);
 		nid = of_node_to_nid_single(cpu);
 		of_node_put(cpu);
@@ -450,13 +770,13 @@ static int __init parse_numa_properties(void)
 	}
 
 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
-	memory = NULL;
-	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+
+	for_each_node_by_type(memory, "memory") {
 		unsigned long start;
 		unsigned long size;
 		int nid;
 		int ranges;
-		const unsigned int *memcell_buf;
+		const __be32 *memcell_buf;
 		unsigned int len;
 
 		memcell_buf = of_get_property(memory,
@@ -492,16 +812,16 @@ new_range:
 				continue;
 		}
 
-		add_active_range(nid, start >> PAGE_SHIFT,
-				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
+		memblock_set_node(start, size, &memblock.memory, nid);
 
 		if (--ranges)
 			goto new_range;
 	}
 
 	/*
-	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
-	 * property in the ibm,dynamic-reconfiguration-memory node.
+	 * Now do the same thing for each MEMBLOCK listed in the
+	 * ibm,dynamic-memory property in the
+	 * ibm,dynamic-reconfiguration-memory node.
 	 */
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (memory)
@@ -512,22 +832,25 @@ new_range:
 
 static void __init setup_nonnuma(void)
 {
-	unsigned long top_of_ram = lmb_end_of_DRAM();
-	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned long top_of_ram = memblock_end_of_DRAM();
+	unsigned long total_ram = memblock_phys_mem_size();
 	unsigned long start_pfn, end_pfn;
-	unsigned int i, nid = 0;
+	unsigned int nid = 0;
+	struct memblock_region *reg;
 
 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 	       top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for (i = 0; i < lmb.memory.cnt; ++i) {
-		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
-		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+	for_each_memblock(memory, reg) {
+		start_pfn = memblock_region_memory_base_pfn(reg);
+		end_pfn = memblock_region_memory_end_pfn(reg);
 
 		fake_numa_create_new_node(end_pfn, &nid);
-		add_active_range(nid, start_pfn, end_pfn);
+		memblock_set_node(PFN_PHYS(start_pfn),
+				  PFN_PHYS(end_pfn - start_pfn),
+				  &memblock.memory, nid);
 		node_set_online(nid);
 	}
 }
@@ -548,8 +871,9 @@ void __init dump_numa_cpu_topology(void)
 		 * If we used a CPU iterator here we would miss printing
 		 * the holes in the cpumap.
 		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+			if (cpumask_test_cpu(cpu,
+					node_to_cpumask_map[node])) {
 				if (count == 0)
 					printk(" %u", cpu);
 				++count;
@@ -561,7 +885,7 @@ void __init dump_numa_cpu_topology(void)
 		}
 
 		if (count > 1)
-			printk("-%u", NR_CPUS - 1);
+			printk("-%u", nr_cpu_ids - 1);
 		printk("\n");
 	}
 }
@@ -581,7 +905,7 @@ static void __init dump_numa_memory_topology(void)
 
 		count = 0;
 
-		for (i = 0; i < lmb_end_of_DRAM();
+		for (i = 0; i < memblock_end_of_DRAM();
 		     i += (1 << SECTION_SIZE_BITS)) {
 			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 				if (count == 0)
@@ -601,60 +925,134 @@ static void __init dump_numa_memory_topology(void)
 }
 
 /*
- * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * Allocate some memory, satisfying the memblock or bootmem allocator where
  * required. nid is the preferred node and end is the physical address of
  * the highest address in the node.
  *
- * Returns the physical address of the memory.
+ * Returns the virtual address of the memory.
  */
-static void __init *careful_allocation(int nid, unsigned long size,
+static void __init *careful_zallocation(int nid, unsigned long size,
 				       unsigned long align,
 				       unsigned long end_pfn)
 {
+	void *ret;
 	int new_nid;
-	unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
+	unsigned long ret_paddr;
+
+	ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 
 	/* retry over all memory */
-	if (!ret)
-		ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
+	if (!ret_paddr)
+		ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
 
-	if (!ret)
-		panic("numa.c: cannot allocate %lu bytes on node %d",
+	if (!ret_paddr)
+		panic("numa.c: cannot allocate %lu bytes for node %d",
 		      size, nid);
 
+	ret = __va(ret_paddr);
+
 	/*
-	 * If the memory came from a previously allocated node, we must
-	 * retry with the bootmem allocator.
+	 * We initialize the nodes in numeric order: 0, 1, 2...
+	 * and hand over control from the MEMBLOCK allocator to the
+	 * bootmem allocator.  If this function is called for
+	 * node 5, then we know that all nodes <5 are using the
+	 * bootmem allocator instead of the MEMBLOCK allocator.
+	 *
+	 * So, check the nid from which this allocation came
+	 * and double check to see if we need to use bootmem
+	 * instead of the MEMBLOCK.  We don't free the MEMBLOCK memory
+	 * since it would be useless.
 	 */
-	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
+	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
 	if (new_nid < nid) {
-		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
+		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
 				size, align, 0);
 
-		if (!ret)
-			panic("numa.c: cannot allocate %lu bytes on node %d",
-			      size, new_nid);
-
-		ret = __pa(ret);
-
-		dbg("alloc_bootmem %lx %lx\n", ret, size);
+		dbg("alloc_bootmem %p %lx\n", ret, size);
 	}
 
-	return (void *)ret;
+	memset(ret, 0, size);
+	return ret;
 }
 
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
 	.notifier_call = cpu_numa_callback,
 	.priority = 1 /* Must run before sched domains notifier. */
 };
 
+static void __init mark_reserved_regions_for_nid(int nid)
+{
+	struct pglist_data *node = NODE_DATA(nid);
+	struct memblock_region *reg;
+
+	for_each_memblock(reserved, reg) {
+		unsigned long physbase = reg->base;
+		unsigned long size = reg->size;
+		unsigned long start_pfn = physbase >> PAGE_SHIFT;
+		unsigned long end_pfn = PFN_UP(physbase + size);
+		struct node_active_region node_ar;
+		unsigned long node_end_pfn = pgdat_end_pfn(node);
+
+		/*
+		 * Check to make sure that this memblock.reserved area is
+		 * within the bounds of the node that we care about.
+		 * Checking the nid of the start and end points is not
+		 * sufficient because the reserved area could span the
+		 * entire node.
+		 */
+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
+
+		get_node_active_region(start_pfn, &node_ar);
+		while (start_pfn < end_pfn &&
+			node_ar.start_pfn < node_ar.end_pfn) {
+			unsigned long reserve_size = size;
+			/*
+			 * if reserved region extends past active region
+			 * then trim size to active region
+			 */
+			if (end_pfn > node_ar.end_pfn)
+				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
+					- physbase;
+			/*
+			 * Only worry about *this* node, others may not
+			 * yet have valid NODE_DATA().
+			 */
+			if (node_ar.nid == nid) {
+				dbg("reserve_bootmem %lx %lx nid=%d\n",
+					physbase, reserve_size, node_ar.nid);
+				reserve_bootmem_node(NODE_DATA(node_ar.nid),
+						physbase, reserve_size,
+						BOOTMEM_DEFAULT);
+			}
+			/*
+			 * if reserved region is contained in the active region
+			 * then done.
+			 */
+			if (end_pfn <= node_ar.end_pfn)
+				break;
+
+			/*
+			 * reserved region extends past the active region
+			 *   get next active region that contains this
+			 *   reserved region
+			 */
+			start_pfn = node_ar.end_pfn;
+			physbase = start_pfn << PAGE_SHIFT;
+			size = size - reserve_size;
+			get_node_active_region(start_pfn, &node_ar);
+		}
+	}
+}
+
+
 void __init do_init_bootmem(void)
 {
 	int nid;
-	unsigned int i;
 
 	min_low_pfn = 0;
-	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
 	max_pfn = max_low_pfn;
 
 	if (parse_numa_properties())
@@ -662,28 +1060,28 @@ void __init do_init_bootmem(void)
 	else
 		dump_numa_memory_topology();
 
-	register_cpu_notifier(&ppc64_numa_nb);
-	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-			  (void *)(unsigned long)boot_cpuid);
-
 	for_each_online_node(nid) {
 		unsigned long start_pfn, end_pfn;
-		unsigned long bootmem_paddr;
+		void *bootmem_vaddr;
 		unsigned long bootmap_pages;
 
 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 
-		/* Allocate the node structure node local if possible */
-		NODE_DATA(nid) = careful_allocation(nid,
+		/*
+		 * Allocate the node structure node local if possible
+		 *
+		 * Be careful moving this around, as it relies on all
+		 * previous nodes' bootmem to be initialized and have
+		 * all reserved areas marked.
+		 */
+		NODE_DATA(nid) = careful_zallocation(nid,
 					sizeof(struct pglist_data),
 					SMP_CACHE_BYTES, end_pfn);
-		NODE_DATA(nid) = __va(NODE_DATA(nid));
-		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
   		dbg("node %d\n", nid);
 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 
-		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 		NODE_DATA(nid)->node_start_pfn = start_pfn;
 		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 
@@ -694,56 +1092,45 @@ void __init do_init_bootmem(void)
   		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
 
 		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
-		bootmem_paddr = (unsigned long)careful_allocation(nid,
+		bootmem_vaddr = careful_zallocation(nid,
 					bootmap_pages << PAGE_SHIFT,
 					PAGE_SIZE, end_pfn);
-		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
 
-		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
 
-		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+		init_bootmem_node(NODE_DATA(nid),
+				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
 				  start_pfn, end_pfn);
 
 		free_bootmem_with_active_regions(nid, end_pfn);
+		/*
+		 * Be very careful about moving this around.  Future
+		 * calls to careful_zallocation() depend on this getting
+		 * done correctly.
+		 */
+		mark_reserved_regions_for_nid(nid);
+		sparse_memory_present_with_active_regions(nid);
+	}
 
-		/* Mark reserved regions on this node */
-		for (i = 0; i < lmb.reserved.cnt; i++) {
-			unsigned long physbase = lmb.reserved.region[i].base;
-			unsigned long size = lmb.reserved.region[i].size;
-			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
-			unsigned long end_paddr = end_pfn << PAGE_SHIFT;
-
-			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
-			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
-				continue;
+	init_bootmem_done = 1;
 
-			if (physbase < end_paddr &&
-			    (physbase+size) > start_paddr) {
-				/* overlaps */
-				if (physbase < start_paddr) {
-					size -= start_paddr - physbase;
-					physbase = start_paddr;
-				}
-
-				if (size > end_paddr - physbase)
-					size = end_paddr - physbase;
-
-				dbg("reserve_bootmem %lx %lx\n", physbase,
-				    size);
-				reserve_bootmem_node(NODE_DATA(nid), physbase,
-						     size, BOOTMEM_DEFAULT);
-			}
-		}
+	/*
+	 * Now bootmem is initialised we can create the node to cpumask
+	 * lookup tables and setup the cpu callback to populate them.
+	 */
+	setup_node_to_cpumask_map();
 
-		sparse_memory_present_with_active_regions(nid);
-	}
+	reset_numa_cpu_lookup_table();
+	register_cpu_notifier(&ppc64_numa_nb);
+	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+			  (void *)(unsigned long)boot_cpuid);
 }
 
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
 	free_area_init_nodes(max_zone_pfns);
 }
 
@@ -768,24 +1155,67 @@ early_param("numa", early_numa);
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * Find the node associated with a hot added memory section.  Section
- * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
- * sections are fully contained within a single LMB.
+ * Find the node associated with a hot added memory section for
+ * memory represented in the device tree by the property
+ * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
  */
-int hot_add_scn_to_nid(unsigned long scn_addr)
+static int hot_add_drconf_scn_to_nid(struct device_node *memory,
+				     unsigned long scn_addr)
 {
-	struct device_node *memory = NULL;
-	nodemask_t nodes;
-	int default_nid = any_online_node(NODE_MASK_ALL);
-	int nid;
+	const __be32 *dm;
+	unsigned int drconf_cell_cnt, rc;
+	unsigned long lmb_size;
+	struct assoc_arrays aa;
+	int nid = -1;
 
-	if (!numa_enabled || (min_common_depth < 0))
-		return default_nid;
+	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
+	if (!drconf_cell_cnt)
+		return -1;
+
+	lmb_size = of_get_lmb_size(memory);
+	if (!lmb_size)
+		return -1;
+
+	rc = of_get_assoc_arrays(memory, &aa);
+	if (rc)
+		return -1;
 
-	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
+		struct of_drconf_cell drmem;
+
+		read_drconf_cell(&drmem, &dm);
+
+		/* skip this block if it is reserved or not assigned to
+		 * this partition */
+		if ((drmem.flags & DRCONF_MEM_RESERVED)
+		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
+			continue;
+
+		if ((scn_addr < drmem.base_addr)
+		    || (scn_addr >= (drmem.base_addr + lmb_size)))
+			continue;
+
+		nid = of_drconf_to_nid_single(&drmem, &aa);
+		break;
+	}
+
+	return nid;
+}
+
+/*
+ * Find the node associated with a hot added memory section for memory
+ * represented in the device tree as a node (i.e. memory@XXXX) for
+ * each memblock.
+ */
+static int hot_add_node_scn_to_nid(unsigned long scn_addr)
+{
+	struct device_node *memory;
+	int nid = -1;
+
+	for_each_node_by_type(memory, "memory") {
 		unsigned long start, size;
 		int ranges;
-		const unsigned int *memcell_buf;
+		const __be32 *memcell_buf;
 		unsigned int len;
 
 		memcell_buf = of_get_property(memory, "reg", &len);
@@ -794,33 +1224,586 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
 
 		/* ranges in cell */
 		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
-ha_new_range:
-		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
-		size = read_n_cells(n_mem_size_cells, &memcell_buf);
-		nid = of_node_to_nid_single(memory);
 
-		/* Domains not present at boot default to 0 */
-		if (nid < 0 || !node_online(nid))
-			nid = default_nid;
+		while (ranges--) {
+			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
+			size = read_n_cells(n_mem_size_cells, &memcell_buf);
+
+			if ((scn_addr < start) || (scn_addr >= (start + size)))
+				continue;
 
-		if ((scn_addr >= start) && (scn_addr < (start + size))) {
-			of_node_put(memory);
-			goto got_nid;
+			nid = of_node_to_nid_single(memory);
+			break;
 		}
 
-		if (--ranges)		/* process all ranges in cell */
-			goto ha_new_range;
+		if (nid >= 0)
+			break;
 	}
-	BUG();	/* section address should be found above */
-	return 0;
 
-	/* Temporary code to ensure that returned node is not empty */
-got_nid:
-	nodes_setall(nodes);
-	while (NODE_DATA(nid)->node_spanned_pages == 0) {
-		node_clear(nid, nodes);
-		nid = any_online_node(nodes);
+	of_node_put(memory);
+
+	return nid;
+}
+
+/*
+ * Find the node associated with a hot added memory section.  Section
+ * corresponds to a SPARSEMEM section, not an MEMBLOCK.  It is assumed that
+ * sections are fully contained within a single MEMBLOCK.
+ */
+int hot_add_scn_to_nid(unsigned long scn_addr)
+{
+	struct device_node *memory = NULL;
+	int nid, found = 0;
+
+	if (!numa_enabled || (min_common_depth < 0))
+		return first_online_node;
+
+	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (memory) {
+		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
+		of_node_put(memory);
+	} else {
+		nid = hot_add_node_scn_to_nid(scn_addr);
 	}
+
+	if (nid < 0 || !node_online(nid))
+		nid = first_online_node;
+
+	if (NODE_DATA(nid)->node_spanned_pages)
+		return nid;
+
+	for_each_online_node(nid) {
+		if (NODE_DATA(nid)->node_spanned_pages) {
+			found = 1;
+			break;
+		}
+	}
+
+	BUG_ON(!found);
 	return nid;
 }
+
+static u64 hot_add_drconf_memory_max(void)
+{
+        struct device_node *memory = NULL;
+        unsigned int drconf_cell_cnt = 0;
+        u64 lmb_size = 0;
+	const __be32 *dm = NULL;
+
+        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+        if (memory) {
+                drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
+                lmb_size = of_get_lmb_size(memory);
+                of_node_put(memory);
+        }
+        return lmb_size * drconf_cell_cnt;
+}
+
+/*
+ * memory_hotplug_max - return max address of memory that may be added
+ *
+ * This is currently only used on systems that support drconfig memory
+ * hotplug.
+ */
+u64 memory_hotplug_max(void)
+{
+        return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
+
+/* Virtual Processor Home Node (VPHN) support */
+#ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+	struct topology_update_data *next;
+	unsigned int cpu;
+	int old_nid;
+	int new_nid;
+};
+
+static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
+static cpumask_t cpu_associativity_changes_mask;
+static int vphn_enabled;
+static int prrn_enabled;
+static void reset_topology_timer(void);
+
+/*
+ * Store the current values of the associativity change counters in the
+ * hypervisor.
+ */
+static void setup_cpu_associativity_change_counters(void)
+{
+	int cpu;
+
+	/* The VPHN feature supports a maximum of 8 reference points */
+	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
+
+	for_each_possible_cpu(cpu) {
+		int i;
+		u8 *counts = vphn_cpu_change_counts[cpu];
+		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+
+		for (i = 0; i < distance_ref_points_depth; i++)
+			counts[i] = hypervisor_counts[i];
+	}
+}
+
+/*
+ * The hypervisor maintains a set of 8 associativity change counters in
+ * the VPA of each cpu that correspond to the associativity levels in the
+ * ibm,associativity-reference-points property. When an associativity
+ * level changes, the corresponding counter is incremented.
+ *
+ * Set a bit in cpu_associativity_changes_mask for each cpu whose home
+ * node associativity levels have changed.
+ *
+ * Returns the number of cpus with unhandled associativity changes.
+ */
+static int update_cpu_associativity_changes_mask(void)
+{
+	int cpu;
+	cpumask_t *changes = &cpu_associativity_changes_mask;
+
+	for_each_possible_cpu(cpu) {
+		int i, changed = 0;
+		u8 *counts = vphn_cpu_change_counts[cpu];
+		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+
+		for (i = 0; i < distance_ref_points_depth; i++) {
+			if (hypervisor_counts[i] != counts[i]) {
+				counts[i] = hypervisor_counts[i];
+				changed = 1;
+			}
+		}
+		if (changed) {
+			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+		}
+	}
+
+	return cpumask_weight(changes);
+}
+
+/*
+ * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
+ * the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
+
+/*
+ * Convert the associativity domain numbers returned from the hypervisor
+ * to the sequence they would appear in the ibm,associativity property.
+ */
+static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
+{
+	int i, nr_assoc_doms = 0;
+	const __be16 *field = (const __be16 *) packed;
+
+#define VPHN_FIELD_UNUSED	(0xffff)
+#define VPHN_FIELD_MSB		(0x8000)
+#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)
+
+	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+		if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) {
+			/* All significant fields processed, and remaining
+			 * fields contain the reserved value of all 1's.
+			 * Just store them.
+			 */
+			unpacked[i] = *((__be32 *)field);
+			field += 2;
+		} else if (be16_to_cpup(field) & VPHN_FIELD_MSB) {
+			/* Data is in the lower 15 bits of this field */
+			unpacked[i] = cpu_to_be32(
+				be16_to_cpup(field) & VPHN_FIELD_MASK);
+			field++;
+			nr_assoc_doms++;
+		} else {
+			/* Data is in the lower 15 bits of this field
+			 * concatenated with the next 16 bit field
+			 */
+			unpacked[i] = *((__be32 *)field);
+			field += 2;
+			nr_assoc_doms++;
+		}
+	}
+
+	/* The first cell contains the length of the property */
+	unpacked[0] = cpu_to_be32(nr_assoc_doms);
+
+	return nr_assoc_doms;
+}
+
+/*
+ * Retrieve the new associativity information for a virtual processor's
+ * home node.
+ */
+static long hcall_vphn(unsigned long cpu, __be32 *associativity)
+{
+	long rc;
+	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
+	u64 flags = 1;
+	int hwcpu = get_hard_smp_processor_id(cpu);
+
+	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
+	vphn_unpack_associativity(retbuf, associativity);
+
+	return rc;
+}
+
+static long vphn_get_associativity(unsigned long cpu,
+					__be32 *associativity)
+{
+	long rc;
+
+	rc = hcall_vphn(cpu, associativity);
+
+	switch (rc) {
+	case H_FUNCTION:
+		printk(KERN_INFO
+			"VPHN is not supported. Disabling polling...\n");
+		stop_topology_update();
+		break;
+	case H_HARDWARE:
+		printk(KERN_ERR
+			"hcall_vphn() experienced a hardware fault "
+			"preventing VPHN. Disabling polling...\n");
+		stop_topology_update();
+	}
+
+	return rc;
+}
+
+/*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+	struct topology_update_data *update;
+	unsigned long cpu;
+
+	if (!data)
+		return -EINVAL;
+
+	cpu = smp_processor_id();
+
+	for (update = data; update; update = update->next) {
+		if (cpu != update->cpu)
+			continue;
+
+		unmap_cpu_from_node(update->cpu);
+		map_cpu_to_node(update->cpu, update->new_nid);
+		vdso_getcpu_init();
+	}
+
+	return 0;
+}
+
+static int update_lookup_table(void *data)
+{
+	struct topology_update_data *update;
+
+	if (!data)
+		return -EINVAL;
+
+	/*
+	 * Upon topology update, the numa-cpu lookup table needs to be updated
+	 * for all threads in the core, including offline CPUs, to ensure that
+	 * future hotplug operations respect the cpu-to-node associativity
+	 * properly.
+	 */
+	for (update = data; update; update = update->next) {
+		int nid, base, j;
+
+		nid = update->new_nid;
+		base = cpu_first_thread_sibling(update->cpu);
+
+		for (j = 0; j < threads_per_core; j++) {
+			update_numa_cpu_lookup_table(base + j, nid);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Update the node maps and sysfs entries for each cpu whose home node
+ * has changed. Returns 1 when the topology has changed, and 0 otherwise.
+ */
+int arch_update_cpu_topology(void)
+{
+	unsigned int cpu, sibling, changed = 0;
+	struct topology_update_data *updates, *ud;
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+	cpumask_t updated_cpus;
+	struct device *dev;
+	int weight, new_nid, i = 0;
+
+	weight = cpumask_weight(&cpu_associativity_changes_mask);
+	if (!weight)
+		return 0;
+
+	updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
+	if (!updates)
+		return 0;
+
+	cpumask_clear(&updated_cpus);
+
+	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+		/*
+		 * If siblings aren't flagged for changes, updates list
+		 * will be too short. Skip on this update and set for next
+		 * update.
+		 */
+		if (!cpumask_subset(cpu_sibling_mask(cpu),
+					&cpu_associativity_changes_mask)) {
+			pr_info("Sibling bits not set for associativity "
+					"change, cpu%d\n", cpu);
+			cpumask_or(&cpu_associativity_changes_mask,
+					&cpu_associativity_changes_mask,
+					cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+			continue;
+		}
+
+		/* Use associativity from first thread for all siblings */
+		vphn_get_associativity(cpu, associativity);
+		new_nid = associativity_to_nid(associativity);
+		if (new_nid < 0 || !node_online(new_nid))
+			new_nid = first_online_node;
+
+		if (new_nid == numa_cpu_lookup_table[cpu]) {
+			cpumask_andnot(&cpu_associativity_changes_mask,
+					&cpu_associativity_changes_mask,
+					cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+			continue;
+		}
+
+		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+			ud = &updates[i++];
+			ud->cpu = sibling;
+			ud->new_nid = new_nid;
+			ud->old_nid = numa_cpu_lookup_table[sibling];
+			cpumask_set_cpu(sibling, &updated_cpus);
+			if (i < weight)
+				ud->next = &updates[i];
+		}
+		cpu = cpu_last_thread_sibling(cpu);
+	}
+
+	/*
+	 * In cases where we have nothing to update (because the updates list
+	 * is too short or because the new topology is same as the old one),
+	 * skip invoking update_cpu_topology() via stop-machine(). This is
+	 * necessary (and not just a fast-path optimization) since stop-machine
+	 * can end up electing a random CPU to run update_cpu_topology(), and
+	 * thus trick us into setting up incorrect cpu-node mappings (since
+	 * 'updates' is kzalloc()'ed).
+	 *
+	 * And for the similar reason, we will skip all the following updating.
+	 */
+	if (!cpumask_weight(&updated_cpus))
+		goto out;
+
+	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+
+	/*
+	 * Update the numa-cpu lookup table with the new mappings, even for
+	 * offline CPUs. It is best to perform this update from the stop-
+	 * machine context.
+	 */
+	stop_machine(update_lookup_table, &updates[0],
+					cpumask_of(raw_smp_processor_id()));
+
+	for (ud = &updates[0]; ud; ud = ud->next) {
+		unregister_cpu_under_node(ud->cpu, ud->old_nid);
+		register_cpu_under_node(ud->cpu, ud->new_nid);
+
+		dev = get_cpu_device(ud->cpu);
+		if (dev)
+			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
+		changed = 1;
+	}
+
+out:
+	kfree(updates);
+	return changed;
+}
+
+static void topology_work_fn(struct work_struct *work)
+{
+	rebuild_sched_domains();
+}
+static DECLARE_WORK(topology_work, topology_work_fn);
+
+static void topology_schedule_update(void)
+{
+	schedule_work(&topology_work);
+}
+
+static void topology_timer_fn(unsigned long ignored)
+{
+	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
+		topology_schedule_update();
+	else if (vphn_enabled) {
+		if (update_cpu_associativity_changes_mask() > 0)
+			topology_schedule_update();
+		reset_topology_timer();
+	}
+}
+static struct timer_list topology_timer =
+	TIMER_INITIALIZER(topology_timer_fn, 0, 0);
+
+static void reset_topology_timer(void)
+{
+	topology_timer.data = 0;
+	topology_timer.expires = jiffies + 60 * HZ;
+	mod_timer(&topology_timer, topology_timer.expires);
+}
+
+#ifdef CONFIG_SMP
+
+static void stage_topology_update(int core_id)
+{
+	cpumask_or(&cpu_associativity_changes_mask,
+		&cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+	reset_topology_timer();
+}
+
+static int dt_update_callback(struct notifier_block *nb,
+				unsigned long action, void *data)
+{
+	struct of_prop_reconfig *update;
+	int rc = NOTIFY_DONE;
+
+	switch (action) {
+	case OF_RECONFIG_UPDATE_PROPERTY:
+		update = (struct of_prop_reconfig *)data;
+		if (!of_prop_cmp(update->dn->type, "cpu") &&
+		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
+			u32 core_id;
+			of_property_read_u32(update->dn, "reg", &core_id);
+			stage_topology_update(core_id);
+			rc = NOTIFY_OK;
+		}
+		break;
+	}
+
+	return rc;
+}
+
+static struct notifier_block dt_update_nb = {
+	.notifier_call = dt_update_callback,
+};
+
+#endif
+
+/*
+ * Start polling for associativity changes.
+ */
+int start_topology_update(void)
+{
+	int rc = 0;
+
+	if (firmware_has_feature(FW_FEATURE_PRRN)) {
+		if (!prrn_enabled) {
+			prrn_enabled = 1;
+			vphn_enabled = 0;
+#ifdef CONFIG_SMP
+			rc = of_reconfig_notifier_register(&dt_update_nb);
+#endif
+		}
+	} else if (firmware_has_feature(FW_FEATURE_VPHN) &&
+		   lppaca_shared_proc(get_lppaca())) {
+		if (!vphn_enabled) {
+			prrn_enabled = 0;
+			vphn_enabled = 1;
+			setup_cpu_associativity_change_counters();
+			init_timer_deferrable(&topology_timer);
+			reset_topology_timer();
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * Disable polling for VPHN associativity changes.
+ */
+int stop_topology_update(void)
+{
+	int rc = 0;
+
+	if (prrn_enabled) {
+		prrn_enabled = 0;
+#ifdef CONFIG_SMP
+		rc = of_reconfig_notifier_unregister(&dt_update_nb);
+#endif
+	} else if (vphn_enabled) {
+		vphn_enabled = 0;
+		rc = del_timer_sync(&topology_timer);
+	}
+
+	return rc;
+}
+
+int prrn_is_enabled(void)
+{
+	return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+	if (vphn_enabled || prrn_enabled)
+		seq_puts(file, "on\n");
+	else
+		seq_puts(file, "off\n");
+
+	return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *off)
+{
+	char kbuf[4]; /* "on" or "off" plus null. */
+	int read_len;
+
+	read_len = count < 3 ? count : 3;
+	if (copy_from_user(kbuf, buf, read_len))
+		return -EINVAL;
+
+	kbuf[read_len] = '\0';
+
+	if (!strncmp(kbuf, "on", 2))
+		start_topology_update();
+	else if (!strncmp(kbuf, "off", 3))
+		stop_topology_update();
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static const struct file_operations topology_ops = {
+	.read = seq_read,
+	.write = topology_write,
+	.open = topology_open,
+	.release = single_release
+};
+
+static int topology_update_init(void)
+{
+	start_topology_update();
+	proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops);
+
+	return 0;
+}
+device_initcall(topology_update_init);
+#endif /* CONFIG_PPC_SPLPAR */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
new file mode 100644
index 00000000000..c695943a513
--- /dev/null
+++ b/arch/powerpc/mm/pgtable.c
@@ -0,0 +1,236 @@
+/*
+ * This file contains common routines for dealing with free of page tables
+ * Along with common page table handling code
+ *
+ *  Derived from arch/powerpc/mm/tlb_64.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+static inline int is_exec_fault(void)
+{
+	return current->thread.regs && TRAP(current->thread.regs) == 0x400;
+}
+
+/* We only try to do i/d cache coherency on stuff that looks like
+ * reasonably "normal" PTEs. We currently require a PTE to be present
+ * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * on userspace PTEs
+ */
+static inline int pte_looks_normal(pte_t pte)
+{
+	return (pte_val(pte) &
+	    (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
+	    (_PAGE_PRESENT | _PAGE_USER);
+}
+
+struct page * maybe_pte_to_page(pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+	struct page *page;
+
+	if (unlikely(!pfn_valid(pfn)))
+		return NULL;
+	page = pfn_to_page(pfn);
+	if (PageReserved(page))
+		return NULL;
+	return page;
+}
+
+#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
+
+/* Server-style MMU handles coherency when hashing if HW exec permission
+ * is supposed per page (currently 64-bit only). If not, then, we always
+ * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec
+ * support falls into the same category.
+ */
+
+static pte_t set_pte_filter(pte_t pte)
+{
+	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+	if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+				       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
+		struct page *pg = maybe_pte_to_page(pte);
+		if (!pg)
+			return pte;
+		if (!test_bit(PG_arch_1, &pg->flags)) {
+			flush_dcache_icache_page(pg);
+			set_bit(PG_arch_1, &pg->flags);
+		}
+	}
+	return pte;
+}
+
+static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
+				     int dirty)
+{
+	return pte;
+}
+
+#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
+
+/* Embedded type MMU with HW exec support. This is a bit more complicated
+ * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
+ * instead we "filter out" the exec permission for non clean pages.
+ */
+static pte_t set_pte_filter(pte_t pte)
+{
+	struct page *pg;
+
+	/* No exec permission in the first place, move on */
+	if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
+		return pte;
+
+	/* If you set _PAGE_EXEC on weird pages you're on your own */
+	pg = maybe_pte_to_page(pte);
+	if (unlikely(!pg))
+		return pte;
+
+	/* If the page clean, we move on */
+	if (test_bit(PG_arch_1, &pg->flags))
+		return pte;
+
+	/* If it's an exec fault, we flush the cache and make it clean */
+	if (is_exec_fault()) {
+		flush_dcache_icache_page(pg);
+		set_bit(PG_arch_1, &pg->flags);
+		return pte;
+	}
+
+	/* Else, we filter out _PAGE_EXEC */
+	return __pte(pte_val(pte) & ~_PAGE_EXEC);
+}
+
+static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
+				     int dirty)
+{
+	struct page *pg;
+
+	/* So here, we only care about exec faults, as we use them
+	 * to recover lost _PAGE_EXEC and perform I$/D$ coherency
+	 * if necessary. Also if _PAGE_EXEC is already set, same deal,
+	 * we just bail out
+	 */
+	if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
+		return pte;
+
+#ifdef CONFIG_DEBUG_VM
+	/* So this is an exec fault, _PAGE_EXEC is not set. If it was
+	 * an error we would have bailed out earlier in do_page_fault()
+	 * but let's make sure of it
+	 */
+	if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
+		return pte;
+#endif /* CONFIG_DEBUG_VM */
+
+	/* If you set _PAGE_EXEC on weird pages you're on your own */
+	pg = maybe_pte_to_page(pte);
+	if (unlikely(!pg))
+		goto bail;
+
+	/* If the page is already clean, we move on */
+	if (test_bit(PG_arch_1, &pg->flags))
+		goto bail;
+
+	/* Clean the page and set PG_arch_1 */
+	flush_dcache_icache_page(pg);
+	set_bit(PG_arch_1, &pg->flags);
+
+ bail:
+	return __pte(pte_val(pte) | _PAGE_EXEC);
+}
+
+#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
+
+/*
+ * set_pte stores a linux PTE into the linux page table.
+ */
+void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+		pte_t pte)
+{
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
+#endif
+	/* Note: mm->context.id might not yet have been assigned as
+	 * this context might not have been activated yet when this
+	 * is called.
+	 */
+	pte = set_pte_filter(pte);
+
+	/* Perform the setting of the PTE */
+	__set_pte_at(mm, addr, ptep, pte, 0);
+}
+
+/*
+ * This is called when relaxing access to a PTE. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+			  pte_t *ptep, pte_t entry, int dirty)
+{
+	int changed;
+	entry = set_access_flags_filter(entry, vma, dirty);
+	changed = !pte_same(*(ptep), entry);
+	if (changed) {
+		if (!is_vm_hugetlb_page(vma))
+			assert_pte_locked(vma->vm_mm, address);
+		__ptep_set_access_flags(ptep, entry);
+		flush_tlb_page_nohash(vma, address);
+	}
+	return changed;
+}
+
+#ifdef CONFIG_DEBUG_VM
+void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (mm == &init_mm)
+		return;
+	pgd = mm->pgd + pgd_index(addr);
+	BUG_ON(pgd_none(*pgd));
+	pud = pud_offset(pgd, addr);
+	BUG_ON(pud_none(*pud));
+	pmd = pmd_offset(pud, addr);
+	/*
+	 * khugepaged to collapse normal pages to hugepage, first set
+	 * pmd to none to force page fault/gup to take mmap_sem. After
+	 * pmd is set to none, we do a pte_clear which does this assertion
+	 * so if we find pmd none, return.
+	 */
+	if (pmd_none(*pmd))
+		return;
+	BUG_ON(!pmd_present(*pmd));
+	assert_spin_locked(pte_lockptr(mm, pmd));
+}
+#endif /* CONFIG_DEBUG_VM */
+
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index ac3390f8190..343a87fa78b 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -26,10 +26,14 @@
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/fixmap.h>
 #include <asm/io.h>
+#include <asm/setup.h>
 
 #include "mmu_decl.h"
 
@@ -47,14 +51,10 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[];
 
-#ifdef CONFIG_SMP
-extern void hash_page_sync(void);
-#endif
-
 #ifdef HAVE_BATS
-extern unsigned long v_mapped_by_bats(unsigned long va);
-extern unsigned long p_mapped_by_bats(unsigned long pa);
-void setbat(int index, unsigned long virt, unsigned long phys,
+extern phys_addr_t v_mapped_by_bats(unsigned long va);
+extern unsigned long p_mapped_by_bats(phys_addr_t pa);
+void setbat(int index, unsigned long virt, phys_addr_t phys,
 	    unsigned int size, int flags);
 
 #else /* !HAVE_BATS */
@@ -64,31 +64,36 @@ void setbat(int index, unsigned long virt, unsigned long phys,
 
 #ifdef HAVE_TLBCAM
 extern unsigned int tlbcam_index;
-extern unsigned long v_mapped_by_tlbcam(unsigned long va);
-extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
+extern phys_addr_t v_mapped_by_tlbcam(unsigned long va);
+extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa);
 #else /* !HAVE_TLBCAM */
 #define v_mapped_by_tlbcam(x)	(0UL)
 #define p_mapped_by_tlbcam(x)	(0UL)
 #endif /* HAVE_TLBCAM */
 
-#ifdef CONFIG_PTE_64BIT
-/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */
-#define PGDIR_ORDER	1
-#else
-#define PGDIR_ORDER	0
-#endif
+#define PGDIR_ORDER	(32 + PGD_T_LOG2 - PGDIR_SHIFT)
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *ret;
 
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
+	/* pgdir take page or two with 4K pages and a page fraction otherwise */
+#ifndef CONFIG_PPC_4K_PAGES
+	ret = kzalloc(1 << PGDIR_ORDER, GFP_KERNEL);
+#else
+	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+			PGDIR_ORDER - PAGE_SHIFT);
+#endif
 	return ret;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	free_pages((unsigned long)pgd, PGDIR_ORDER);
+#ifndef CONFIG_PPC_4K_PAGES
+	kfree((void *)pgd);
+#else
+	free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
+#endif
 }
 
 __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -111,57 +116,78 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
 	struct page *ptepage;
 
-#ifdef CONFIG_HIGHPTE
-	gfp_t flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT | __GFP_ZERO;
-#else
 	gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
-#endif
 
 	ptepage = alloc_pages(flags, 0);
 	if (!ptepage)
 		return NULL;
-	pgtable_page_ctor(ptepage);
+	if (!pgtable_page_ctor(ptepage)) {
+		__free_page(ptepage);
+		return NULL;
+	}
 	return ptepage;
 }
 
-void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+void __iomem *
+ioremap(phys_addr_t addr, unsigned long size)
 {
-#ifdef CONFIG_SMP
-	hash_page_sync();
-#endif
-	free_page((unsigned long)pte);
+	return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
+				__builtin_return_address(0));
 }
+EXPORT_SYMBOL(ioremap);
 
-void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+void __iomem *
+ioremap_wc(phys_addr_t addr, unsigned long size)
 {
-#ifdef CONFIG_SMP
-	hash_page_sync();
-#endif
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
+	return __ioremap_caller(addr, size, _PAGE_NO_CACHE,
+				__builtin_return_address(0));
 }
+EXPORT_SYMBOL(ioremap_wc);
 
 void __iomem *
-ioremap(phys_addr_t addr, unsigned long size)
+ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
-	return __ioremap(addr, size, _PAGE_NO_CACHE);
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
+#ifdef _PAGE_BAP_SR
+	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
+	 * which means that we just cleared supervisor access... oops ;-) This
+	 * restores it
+	 */
+	flags |= _PAGE_BAP_SR;
+#endif
+
+	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
 }
-EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(ioremap_prot);
 
 void __iomem *
-ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
+__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
-	return __ioremap(addr, size, flags);
+	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
 }
-EXPORT_SYMBOL(ioremap_flags);
 
 void __iomem *
-__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
+__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
+		 void *caller)
 {
 	unsigned long v, i;
 	phys_addr_t p;
 	int err;
 
+	/* Make sure we have the base flags */
+	if ((flags & _PAGE_PRESENT) == 0)
+		flags |= PAGE_KERNEL;
+
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
 	/*
 	 * Choose an address to map it to.
 	 * Once the vmalloc system is running, we use it.
@@ -178,15 +204,18 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 	if (p < 16*1024*1024)
 		p += _ISA_MEM_BASE;
 
+#ifndef CONFIG_CRASH_DUMP
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using.
 	 * mem_init() sets high_memory so only do the check after that.
 	 */
-	if (mem_init_done && (p < virt_to_phys(high_memory))) {
-		printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n",
+	if (mem_init_done && (p < virt_to_phys(high_memory)) &&
+	    !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) {
+		printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n",
 		       (unsigned long long)p, __builtin_return_address(0));
 		return NULL;
 	}
+#endif
 
 	if (size == 0)
 		return NULL;
@@ -210,19 +239,15 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 
 	if (mem_init_done) {
 		struct vm_struct *area;
-		area = get_vm_area(size, VM_IOREMAP);
+		area = get_vm_area_caller(size, VM_IOREMAP, caller);
 		if (area == 0)
 			return NULL;
+		area->phys_addr = p;
 		v = (unsigned long) area->addr;
 	} else {
 		v = (ioremap_bot -= size);
 	}
 
-	if ((flags & _PAGE_PRESENT) == 0)
-		flags |= _PAGE_KERNEL;
-	if (flags & _PAGE_NO_CACHE)
-		flags |= _PAGE_GUARDED;
-
 	/*
 	 * Should check if it is a candidate for a BAT mapping
 	 */
@@ -269,27 +294,30 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
 		/* The PTE should never be already set nor present in the
 		 * hash table
 		 */
-		BUG_ON(pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE));
+		BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) &&
+		       flags);
 		set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
 						     __pgprot(flags)));
 	}
+	smp_wmb();
 	return err;
 }
 
 /*
- * Map in all of physical memory starting at KERNELBASE.
+ * Map in a chunk of physical memory starting at start.
  */
-void __init mapin_ram(void)
+void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
 {
-	unsigned long v, p, s, f;
+	unsigned long v, s, f;
+	phys_addr_t p;
 	int ktext;
 
-	s = mmu_mapin_ram();
-	v = KERNELBASE + s;
-	p = PPC_MEMSTART + s;
-	for (; s < total_lowmem; s += PAGE_SIZE) {
+	s = offset;
+	v = PAGE_OFFSET + s;
+	p = memstart_addr + s;
+	for (; s < top; s += PAGE_SIZE) {
 		ktext = ((char *) v >= _stext && (char *) v < etext);
-		f = ktext ?_PAGE_RAM_TEXT : _PAGE_RAM;
+		f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL;
 		map_page(v, p, f);
 #ifdef CONFIG_PPC_STD_MMU_32
 		if (ktext)
@@ -300,6 +328,30 @@ void __init mapin_ram(void)
 	}
 }
 
+void __init mapin_ram(void)
+{
+	unsigned long s, top;
+
+#ifndef CONFIG_WII
+	top = total_lowmem;
+	s = mmu_mapin_ram(top);
+	__mapin_ram_chunk(s, top);
+#else
+	if (!wii_hole_size) {
+		s = mmu_mapin_ram(total_lowmem);
+		__mapin_ram_chunk(s, total_lowmem);
+	} else {
+		top = wii_hole_start;
+		s = mmu_mapin_ram(top);
+		__mapin_ram_chunk(s, top);
+
+		top = memblock_end_of_DRAM();
+		s = wii_mmu_mapin_mem2(top);
+		__mapin_ram_chunk(s, top);
+	}
+#endif
+}
+
 /* Scan the real Linux page tables and return a PTE pointer for
  * a virtual address in a context.
  * Returns true (1) if PTE was found, zero otherwise.  The pointer to
@@ -349,9 +401,9 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
 		return 0;
 	if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
 		return -EINVAL;
-	set_pte_at(&init_mm, address, kpte, mk_pte(page, prot));
+	__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
 	wmb();
-	flush_HPTE(0, address, pmd_val(*kpmd));
+	flush_tlb_page(NULL, address);
 	pte_unmap(kpte);
 
 	return 0;
@@ -386,3 +438,23 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
 }
 #endif /* CONFIG_DEBUG_PAGEALLOC */
+
+static int fixmaps;
+
+void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+
+	map_page(address, phys, pgprot_val(flags));
+	fixmaps++;
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+	WARN_ON(1);
+}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3ef0ad2f9ca..f6ce1f111f5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -26,13 +26,16 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/stddef.h>
 #include <linux/vmalloc.h>
-#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -47,27 +50,52 @@
 #include <asm/processor.h>
 #include <asm/cputable.h>
 #include <asm/sections.h>
-#include <asm/system.h>
-#include <asm/abs_addr.h>
 #include <asm/firmware.h>
 
 #include "mmu_decl.h"
 
+/* Some sanity checking */
+#if TASK_SIZE_USER64 > PGTABLE_RANGE
+#error TASK_SIZE_USER64 exceeds pagetable range
+#endif
+
+#ifdef CONFIG_PPC_STD_MMU_64
+#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
+#error TASK_SIZE_USER64 exceeds user VSID range
+#endif
+#endif
+
 unsigned long ioremap_bot = IOREMAP_BASE;
 
+#ifdef CONFIG_PPC_MMU_NOHASH
+static void *early_alloc_pgtable(unsigned long size)
+{
+	void *pt;
+
+	if (init_bootmem_done)
+		pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
+	else
+		pt = __va(memblock_alloc_base(size, size,
+					 __pa(MAX_DMA_ADDRESS)));
+	memset(pt, 0, size);
+
+	return pt;
+}
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
 /*
- * map_io_page currently only called by __ioremap
- * map_io_page adds an entry to the ioremap page table
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
  * and adds an entry to the HPT, possibly bolting it
  */
-static int map_io_page(unsigned long ea, unsigned long pa, int flags)
+int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
 
-	if (mem_init_done) {
+	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
 		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
@@ -81,6 +109,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 	} else {
+#ifdef CONFIG_PPC_MMU_NOHASH
+		/* Warning ! This will blow up if bootmem is not initialized
+		 * which our ppc64 code is keen to do that, we'll need to
+		 * fix it and/or be more careful
+		 */
+		pgdp = pgd_offset_k(ea);
+#ifdef PUD_TABLE_SIZE
+		if (pgd_none(*pgdp)) {
+			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+			BUG_ON(pudp == NULL);
+			pgd_populate(&init_mm, pgdp, pudp);
+		}
+#endif /* PUD_TABLE_SIZE */
+		pudp = pud_offset(pgdp, ea);
+		if (pud_none(*pudp)) {
+			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+			BUG_ON(pmdp == NULL);
+			pud_populate(&init_mm, pudp, pmdp);
+		}
+		pmdp = pmd_offset(pudp, ea);
+		if (!pmd_present(*pmdp)) {
+			ptep = early_alloc_pgtable(PAGE_SIZE);
+			BUG_ON(ptep == NULL);
+			pmd_populate_kernel(&init_mm, pmdp, ptep);
+		}
+		ptep = pte_offset_kernel(pmdp, ea);
+		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+							  __pgprot(flags)));
+#else /* CONFIG_PPC_MMU_NOHASH */
 		/*
 		 * If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an
@@ -93,7 +150,20 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 			       "memory at %016lx !\n", pa);
 			return -ENOMEM;
 		}
+#endif /* !CONFIG_PPC_MMU_NOHASH */
 	}
+
+#ifdef CONFIG_PPC_BOOK3E_64
+	/*
+	 * With hardware tablewalk, a sync is needed to ensure that
+	 * subsequent accesses see the PTE we just wrote.  Unlike userspace
+	 * mappings, we can't tolerate spurious faults, so make sure
+	 * the new PTE will be seen the first time.
+	 */
+	mb();
+#else
+	smp_wmb();
+#endif
 	return 0;
 }
 
@@ -107,15 +177,24 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 {
 	unsigned long i;
 
+	/* Make sure we have the base flags */
 	if ((flags & _PAGE_PRESENT) == 0)
 		flags |= pgprot_val(PAGE_KERNEL);
 
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
+	/* We don't support the 4K PFN hack with ioremap */
+	if (flags & _PAGE_4K_PFN)
+		return NULL;
+
 	WARN_ON(pa & ~PAGE_MASK);
 	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 	WARN_ON(size & ~PAGE_MASK);
 
 	for (i = 0; i < size; i += PAGE_SIZE)
-		if (map_io_page((unsigned long)ea+i, pa+i, flags))
+		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
 			return NULL;
 
 	return (void __iomem *)ea;
@@ -135,8 +214,8 @@ void __iounmap_at(void *ea, unsigned long size)
 	unmap_kernel_range((unsigned long)ea, size);
 }
 
-void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
-			 unsigned long flags)
+void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
+				unsigned long flags, void *caller)
 {
 	phys_addr_t paligned;
 	void __iomem *ret;
@@ -159,10 +238,13 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
 	if (mem_init_done) {
 		struct vm_struct *area;
 
-		area = __get_vm_area(size, VM_IOREMAP,
-				     ioremap_bot, IOREMAP_END);
+		area = __get_vm_area_caller(size, VM_IOREMAP,
+					    ioremap_bot, IOREMAP_END,
+					    caller);
 		if (area == NULL)
 			return NULL;
+
+		area->phys_addr = paligned;
 		ret = __ioremap_at(paligned, area->addr, size, flags);
 		if (!ret)
 			vunmap(area->addr);
@@ -177,22 +259,55 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
 	return ret;
 }
 
+void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
+			 unsigned long flags)
+{
+	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+}
 
 void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 {
 	unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+	void *caller = __builtin_return_address(0);
 
 	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, flags);
-	return __ioremap(addr, size, flags);
+		return ppc_md.ioremap(addr, size, flags, caller);
+	return __ioremap_caller(addr, size, flags, caller);
 }
 
-void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
+void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
+{
+	unsigned long flags = _PAGE_NO_CACHE;
+	void *caller = __builtin_return_address(0);
+
+	if (ppc_md.ioremap)
+		return ppc_md.ioremap(addr, size, flags, caller);
+	return __ioremap_caller(addr, size, flags, caller);
+}
+
+void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
 			     unsigned long flags)
 {
+	void *caller = __builtin_return_address(0);
+
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
+#ifdef _PAGE_BAP_SR
+	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
+	 * which means that we just cleared supervisor access... oops ;-) This
+	 * restores it
+	 */
+	flags |= _PAGE_BAP_SR;
+#endif
+
 	if (ppc_md.ioremap)
-		return ppc_md.ioremap(addr, size, flags);
-	return __ioremap(addr, size, flags);
+		return ppc_md.ioremap(addr, size, flags, caller);
+	return __ioremap_caller(addr, size, flags, caller);
 }
 
 
@@ -226,9 +341,550 @@ void iounmap(volatile void __iomem *token)
 }
 
 EXPORT_SYMBOL(ioremap);
-EXPORT_SYMBOL(ioremap_flags);
+EXPORT_SYMBOL(ioremap_wc);
+EXPORT_SYMBOL(ioremap_prot);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(__ioremap_at);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__iounmap_at);
+
+/*
+ * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(pmd))
+		return pfn_to_page(pmd_pfn(pmd));
+#endif
+	return virt_to_page(pmd_page_vaddr(pmd));
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+static pte_t *get_from_cache(struct mm_struct *mm)
+{
+	void *pte_frag, *ret;
+
+	spin_lock(&mm->page_table_lock);
+	ret = mm->context.pte_frag;
+	if (ret) {
+		pte_frag = ret + PTE_FRAG_SIZE;
+		/*
+		 * If we have taken up all the fragments mark PTE page NULL
+		 */
+		if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+			pte_frag = NULL;
+		mm->context.pte_frag = pte_frag;
+	}
+	spin_unlock(&mm->page_table_lock);
+	return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
+{
+	void *ret = NULL;
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+				       __GFP_REPEAT | __GFP_ZERO);
+	if (!page)
+		return NULL;
+	if (!kernel && !pgtable_page_ctor(page)) {
+		__free_page(page);
+		return NULL;
+	}
+
+	ret = page_address(page);
+	spin_lock(&mm->page_table_lock);
+	/*
+	 * If we find pgtable_page set, we return
+	 * the allocated page with single fragement
+	 * count.
+	 */
+	if (likely(!mm->context.pte_frag)) {
+		atomic_set(&page->_count, PTE_FRAG_NR);
+		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	return (pte_t *)ret;
+}
+
+pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+	pte_t *pte;
+
+	pte = get_from_cache(mm);
+	if (pte)
+		return pte;
+
+	return __alloc_for_cache(mm, kernel);
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+{
+	struct page *page = virt_to_page(table);
+	if (put_page_testzero(page)) {
+		if (!kernel)
+			pgtable_page_dtor(page);
+		free_hot_cold_page(page, 0);
+	}
+}
+
+#ifdef CONFIG_SMP
+static void page_table_free_rcu(void *table)
+{
+	struct page *page = virt_to_page(table);
+	if (put_page_testzero(page)) {
+		pgtable_page_dtor(page);
+		free_hot_cold_page(page, 0);
+	}
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+	unsigned long pgf = (unsigned long)table;
+
+	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+	pgf |= shift;
+	tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+	if (!shift)
+		/* PTE page needs special handling */
+		page_table_free_rcu(table);
+	else {
+		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+		kmem_cache_free(PGT_CACHE(shift), table);
+	}
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+	if (!shift) {
+		/* PTE page needs special handling */
+		struct page *page = virt_to_page(table);
+		if (put_page_testzero(page)) {
+			pgtable_page_dtor(page);
+			free_hot_cold_page(page, 0);
+		}
+	} else {
+		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+		kmem_cache_free(PGT_CACHE(shift), table);
+	}
+}
+#endif
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+			  pmd_t *pmdp, pmd_t entry, int dirty)
+{
+	int changed;
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_trans_huge(*pmdp));
+	assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+	changed = !pmd_same(*(pmdp), entry);
+	if (changed) {
+		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+		/*
+		 * Since we are not supporting SW TLB systems, we don't
+		 * have any thing similar to flush_tlb_page_nohash()
+		 */
+	}
+	return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+				  pmd_t *pmdp, unsigned long clr,
+				  unsigned long set)
+{
+
+	unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_trans_huge(*pmdp));
+	assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3\n\
+		andi.	%1,%0,%6\n\
+		bne-	1b \n\
+		andc	%1,%0,%4 \n\
+		or	%1,%1,%7\n\
+		stdcx.	%1,0,%3 \n\
+		bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
+	: "cc" );
+#else
+	old = pmd_val(*pmdp);
+	*pmdp = __pmd((old & ~clr) | set);
+#endif
+	if (old & _PAGE_HASHPTE)
+		hpte_do_hugepage_flush(mm, addr, pmdp);
+	return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+		       pmd_t *pmdp)
+{
+	pmd_t pmd;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	if (pmd_trans_huge(*pmdp)) {
+		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	} else {
+		/*
+		 * khugepaged calls this for normal pmd
+		 */
+		pmd = *pmdp;
+		pmd_clear(pmdp);
+		/*
+		 * Wait for all pending hash_page to finish. This is needed
+		 * in case of subpage collapse. When we collapse normal pages
+		 * to hugepage, we first clear the pmd, then invalidate all
+		 * the PTE entries. The assumption here is that any low level
+		 * page fault will see a none pmd and take the slow path that
+		 * will wait on mmap_sem. But we could very well be in a
+		 * hash_page with local ptep pointer value. Such a hash page
+		 * can result in adding new HPTE entries for normal subpages.
+		 * That means we could be modifying the page content as we
+		 * copy them to a huge page. So wait for parallel hash_page
+		 * to finish before invalidating HPTE entries. We can do this
+		 * by sending an IPI to all the cpus and executing a dummy
+		 * function there.
+		 */
+		kick_all_cpus_sync();
+		/*
+		 * Now invalidate the hpte entries in the range
+		 * covered by pmd. This make sure we take a
+		 * fault and will find the pmd as none, which will
+		 * result in a major fault which takes mmap_sem and
+		 * hence wait for collapse to complete. Without this
+		 * the __collapse_huge_page_copy can result in copying
+		 * the old content.
+		 */
+		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+	}
+	return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long address, pmd_t *pmdp)
+{
+	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmdp)
+{
+	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	unsigned long old, tmp;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_trans_huge(*pmdp));
+	assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3\n\
+		andi.	%1,%0,%6\n\
+		bne-	1b \n\
+		ori	%1,%0,%4 \n\
+		stdcx.	%1,0,%3 \n\
+		bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+	: "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+	: "cc" );
+#else
+	old = pmd_val(*pmdp);
+	*pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+	/*
+	 * If we didn't had the splitting flag set, go and flush the
+	 * HPTE entries.
+	 */
+	if (!(old & _PAGE_SPLITTING)) {
+		/* We need to flush the hpte */
+		if (old & _PAGE_HASHPTE)
+			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+	}
+	/*
+	 * This ensures that generic code that rely on IRQ disabling
+	 * to prevent a parallel THP split work as expected.
+	 */
+	kick_all_cpus_sync();
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
+{
+	pgtable_t *pgtable_slot;
+	assert_spin_locked(&mm->page_table_lock);
+	/*
+	 * we store the pgtable in the second half of PMD
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	*pgtable_slot = pgtable;
+	/*
+	 * expose the deposited pgtable to other cpus.
+	 * before we set the hugepage PTE at pmd level
+	 * hash fault code looks at the deposted pgtable
+	 * to store hash index values.
+	 */
+	smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_t pgtable;
+	pgtable_t *pgtable_slot;
+
+	assert_spin_locked(&mm->page_table_lock);
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Once we withdraw, mark the entry NULL.
+	 */
+	*pgtable_slot = NULL;
+	/*
+	 * We store HPTE information in the deposited PTE fragment.
+	 * zero out the content on withdraw.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	return pgtable;
+}
+
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+		pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
+	assert_spin_locked(&mm->page_table_lock);
+	WARN_ON(!pmd_trans_huge(pmd));
+#endif
+	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		     pmd_t *pmdp)
+{
+	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+			    pmd_t *pmdp)
+{
+	int ssize, i;
+	unsigned long s_addr;
+	int max_hpte_count;
+	unsigned int psize, valid;
+	unsigned char *hpte_slot_array;
+	unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+	/*
+	 * Flush all the hptes mapping this hugepage
+	 */
+	s_addr = addr & HPAGE_PMD_MASK;
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+	/*
+	 * IF we try to do a HUGE PTE update after a withdraw is done.
+	 * we will find the below NULL. This happens when we do
+	 * split_huge_page_pmd
+	 */
+	if (!hpte_slot_array)
+		return;
+
+	/* get the base page size */
+	psize = get_slice_psize(mm, s_addr);
+
+	if (ppc_md.hugepage_invalidate)
+		return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+						  s_addr, psize);
+	/*
+	 * No bluk hpte removal support, invalidate each entry
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HPAGE_PMD_SIZE >> shift;
+	for (i = 0; i < max_hpte_count; i++) {
+		/*
+		 * 8 bits per each hpte entries
+		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+		 */
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		ppc_md.hpte_invalidate(slot, vpn, psize,
+				       MMU_PAGE_16M, ssize, 0);
+	}
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+	pmd_val(pmd) |= pgprot_val(pgprot);
+	return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+	pmd_t pmd;
+	/*
+	 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+	 * set. We use this to check THP page at pmd level.
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+	pmd_val(pmd) |= _PAGE_THP_HUGE;
+	pmd = pmd_set_protbits(pmd, pgprot);
+	return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+	return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+	pmd_val(pmd) &= _HPAGE_CHG_MASK;
+	pmd = pmd_set_protbits(pmd, newprot);
+	return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+			  pmd_t *pmd)
+{
+	return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+			 unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t old_pmd;
+	pgtable_t pgtable;
+	unsigned long old;
+	pgtable_t *pgtable_slot;
+
+	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+	old_pmd = __pmd(old);
+	/*
+	 * We have pmd == none and we are holding page_table_lock.
+	 * So we can safely go and clear the pgtable hash
+	 * index info.
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Let's zero out old valid and hash index details
+	 * hash fault look at them.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	return old_pmd;
+}
+
+int has_transparent_hugepage(void)
+{
+	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+		return 0;
+	/*
+	 * We support THP only if PMD_SIZE is 16MB.
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+		return 0;
+	/*
+	 * We need to make sure that we support 16MB hugepage in a segement
+	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+	 * of 64K.
+	 */
+	/*
+	 * If we have 64K HPTE, we will be using that by default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+		return 0;
+	/*
+	 * Ok we only have 4K HPTE
+	 */
+	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+		return 0;
+
+	return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 5c45d474cfc..11571e11883 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -26,11 +26,11 @@
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/memblock.h>
 
 #include <asm/prom.h>
 #include <asm/mmu.h>
 #include <asm/machdep.h>
-#include <asm/lmb.h>
 
 #include "mmu_decl.h"
 
@@ -38,21 +38,18 @@ struct hash_pte *Hash, *Hash_end;
 unsigned long Hash_size, Hash_mask;
 unsigned long _SDR1;
 
-union ubat {			/* BAT register values to be loaded */
-	struct ppc_bat bat;
-	u32	word[2];
-} BATS[8][2];			/* 8 pairs of IBAT, DBAT */
+struct ppc_bat BATS[8][2];	/* 8 pairs of IBAT, DBAT */
 
 struct batrange {		/* stores address ranges mapped by BATs */
 	unsigned long start;
 	unsigned long limit;
-	unsigned long phys;
+	phys_addr_t phys;
 } bat_addrs[8];
 
 /*
  * Return PA for this VA if it is mapped by a BAT, or 0
  */
-unsigned long v_mapped_by_bats(unsigned long va)
+phys_addr_t v_mapped_by_bats(unsigned long va)
 {
 	int b;
 	for (b = 0; b < 4; ++b)
@@ -64,7 +61,7 @@ unsigned long v_mapped_by_bats(unsigned long va)
 /*
  * Return VA for a given PA or 0 if not mapped
  */
-unsigned long p_mapped_by_bats(unsigned long pa)
+unsigned long p_mapped_by_bats(phys_addr_t pa)
 {
 	int b;
 	for (b = 0; b < 4; ++b)
@@ -75,14 +72,10 @@ unsigned long p_mapped_by_bats(unsigned long pa)
 	return 0;
 }
 
-unsigned long __init mmu_mapin_ram(void)
+unsigned long __init mmu_mapin_ram(unsigned long top)
 {
-#ifdef CONFIG_POWER4
-	return 0;
-#else
 	unsigned long tot, bl, done;
 	unsigned long max_size = (256<<20);
-	unsigned long align;
 
 	if (__map_without_bats) {
 		printk(KERN_DEBUG "RAM mapped without BATs\n");
@@ -93,32 +86,25 @@ unsigned long __init mmu_mapin_ram(void)
 
 	/* Make sure we don't map a block larger than the
 	   smallest alignment of the physical address. */
-	/* alignment of PPC_MEMSTART */
-	align = ~(PPC_MEMSTART-1) & PPC_MEMSTART;
-	/* set BAT block size to MIN(max_size, align) */
-	if (align && align < max_size)
-		max_size = align;
-
-	tot = total_lowmem;
+	tot = top;
 	for (bl = 128<<10; bl < max_size; bl <<= 1) {
 		if (bl * 2 > tot)
 			break;
 	}
 
-	setbat(2, KERNELBASE, PPC_MEMSTART, bl, _PAGE_RAM);
-	done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1;
+	setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X);
+	done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
 	if ((done < tot) && !bat_addrs[3].limit) {
 		/* use BAT3 to cover a bit more */
 		tot -= done;
 		for (bl = 128<<10; bl < max_size; bl <<= 1)
 			if (bl * 2 > tot)
 				break;
-		setbat(3, KERNELBASE+done, PPC_MEMSTART+done, bl, _PAGE_RAM);
-		done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1;
+		setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
+		done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
 	}
 
 	return done;
-#endif
 }
 
 /*
@@ -126,16 +112,16 @@ unsigned long __init mmu_mapin_ram(void)
  * The parameters are not checked; in particular size must be a power
  * of 2 between 128k and 256M.
  */
-void __init setbat(int index, unsigned long virt, unsigned long phys,
+void __init setbat(int index, unsigned long virt, phys_addr_t phys,
 		   unsigned int size, int flags)
 {
 	unsigned int bl;
 	int wimgxpp;
-	union ubat *bat = BATS[index];
+	struct ppc_bat *bat = BATS[index];
 
-	if (((flags & _PAGE_NO_CACHE) == 0) &&
-	    cpu_has_feature(CPU_FTR_NEED_COHERENT))
-		flags |= _PAGE_COHERENT;
+	if ((flags & _PAGE_NO_CACHE) ||
+	    (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
+		flags &= ~_PAGE_COHERENT;
 
 	bl = (size >> 17) - 1;
 	if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
@@ -144,15 +130,13 @@ void __init setbat(int index, unsigned long virt, unsigned long phys,
 		wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
 				   | _PAGE_COHERENT | _PAGE_GUARDED);
 		wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
-		bat[1].word[0] = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
-		bat[1].word[1] = phys | wimgxpp;
-#ifndef CONFIG_KGDB /* want user access for breakpoints */
+		bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+		bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
 		if (flags & _PAGE_USER)
-#endif
-			bat[1].bat.batu.vp = 1;
+			bat[1].batu |= 1; 	/* Vp = 1 */
 		if (flags & _PAGE_GUARDED) {
 			/* G bit must be zero in IBATs */
-			bat[0].word[0] = bat[0].word[1] = 0;
+			bat[0].batu = bat[0].batl = 0;
 		} else {
 			/* make IBAT same as DBAT */
 			bat[0] = bat[1];
@@ -165,8 +149,8 @@ void __init setbat(int index, unsigned long virt, unsigned long phys,
 				   | _PAGE_COHERENT);
 		wimgxpp |= (flags & _PAGE_RW)?
 			((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
-		bat->word[0] = virt | wimgxpp | 4;	/* Ks=0, Ku=1 */
-		bat->word[1] = phys | bl | 0x40;	/* V=1 */
+		bat->batu = virt | wimgxpp | 4;	/* Ks=0, Ku=1 */
+		bat->batl = phys | bl | 0x40;	/* V=1 */
 	}
 
 	bat_addrs[index].start = virt;
@@ -202,7 +186,7 @@ void __init MMU_init_hw(void)
 	extern unsigned int hash_page[];
 	extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
 
-	if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) {
+	if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
 		/*
 		 * Put a blr (procedure return) instruction at the
 		 * start of hash_page, since we can still get DSI
@@ -239,15 +223,14 @@ void __init MMU_init_hw(void)
 	 * Find some memory for the hash table.
 	 */
 	if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
-	Hash = __va(lmb_alloc_base(Hash_size, Hash_size,
-				   __initial_memory_limit));
+	Hash = __va(memblock_alloc(Hash_size, Hash_size));
 	cacheable_memzero(Hash, Hash_size);
 	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
 
 	Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
 
-	printk("Total memory = %ldMB; using %ldkB for hash table (at %p)\n",
-	       total_memory >> 20, Hash_size >> 10, Hash);
+	printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
+	       (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
 
 
 	/*
@@ -288,3 +271,18 @@ void __init MMU_init_hw(void)
 
 	if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
 }
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
+{
+	/* We don't currently support the first MEMBLOCK not mapping 0
+	 * physical on those processors
+	 */
+	BUG_ON(first_memblock_base != 0);
+
+	/* 601 can only access 16MB at the moment */
+	if (PVR_VER(mfspr(SPRN_PVR)) == 1)
+		memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
+	else /* Anything else has 256M mapped */
+		memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
+}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 906daeda59a..0399a670295 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -2,7 +2,7 @@
  * PowerPC64 SLB support.
  *
  * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code writteh by:
+ * Based on earlier code written by:
  * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
  *    Copyright (c) 2001 Dave Engebretsen
  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
@@ -14,8 +14,6 @@
  *      2 of the License, or (at your option) any later version.
  */
 
-#undef DEBUG
-
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
@@ -23,15 +21,10 @@
 #include <asm/cputable.h>
 #include <asm/cacheflush.h>
 #include <asm/smp.h>
-#include <asm/firmware.h>
 #include <linux/compiler.h>
 #include <asm/udbg.h>
+#include <asm/code-patching.h>
 
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
 
 extern void slb_allocate_realmode(unsigned long ea);
 extern void slb_allocate_user(unsigned long ea);
@@ -44,13 +37,13 @@ static void slb_allocate(unsigned long ea)
 	slb_allocate_realmode(ea);
 }
 
+#define slb_esid_mask(ssize)	\
+	(((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+
 static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
 					 unsigned long slot)
 {
-	unsigned long mask;
-
-	mask = (ssize == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T;
-	return (ea & mask) | SLB_ESID_V | slot;
+	return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
 }
 
 #define slb_vsid_shift(ssize)	\
@@ -73,8 +66,10 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
 	 * we only update the current CPU's SLB shadow buffer.
 	 */
 	get_slb_shadow()->save_area[entry].esid = 0;
-	get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags);
-	get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry);
+	get_slb_shadow()->save_area[entry].vsid =
+				cpu_to_be64(mk_vsid_data(ea, ssize, flags));
+	get_slb_shadow()->save_area[entry].esid =
+				cpu_to_be64(mk_esid_data(ea, ssize, entry));
 }
 
 static inline void slb_shadow_clear(unsigned long entry)
@@ -99,15 +94,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
 		     : "memory" );
 }
 
-void slb_flush_and_rebolt(void)
+static void __slb_flush_and_rebolt(void)
 {
 	/* If you change this make sure you change SLB_NUM_BOLTED
-	 * appropriately too. */
+	 * and PR KVM appropriately too. */
 	unsigned long linear_llp, vmalloc_llp, lflags, vflags;
 	unsigned long ksp_esid_data, ksp_vsid_data;
 
-	WARN_ON(!irqs_disabled());
-
 	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
 	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
 	lflags = SLB_VSID_KERNEL | linear_llp;
@@ -121,15 +114,10 @@ void slb_flush_and_rebolt(void)
 	} else {
 		/* Update stack entry; others don't change */
 		slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
-		ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
+		ksp_vsid_data =
+			be64_to_cpu(get_slb_shadow()->save_area[2].vsid);
 	}
 
-	/*
-	 * We can't take a PMU exception in the following code, so hard
-	 * disable interrupts.
-	 */
-	hard_irq_disable();
-
 	/* We need to do this all in asm, so we're sure we don't touch
 	 * the stack between the slbia and rebolting it. */
 	asm volatile("isync\n"
@@ -146,6 +134,21 @@ void slb_flush_and_rebolt(void)
 		     : "memory");
 }
 
+void slb_flush_and_rebolt(void)
+{
+
+	WARN_ON(!irqs_disabled());
+
+	/*
+	 * We can't take a PMU exception in the following code, so hard
+	 * disable interrupts.
+	 */
+	hard_irq_disable();
+
+	__slb_flush_and_rebolt();
+	get_paca()->slb_cache_ptr = 0;
+}
+
 void slb_vmalloc_update(void)
 {
 	unsigned long vflags;
@@ -166,7 +169,7 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
 	int esid_1t_count;
 
 	/* System is not 1T segment size capable. */
-	if (!cpu_has_feature(CPU_FTR_1T_SEGMENT))
+	if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		return (GET_ESID(addr1) == GET_ESID(addr2));
 
 	esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
@@ -187,13 +190,21 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
 /* Flush all user entries from the segment table of the current processor. */
 void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-	unsigned long offset = get_paca()->slb_cache_ptr;
+	unsigned long offset;
 	unsigned long slbie_data = 0;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
-	unsigned long unmapped_base;
+	unsigned long exec_base;
 
-	if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possible cause an SLB miss
+	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+	 */
+	hard_irq_disable();
+	offset = get_paca()->slb_cache_ptr;
+	if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
 	    offset <= SLB_CACHE_ENTRIES) {
 		int i;
 		asm volatile("isync" : : : "memory");
@@ -207,7 +218,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		}
 		asm volatile("isync" : : : "memory");
 	} else {
-		slb_flush_and_rebolt();
+		__slb_flush_and_rebolt();
 	}
 
 	/* Workaround POWER5 < DD2.1 issue */
@@ -219,40 +230,45 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 
 	/*
 	 * preload some userspace segments into the SLB.
+	 * Almost all 32 and 64bit PowerPC executables are linked at
+	 * 0x10000000 so it makes sense to preload this segment.
 	 */
-	if (test_tsk_thread_flag(tsk, TIF_32BIT))
-		unmapped_base = TASK_UNMAPPED_BASE_USER32;
-	else
-		unmapped_base = TASK_UNMAPPED_BASE_USER64;
-
-	if (is_kernel_addr(pc))
-		return;
-	slb_allocate(pc);
+	exec_base = 0x10000000;
 
-	if (esids_match(pc,stack))
+	if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
+	    is_kernel_addr(exec_base))
 		return;
 
-	if (is_kernel_addr(stack))
-		return;
-	slb_allocate(stack);
+	slb_allocate(pc);
 
-	if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base))
-		return;
+	if (!esids_match(pc, stack))
+		slb_allocate(stack);
 
-	if (is_kernel_addr(unmapped_base))
-		return;
-	slb_allocate(unmapped_base);
+	if (!esids_match(pc, exec_base) &&
+	    !esids_match(stack, exec_base))
+		slb_allocate(exec_base);
 }
 
 static inline void patch_slb_encoding(unsigned int *insn_addr,
 				      unsigned int immed)
 {
-	/* Assume the instruction had a "0" immediate value, just
-	 * "or" in the new value
-	 */
-	*insn_addr |= immed;
-	flush_icache_range((unsigned long)insn_addr, 4+
-			   (unsigned long)insn_addr);
+	int insn = (*insn_addr & 0xffff0000) | immed;
+	patch_instruction(insn_addr, insn);
+}
+
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_linear[];
+extern u32 slb_miss_kernel_load_io[];
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_vmemmap[];
+
+void slb_set_size(u16 size)
+{
+	if (mmu_slb_size == size)
+		return;
+
+	mmu_slb_size = size;
+	patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
 }
 
 void slb_initialize(void)
@@ -260,16 +276,18 @@ void slb_initialize(void)
 	unsigned long linear_llp, vmalloc_llp, io_llp;
 	unsigned long lflags, vflags;
 	static int slb_encoding_inited;
-	extern unsigned int *slb_miss_kernel_load_linear;
-	extern unsigned int *slb_miss_kernel_load_io;
-	extern unsigned int *slb_compare_rr_to_size;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	unsigned long vmemmap_llp;
+#endif
 
 	/* Prepare our SLB miss handler based on our page size */
 	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
 	io_llp = mmu_psize_defs[mmu_io_psize].sllp;
 	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
 	get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
 	if (!slb_encoding_inited) {
 		slb_encoding_inited = 1;
 		patch_slb_encoding(slb_miss_kernel_load_linear,
@@ -279,17 +297,18 @@ void slb_initialize(void)
 		patch_slb_encoding(slb_compare_rr_to_size,
 				   mmu_slb_size);
 
-		DBG("SLB: linear  LLP = %04x\n", linear_llp);
-		DBG("SLB: io      LLP = %04x\n", io_llp);
+		pr_devel("SLB: linear  LLP = %04lx\n", linear_llp);
+		pr_devel("SLB: io      LLP = %04lx\n", io_llp);
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+		patch_slb_encoding(slb_miss_kernel_load_vmemmap,
+				   SLB_VSID_KERNEL | vmemmap_llp);
+		pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
 	}
 
 	get_paca()->stab_rr = SLB_NUM_BOLTED;
 
-	/* On iSeries the bolted entries have already been set up by
-	 * the hypervisor from the lparMap data in head.S */
-	if (firmware_has_feature(FW_FEATURE_ISERIES))
-		return;
-
 	lflags = SLB_VSID_KERNEL | linear_llp;
 	vflags = SLB_VSID_KERNEL | vmalloc_llp;
 
@@ -301,11 +320,16 @@ void slb_initialize(void)
 
 	create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
 
+	/* For the boot cpu, we're running on the stack in init_thread_union,
+	 * which is in the first segment of the linear mapping, and also
+	 * get_paca()->kstack hasn't been initialized yet.
+	 * For secondary cpus, we need to bolt the kernel stack entry now.
+	 */
 	slb_shadow_clear(2);
+	if (raw_smp_processor_id() != boot_cpuid &&
+	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+		create_shadowed_slbe(get_paca()->kstack,
+				     mmu_kernel_ssize, lflags, 2);
 
-	/* We don't bolt the stack for the time being - we're in boot,
-	 * so the stack is in the bolted segment.  By the time it goes
-	 * elsewhere, we'll call _switch() which will bolt in the new
-	 * one. */
 	asm volatile("isync":::"memory");
 }
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 657f6b37e9d..736d18b3cef 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -31,10 +31,15 @@
  * No other registers are examined or changed.
  */
 _GLOBAL(slb_allocate_realmode)
-	/* r3 = faulting address */
+	/*
+	 * check for bad kernel/user address
+	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+	 */
+	rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
+	bne-	8f
 
 	srdi	r9,r3,60		/* get region */
-	srdi	r10,r3,28		/* get esid */
+	srdi	r10,r3,SID_SHIFT	/* get esid */
 	cmpldi	cr7,r9,0xc		/* cmp PAGE_OFFSET for later use */
 
 	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
@@ -47,48 +52,68 @@ _GLOBAL(slb_allocate_realmode)
 	 * it to VSID 0, which is reserved as a bad VSID - one which
 	 * will never have any pages in it.  */
 
-	/* Check if hitting the linear mapping of the vmalloc/ioremap
-	 * kernel space
+	/* Check if hitting the linear mapping or some other kernel space
 	*/
 	bne	cr7,1f
 
 	/* Linear mapping encoding bits, the "li" instruction below will
 	 * be patched by the kernel at boot
 	 */
-_GLOBAL(slb_miss_kernel_load_linear)
+.globl slb_miss_kernel_load_linear
+slb_miss_kernel_load_linear:
 	li	r11,0
+	/*
+	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * r9 = region id.
+	 */
+	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
+
 BEGIN_FTR_SECTION
 	b	slb_finish_load
-END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 	b	slb_finish_load_1T
 
-1:	/* vmalloc/ioremap mapping encoding bits, the "li" instructions below
-	 * will be patched by the kernel at boot
+1:
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	/* Check virtual memmap region. To be patches at kernel boot */
+	cmpldi	cr0,r9,0xf
+	bne	1f
+.globl slb_miss_kernel_load_vmemmap
+slb_miss_kernel_load_vmemmap:
+	li	r11,0
+	b	6f
+1:
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+	/* vmalloc mapping gets the encoding from the PACA as the mapping
+	 * can be demoted from 64K -> 4K dynamically on some machines
 	 */
-BEGIN_FTR_SECTION
-	/* check whether this is in vmalloc or ioremap space */
 	clrldi	r11,r10,48
 	cmpldi	r11,(VMALLOC_SIZE >> 28) - 1
 	bgt	5f
 	lhz	r11,PACAVMALLOCSLLP(r13)
 	b	6f
 5:
-END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
-_GLOBAL(slb_miss_kernel_load_io)
+	/* IO mapping */
+.globl slb_miss_kernel_load_io
+slb_miss_kernel_load_io:
 	li	r11,0
 6:
+	/*
+	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * r9 = region id.
+	 */
+	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
 BEGIN_FTR_SECTION
 	b	slb_finish_load
-END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 	b	slb_finish_load_1T
 
-0:	/* user address: proto-VSID = context << 15 | ESID. First check
-	 * if the address is within the boundaries of the user region
-	 */
-	srdi.	r9,r10,USER_ESID_BITS
-	bne-	8f			/* invalid ea bits set */
-
-
+0:
 	/* when using slices, we extract the psize off the slice bitmaps
 	 * and then we need to get the sllp encoding off the mmu_psize_defs
 	 * array.
@@ -100,17 +125,31 @@ END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
 	 * between 4k and 64k standard page size
 	 */
 #ifdef CONFIG_PPC_MM_SLICES
+	/* r10 have esid */
 	cmpldi	r10,16
-
-	/* Get the slice index * 4 in r11 and matching slice size mask in r9 */
-	ld	r9,PACALOWSLICESPSIZE(r13)
-	sldi	r11,r10,2
+	/* below SLICE_LOW_TOP */
 	blt	5f
-	ld	r9,PACAHIGHSLICEPSIZE(r13)
-	srdi	r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
-	andi.	r11,r11,0x3c
+	/*
+	 * Handle hpsizes,
+	 * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
+	 */
+	srdi    r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
+	addi	r9,r11,PACAHIGHSLICEPSIZE
+	lbzx	r9,r13,r9		/* r9 is hpsizes[r11] */
+	/* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
+	rldicl	r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
+	b	6f
 
-5:	/* Extract the psize and multiply to get an array offset */
+5:
+	/*
+	 * Handle lpsizes
+	 * r9 is get_paca()->context.low_slices_psize, r11 is index
+	 */
+	ld	r9,PACALOWSLICESPSIZE(r13)
+	mr	r11,r10
+6:
+	sldi	r11,r11,2  /* index * 4 */
+	/* Extract the psize and multiply to get an array offset */
 	srd	r9,r9,r11
 	andi.	r9,r9,0xf
 	mulli	r9,r9,MMUPSIZEDEFSIZE
@@ -130,15 +169,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
 	ld	r9,PACACONTEXTID(r13)
 BEGIN_FTR_SECTION
 	cmpldi	r10,0x1000
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
-	rldimi	r10,r9,USER_ESID_BITS,0
-BEGIN_FTR_SECTION
 	bge	slb_finish_load_1T
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	b	slb_finish_load
 
 8:	/* invalid EA */
 	li	r10,0			/* BAD_VSID */
+	li	r9,0			/* BAD_VSID */
 	li	r11,SLB_VSID_USER	/* flags don't much matter */
 	b	slb_finish_load
 
@@ -187,8 +224,6 @@ _GLOBAL(slb_allocate_user)
 
 	/* get context to calculate proto-VSID */
 	ld	r9,PACACONTEXTID(r13)
-	rldimi	r10,r9,USER_ESID_BITS,0
-
 	/* fall through slb_finish_load */
 
 #endif /* __DISABLED__ */
@@ -197,11 +232,16 @@ _GLOBAL(slb_allocate_user)
 /*
  * Finish loading of an SLB entry and return
  *
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
+ * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
  */
 slb_finish_load:
+	rldimi  r10,r9,ESID_BITS,0
 	ASM_VSID_SCRAMBLE(r10,r9,256M)
-	rldimi	r11,r10,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+	/*
+	 * bits above VSID_BITS_256M need to be ignored from r10
+	 * also combine VSID and flags
+	 */
+	rldimi	r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
 
 	/* r3 = EA, r11 = VSID data */
 	/*
@@ -209,26 +249,12 @@ slb_finish_load:
 	 * free slot first but that took too long. Unfortunately we
  	 * dont have any LRU information to help us choose a slot.
  	 */
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	/*
-	 * On iSeries, the "bolted" stack segment can be cast out on
-	 * shared processor switch so we need to check for a miss on
-	 * it and restore it to the right slot.
-	 */
-	ld	r9,PACAKSAVE(r13)
-	clrrdi	r9,r9,28
-	clrrdi	r3,r3,28
-	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
-	cmpld	r9,r3
-	beq	3f
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
 
 7:	ld	r10,PACASTABRR(r13)
 	addi	r10,r10,1
 	/* This gets soft patched on boot. */
-_GLOBAL(slb_compare_rr_to_size)
+.globl slb_compare_rr_to_size
+slb_compare_rr_to_size:
 	cmpldi	r10,0
 
 	blt+	4f
@@ -259,10 +285,10 @@ _GLOBAL(slb_compare_rr_to_size)
 	bge	1f
 
 	/* still room in the slb cache */
-	sldi	r11,r3,1		/* r11 = offset * sizeof(u16) */
-	rldicl	r10,r10,36,28		/* get low 16 bits of the ESID */
-	add	r11,r11,r13		/* r11 = (u16 *)paca + offset */
-	sth	r10,PACASLBCACHE(r11)	/* paca->slb_cache[offset] = esid */
+	sldi	r11,r3,2		/* r11 = offset * sizeof(u32) */
+	srdi    r10,r10,28		/* get the 36 bits of the ESID */
+	add	r11,r11,r13		/* r11 = (u32 *)paca + offset */
+	stw	r10,PACASLBCACHE(r11)	/* paca->slb_cache[offset] = esid */
 	addi	r3,r3,1			/* offset++ */
 	b	2f
 1:					/* offset >= SLB_CACHE_ENTRIES */
@@ -274,14 +300,18 @@ _GLOBAL(slb_compare_rr_to_size)
 
 /*
  * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
- * We assume legacy iSeries will never have 1T segments.
  *
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9
+ * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
  */
 slb_finish_load_1T:
-	srdi	r10,r10,40-28		/* get 1T ESID */
+	srdi	r10,r10,(SID_SHIFT_1T - SID_SHIFT)	/* get 1T ESID */
+	rldimi  r10,r9,ESID_BITS_1T,0
 	ASM_VSID_SCRAMBLE(r10,r9,1T)
-	rldimi	r11,r10,SLB_VSID_SHIFT_1T,16	/* combine VSID and flags */
+	/*
+	 * bits above VSID_BITS_1T need to be ignored from r10
+	 * also combine VSID and flags
+	 */
+	rldimi	r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
 	li	r10,MMU_SEGSIZE_1T
 	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
 
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index ad928edafb0..b0c75cc15ef 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -29,11 +29,16 @@
 #include <linux/pagemap.h>
 #include <linux/err.h>
 #include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <asm/mman.h>
 #include <asm/mmu.h>
 #include <asm/spu.h>
 
+/* some sanity checks */
+#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
+#error PGTABLE_RANGE exceeds slice_mask high_slices size
+#endif
+
 static DEFINE_SPINLOCK(slice_convert_lock);
 
 
@@ -42,7 +47,7 @@ int _slice_debug = 1;
 
 static void slice_print_mask(const char *label, struct slice_mask mask)
 {
-	char	*p, buf[16 + 3 + 16 + 1];
+	char	*p, buf[16 + 3 + 64 + 1];
 	int	i;
 
 	if (!_slice_debug)
@@ -54,7 +59,7 @@ static void slice_print_mask(const char *label, struct slice_mask mask)
 	*(p++) = '-';
 	*(p++) = ' ';
 	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		*(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
+		*(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
 	*(p++) = 0;
 
 	printk(KERN_DEBUG "%s:%s\n", label, buf);
@@ -84,8 +89,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
 	}
 
 	if ((start + len) > SLICE_LOW_TOP)
-		ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
-			- (1u << GET_HIGH_SLICE_INDEX(start));
+		ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
+			- (1ul << GET_HIGH_SLICE_INDEX(start));
 
 	return ret;
 }
@@ -135,26 +140,31 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
 
 	for (i = 0; i < SLICE_NUM_HIGH; i++)
 		if (!slice_high_has_vma(mm, i))
-			ret.high_slices |= 1u << i;
+			ret.high_slices |= 1ul << i;
 
 	return ret;
 }
 
 static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
 {
+	unsigned char *hpsizes;
+	int index, mask_index;
 	struct slice_mask ret = { 0, 0 };
 	unsigned long i;
-	u64 psizes;
+	u64 lpsizes;
 
-	psizes = mm->context.low_slices_psize;
+	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
-		if (((psizes >> (i * 4)) & 0xf) == psize)
+		if (((lpsizes >> (i * 4)) & 0xf) == psize)
 			ret.low_slices |= 1u << i;
 
-	psizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		if (((psizes >> (i * 4)) & 0xf) == psize)
-			ret.high_slices |= 1u << i;
+	hpsizes = mm->context.high_slices_psize;
+	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+		mask_index = i & 0x1;
+		index = i >> 1;
+		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
+			ret.high_slices |= 1ul << i;
+	}
 
 	return ret;
 }
@@ -183,8 +193,10 @@ static void slice_flush_segments(void *parm)
 
 static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
 {
+	int index, mask_index;
 	/* Write the new slice psize bits */
-	u64 lpsizes, hpsizes;
+	unsigned char *hpsizes;
+	u64 lpsizes;
 	unsigned long i, flags;
 
 	slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
@@ -201,157 +213,136 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
 				(((unsigned long)psize) << (i * 4));
 
-	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		if (mask.high_slices & (1u << i))
-			hpsizes = (hpsizes & ~(0xful << (i * 4))) |
-				(((unsigned long)psize) << (i * 4));
-
+	/* Assign the value back */
 	mm->context.low_slices_psize = lpsizes;
-	mm->context.high_slices_psize = hpsizes;
+
+	hpsizes = mm->context.high_slices_psize;
+	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+		mask_index = i & 0x1;
+		index = i >> 1;
+		if (mask.high_slices & (1ul << i))
+			hpsizes[index] = (hpsizes[index] &
+					  ~(0xf << (mask_index * 4))) |
+				(((unsigned long)psize) << (mask_index * 4));
+	}
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
 		  mm->context.low_slices_psize,
 		  mm->context.high_slices_psize);
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
-	mb();
 
-	/* XXX this is sub-optimal but will do for now */
-	on_each_cpu(slice_flush_segments, mm, 0, 1);
 #ifdef CONFIG_SPU_BASE
 	spu_flush_all_slbs(mm);
 #endif
 }
 
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+				 struct slice_mask available,
+				 int end,
+				 unsigned long *boundary_addr)
+{
+	unsigned long slice;
+	if (addr < SLICE_LOW_TOP) {
+		slice = GET_LOW_SLICE_INDEX(addr);
+		*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+		return !!(available.low_slices & (1u << slice));
+	} else {
+		slice = GET_HIGH_SLICE_INDEX(addr);
+		*boundary_addr = (slice + end) ?
+			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+		return !!(available.high_slices & (1ul << slice));
+	}
+}
+
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
 					      struct slice_mask available,
-					      int psize, int use_cache)
+					      int psize)
 {
-	struct vm_area_struct *vma;
-	unsigned long start_addr, addr;
-	struct slice_mask mask;
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
-
-	if (use_cache) {
-		if (len <= mm->cached_hole_size) {
-			start_addr = addr = TASK_UNMAPPED_BASE;
-			mm->cached_hole_size = 0;
-		} else
-			start_addr = addr = mm->free_area_cache;
-	} else
-		start_addr = addr = TASK_UNMAPPED_BASE;
-
-full_search:
-	for (;;) {
-		addr = _ALIGN_UP(addr, 1ul << pshift);
-		if ((TASK_SIZE - len) < addr)
-			break;
-		vma = find_vma(mm, addr);
-		BUG_ON(vma && (addr >= vma->vm_end));
-
-		mask = slice_range_to_mask(addr, len);
-		if (!slice_check_fit(mask, available)) {
-			if (addr < SLICE_LOW_TOP)
-				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_LOW_SHIFT);
-			else
-				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_HIGH_SHIFT);
+	unsigned long addr, found, next_end;
+	struct vm_unmapped_area_info info;
+
+	info.flags = 0;
+	info.length = len;
+	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+	info.align_offset = 0;
+
+	addr = TASK_UNMAPPED_BASE;
+	while (addr < TASK_SIZE) {
+		info.low_limit = addr;
+		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
+
+ next_slice:
+		/*
+		 * At this point [info.low_limit; addr) covers
+		 * available slices only and ends at a slice boundary.
+		 * Check if we need to reduce the range, or if we can
+		 * extend it to cover the next available slice.
+		 */
+		if (addr >= TASK_SIZE)
+			addr = TASK_SIZE;
+		else if (slice_scan_available(addr, available, 1, &next_end)) {
+			addr = next_end;
+			goto next_slice;
 		}
-		if (!vma || addr + len <= vma->vm_start) {
-			/*
-			 * Remember the place where we stopped the search:
-			 */
-			if (use_cache)
-				mm->free_area_cache = addr + len;
-			return addr;
-		}
-		if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
-		        mm->cached_hole_size = vma->vm_start - addr;
-		addr = vma->vm_end;
-	}
+		info.high_limit = addr;
 
-	/* Make sure we didn't miss any holes */
-	if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
-		start_addr = addr = TASK_UNMAPPED_BASE;
-		mm->cached_hole_size = 0;
-		goto full_search;
+		found = vm_unmapped_area(&info);
+		if (!(found & ~PAGE_MASK))
+			return found;
 	}
+
 	return -ENOMEM;
 }
 
 static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 					     unsigned long len,
 					     struct slice_mask available,
-					     int psize, int use_cache)
+					     int psize)
 {
-	struct vm_area_struct *vma;
-	unsigned long addr;
-	struct slice_mask mask;
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long addr, found, prev;
+	struct vm_unmapped_area_info info;
 
-	/* check if free_area_cache is useful for us */
-	if (use_cache) {
-		if (len <= mm->cached_hole_size) {
-			mm->cached_hole_size = 0;
-			mm->free_area_cache = mm->mmap_base;
-		}
-
-		/* either no address requested or can't fit in requested
-		 * address hole
-		 */
-		addr = mm->free_area_cache;
-
-		/* make sure it can fit in the remaining address space */
-		if (addr > len) {
-			addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
-			mask = slice_range_to_mask(addr, len);
-			if (slice_check_fit(mask, available) &&
-			    slice_area_is_free(mm, addr, len))
-					/* remember the address as a hint for
-					 * next time
-					 */
-					return (mm->free_area_cache = addr);
-		}
-	}
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+	info.align_offset = 0;
 
 	addr = mm->mmap_base;
-	while (addr > len) {
-		/* Go down by chunk size */
-		addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
-
-		/* Check for hit with different page size */
-		mask = slice_range_to_mask(addr, len);
-		if (!slice_check_fit(mask, available)) {
-			if (addr < SLICE_LOW_TOP)
-				addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
-			else if (addr < (1ul << SLICE_HIGH_SHIFT))
-				addr = SLICE_LOW_TOP;
-			else
-				addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+	while (addr > PAGE_SIZE) {
+		info.high_limit = addr;
+		if (!slice_scan_available(addr - 1, available, 0, &addr))
 			continue;
-		}
 
+ prev_slice:
 		/*
-		 * Lookup failure means no vma is above this address,
-		 * else if new region fits below vma->vm_start,
-		 * return with success:
+		 * At this point [addr; info.high_limit) covers
+		 * available slices only and starts at a slice boundary.
+		 * Check if we need to reduce the range, or if we can
+		 * extend it to cover the previous available slice.
 		 */
-		vma = find_vma(mm, addr);
-		if (!vma || (addr + len) <= vma->vm_start) {
-			/* remember the address as a hint for next time */
-			if (use_cache)
-				mm->free_area_cache = addr;
-			return addr;
+		if (addr < PAGE_SIZE)
+			addr = PAGE_SIZE;
+		else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+			addr = prev;
+			goto prev_slice;
 		}
+		info.low_limit = addr;
 
-		/* remember the largest hole we saw so far */
-		if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
-		        mm->cached_hole_size = vma->vm_start - addr;
-
-		/* try just below the current vma->vm_start */
-		addr = vma->vm_start;
+		found = vm_unmapped_area(&info);
+		if (!(found & ~PAGE_MASK))
+			return found;
 	}
 
 	/*
@@ -360,48 +351,55 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	addr = slice_find_area_bottomup(mm, len, available, psize, 0);
-
-	/*
-	 * Restore the topdown base:
-	 */
-	if (use_cache) {
-		mm->free_area_cache = mm->mmap_base;
-		mm->cached_hole_size = ~0UL;
-	}
-
-	return addr;
+	return slice_find_area_bottomup(mm, len, available, psize);
 }
 
 
 static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 				     struct slice_mask mask, int psize,
-				     int topdown, int use_cache)
+				     int topdown)
 {
 	if (topdown)
-		return slice_find_area_topdown(mm, len, mask, psize, use_cache);
+		return slice_find_area_topdown(mm, len, mask, psize);
 	else
-		return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
+		return slice_find_area_bottomup(mm, len, mask, psize);
 }
 
+#define or_mask(dst, src)	do {			\
+	(dst).low_slices |= (src).low_slices;		\
+	(dst).high_slices |= (src).high_slices;		\
+} while (0)
+
+#define andnot_mask(dst, src)	do {			\
+	(dst).low_slices &= ~(src).low_slices;		\
+	(dst).high_slices &= ~(src).high_slices;	\
+} while (0)
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_PAGE_BASE	MMU_PAGE_64K
+#else
+#define MMU_PAGE_BASE	MMU_PAGE_4K
+#endif
+
 unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 				      unsigned long flags, unsigned int psize,
-				      int topdown, int use_cache)
+				      int topdown)
 {
-	struct slice_mask mask;
+	struct slice_mask mask = {0, 0};
 	struct slice_mask good_mask;
 	struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
-	int pmask_set = 0;
+	struct slice_mask compat_mask = {0, 0};
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	struct mm_struct *mm = current->mm;
+	unsigned long newaddr;
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
 
 	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
-	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
-		  addr, len, flags, topdown, use_cache);
+	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+		  addr, len, flags, topdown);
 
 	if (len > mm->task_size)
 		return -ENOMEM;
@@ -410,27 +408,54 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (fixed && (addr & ((1ul << pshift) - 1)))
 		return -EINVAL;
 	if (fixed && addr > (mm->task_size - len))
-		return -EINVAL;
+		return -ENOMEM;
 
 	/* If hint, make sure it matches our alignment restrictions */
 	if (!fixed && addr) {
 		addr = _ALIGN_UP(addr, 1ul << pshift);
 		slice_dbg(" aligned addr=%lx\n", addr);
+		/* Ignore hint if it's too large or overlaps a VMA */
+		if (addr > mm->task_size - len ||
+		    !slice_area_is_free(mm, addr, len))
+			addr = 0;
 	}
 
-	/* First makeup a "good" mask of slices that have the right size
+	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
 	good_mask = slice_mask_for_size(mm, psize);
 	slice_print_mask(" good_mask", good_mask);
 
-	/* First check hint if it's valid or if we have MAP_FIXED */
-	if ((addr != 0 || fixed) && (mm->task_size - len) >= addr) {
+	/*
+	 * Here "good" means slices that are already the right page size,
+	 * "compat" means slices that have a compatible page size (i.e.
+	 * 4k in a 64k pagesize kernel), and "free" means slices without
+	 * any VMAs.
+	 *
+	 * If MAP_FIXED:
+	 *	check if fits in good | compat => OK
+	 *	check if fits in good | compat | free => convert free
+	 *	else bad
+	 * If have hint:
+	 *	check if hint fits in good => OK
+	 *	check if hint fits in good | free => convert free
+	 * Otherwise:
+	 *	search in good, found => OK
+	 *	search in good | free, found => convert free
+	 *	search in good | compat | free, found => convert free.
+	 */
 
-		/* Don't bother with hint if it overlaps a VMA */
-		if (!fixed && !slice_area_is_free(mm, addr, len))
-			goto search;
+#ifdef CONFIG_PPC_64K_PAGES
+	/* If we support combo pages, we can allow 64k pages in 4k slices */
+	if (psize == MMU_PAGE_64K) {
+		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		if (fixed)
+			or_mask(good_mask, compat_mask);
+	}
+#endif
 
+	/* First check hint if it's valid or if we have MAP_FIXED */
+	if (addr != 0 || fixed) {
 		/* Build a mask for the requested range */
 		mask = slice_range_to_mask(addr, len);
 		slice_print_mask(" mask", mask);
@@ -442,54 +467,63 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
-
-		/* We don't fit in the good mask, check what other slices are
-		 * empty and thus can be converted
+	} else {
+		/* Now let's see if we can find something in the existing
+		 * slices for that size
 		 */
-		potential_mask = slice_mask_for_free(mm);
-		potential_mask.low_slices |= good_mask.low_slices;
-		potential_mask.high_slices |= good_mask.high_slices;
-		pmask_set = 1;
-		slice_print_mask(" potential", potential_mask);
-		if (slice_check_fit(mask, potential_mask)) {
-			slice_dbg(" fits potential !\n");
-			goto convert;
+		newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
+		if (newaddr != -ENOMEM) {
+			/* Found within the good mask, we don't have to setup,
+			 * we thus return directly
+			 */
+			slice_dbg(" found area at 0x%lx\n", newaddr);
+			return newaddr;
 		}
 	}
 
-	/* If we have MAP_FIXED and failed the above step, then error out */
+	/* We don't fit in the good mask, check what other slices are
+	 * empty and thus can be converted
+	 */
+	potential_mask = slice_mask_for_free(mm);
+	or_mask(potential_mask, good_mask);
+	slice_print_mask(" potential", potential_mask);
+
+	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+		slice_dbg(" fits potential !\n");
+		goto convert;
+	}
+
+	/* If we have MAP_FIXED and failed the above steps, then error out */
 	if (fixed)
 		return -EBUSY;
 
- search:
 	slice_dbg(" search...\n");
 
-	/* Now let's see if we can find something in the existing slices
-	 * for that size
+	/* If we had a hint that didn't work out, see if we can fit
+	 * anywhere in the good area.
 	 */
-	addr = slice_find_area(mm, len, good_mask, psize, topdown, use_cache);
-	if (addr != -ENOMEM) {
-		/* Found within the good mask, we don't have to setup,
-		 * we thus return directly
-		 */
-		slice_dbg(" found area at 0x%lx\n", addr);
-		return addr;
-	}
-
-	/* Won't fit, check what can be converted */
-	if (!pmask_set) {
-		potential_mask = slice_mask_for_free(mm);
-		potential_mask.low_slices |= good_mask.low_slices;
-		potential_mask.high_slices |= good_mask.high_slices;
-		pmask_set = 1;
-		slice_print_mask(" potential", potential_mask);
+	if (addr) {
+		addr = slice_find_area(mm, len, good_mask, psize, topdown);
+		if (addr != -ENOMEM) {
+			slice_dbg(" found area at 0x%lx\n", addr);
+			return addr;
+		}
 	}
 
 	/* Now let's see if we can find something in the existing slices
-	 * for that size
+	 * for that size plus free slices
 	 */
-	addr = slice_find_area(mm, len, potential_mask, psize, topdown,
-			       use_cache);
+	addr = slice_find_area(mm, len, potential_mask, psize, topdown);
+
+#ifdef CONFIG_PPC_64K_PAGES
+	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
+		/* retry the search with 4k-page slices included */
+		or_mask(potential_mask, compat_mask);
+		addr = slice_find_area(mm, len, potential_mask, psize,
+				       topdown);
+	}
+#endif
+
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
@@ -498,7 +532,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	slice_print_mask(" mask", mask);
 
  convert:
-	slice_convert(mm, mask, psize);
+	andnot_mask(mask, good_mask);
+	andnot_mask(mask, compat_mask);
+	if (mask.low_slices || mask.high_slices) {
+		slice_convert(mm, mask, psize);
+		if (psize > MMU_PAGE_BASE)
+			on_each_cpu(slice_flush_segments, mm, 1);
+	}
 	return addr;
 
 }
@@ -511,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp,
 				     unsigned long flags)
 {
 	return slice_get_unmapped_area(addr, len, flags,
-				       current->mm->context.user_psize,
-				       0, 1);
+				       current->mm->context.user_psize, 0);
 }
 
 unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -522,24 +561,24 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
 					     const unsigned long flags)
 {
 	return slice_get_unmapped_area(addr0, len, flags,
-				       current->mm->context.user_psize,
-				       1, 1);
+				       current->mm->context.user_psize, 1);
 }
 
 unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
 {
-	u64 psizes;
-	int index;
+	unsigned char *hpsizes;
+	int index, mask_index;
 
 	if (addr < SLICE_LOW_TOP) {
-		psizes = mm->context.low_slices_psize;
+		u64 lpsizes;
+		lpsizes = mm->context.low_slices_psize;
 		index = GET_LOW_SLICE_INDEX(addr);
-	} else {
-		psizes = mm->context.high_slices_psize;
-		index = GET_HIGH_SLICE_INDEX(addr);
+		return (lpsizes >> (index * 4)) & 0xf;
 	}
-
-	return (psizes >> (index * 4)) & 0xf;
+	hpsizes = mm->context.high_slices_psize;
+	index = GET_HIGH_SLICE_INDEX(addr);
+	mask_index = index & 0x1;
+	return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf;
 }
 EXPORT_SYMBOL_GPL(get_slice_psize);
 
@@ -559,7 +598,9 @@ EXPORT_SYMBOL_GPL(get_slice_psize);
  */
 void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 {
-	unsigned long flags, lpsizes, hpsizes;
+	int index, mask_index;
+	unsigned char *hpsizes;
+	unsigned long flags, lpsizes;
 	unsigned int old_psize;
 	int i;
 
@@ -580,15 +621,21 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 		if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
 			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
 				(((unsigned long)psize) << (i * 4));
+	/* Assign the value back */
+	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		if (((hpsizes >> (i * 4)) & 0xf) == old_psize)
-			hpsizes = (hpsizes & ~(0xful << (i * 4))) |
-				(((unsigned long)psize) << (i * 4));
+	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+		mask_index = i & 0x1;
+		index = i >> 1;
+		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
+			hpsizes[index] = (hpsizes[index] &
+					  ~(0xf << (mask_index * 4))) |
+				(((unsigned long)psize) << (mask_index * 4));
+	}
+
+
 
-	mm->context.low_slices_psize = lpsizes;
-	mm->context.high_slices_psize = hpsizes;
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
 		  mm->context.low_slices_psize,
@@ -598,8 +645,47 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 }
 
+void slice_set_psize(struct mm_struct *mm, unsigned long address,
+		     unsigned int psize)
+{
+	unsigned char *hpsizes;
+	unsigned long i, flags;
+	u64 *lpsizes;
+
+	spin_lock_irqsave(&slice_convert_lock, flags);
+	if (address < SLICE_LOW_TOP) {
+		i = GET_LOW_SLICE_INDEX(address);
+		lpsizes = &mm->context.low_slices_psize;
+		*lpsizes = (*lpsizes & ~(0xful << (i * 4))) |
+			((unsigned long) psize << (i * 4));
+	} else {
+		int index, mask_index;
+		i = GET_HIGH_SLICE_INDEX(address);
+		hpsizes = mm->context.high_slices_psize;
+		mask_index = i & 0x1;
+		index = i >> 1;
+		hpsizes[index] = (hpsizes[index] &
+				  ~(0xf << (mask_index * 4))) |
+			(((unsigned long)psize) << (mask_index * 4));
+	}
+
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+#ifdef CONFIG_SPU_BASE
+	spu_flush_all_slbs(mm);
+#endif
+}
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+			   unsigned long len, unsigned int psize)
+{
+	struct slice_mask mask = slice_range_to_mask(start, len);
+
+	slice_convert(mm, mask, psize);
+}
+
 /*
- * is_hugepage_only_range() is used by generic code to verify wether
+ * is_hugepage_only_range() is used by generic code to verify whether
  * a normal mmap mapping (non hugetlbfs) is valid on a given area.
  *
  * until the generic code provides a more generic hook and/or starts
@@ -621,9 +707,18 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len)
 {
 	struct slice_mask mask, available;
+	unsigned int psize = mm->context.user_psize;
 
 	mask = slice_range_to_mask(addr, len);
-	available = slice_mask_for_size(mm, mm->context.user_psize);
+	available = slice_mask_for_size(mm, psize);
+#ifdef CONFIG_PPC_64K_PAGES
+	/* We need to account for 4k slices too */
+	if (psize == MMU_PAGE_64K) {
+		struct slice_mask compat_mask;
+		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		or_mask(available, compat_mask);
+	}
+#endif
 
 #if 0 /* too verbose */
 	slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 50448d5de9d..3f8efa6f299 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -12,15 +12,14 @@
  *      2 of the License, or (at your option) any later version.
  */
 
+#include <linux/memblock.h>
+
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/paca.h>
 #include <asm/cputable.h>
-#include <asm/lmb.h>
-#include <asm/abs_addr.h>
-#include <asm/firmware.h>
-#include <asm/iseries/hv_call.h>
+#include <asm/prom.h>
 
 struct stab_entry {
 	unsigned long esid_data;
@@ -28,8 +27,8 @@ struct stab_entry {
 };
 
 #define NR_STAB_CACHE_ENTRIES 8
-DEFINE_PER_CPU(long, stab_cache_ptr);
-DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+static DEFINE_PER_CPU(long, stab_cache_ptr);
+static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
 
 /*
  * Create a segment table entry for the given esid/vsid pair.
@@ -162,7 +161,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
 	struct stab_entry *ste;
-	unsigned long offset = __get_cpu_var(stab_cache_ptr);
+	unsigned long offset;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
 	unsigned long unmapped_base;
@@ -170,6 +169,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 	/* Force previous translations to complete. DRENG */
 	asm volatile("isync" : : : "memory");
 
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possible cause an STAB miss
+	 * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+	 */
+	hard_irq_disable();
+
+	offset = __get_cpu_var(stab_cache_ptr);
 	if (offset <= NR_STAB_CACHE_ENTRIES) {
 		int i;
 
@@ -232,7 +240,7 @@ void __init stabs_alloc(void)
 {
 	int cpu;
 
-	if (cpu_has_feature(CPU_FTR_SLB))
+	if (mmu_has_feature(MMU_FTR_SLB))
 		return;
 
 	for_each_possible_cpu(cpu) {
@@ -241,16 +249,16 @@ void __init stabs_alloc(void)
 		if (cpu == 0)
 			continue; /* stab for CPU 0 is statically allocated */
 
-		newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
+		newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
 					 1<<SID_SHIFT);
 		newstab = (unsigned long)__va(newstab);
 
 		memset((void *)newstab, 0, HW_PAGE_SIZE);
 
 		paca[cpu].stab_addr = newstab;
-		paca[cpu].stab_real = virt_to_abs(newstab);
-		printk(KERN_INFO "Segment table for CPU %d at 0x%lx "
-		       "virtual, 0x%lx absolute\n",
+		paca[cpu].stab_real = __pa(newstab);
+		printk(KERN_INFO "Segment table for CPU %d at 0x%llx "
+		       "virtual, 0x%llx absolute\n",
 		       cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
 	}
 }
@@ -274,12 +282,5 @@ void stab_initialize(unsigned long stab)
 	/* Set ASR */
 	stabreal = get_paca()->stab_real | 0x1ul;
 
-#ifdef CONFIG_PPC_ISERIES
-	if (firmware_has_feature(FW_FEATURE_ISERIES)) {
-		HvCall1(HvCallBaseSetASR, stabreal);
-		return;
-	}
-#endif /* CONFIG_PPC_ISERIES */
-
 	mtspr(SPRN_ASR, stabreal);
 }
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 4cafc0c33d0..6c0b1f5f8d2 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -10,7 +10,6 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/gfp.h>
-#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
@@ -24,9 +23,9 @@
  * Also makes sure that the subpage_prot_table structure is
  * reinitialized for the next user.
  */
-void subpage_prot_free(pgd_t *pgd)
+void subpage_prot_free(struct mm_struct *mm)
 {
-	struct subpage_prot_table *spt = pgd_subpage_prot(pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	unsigned long i, j, addr;
 	u32 **p;
 
@@ -51,6 +50,13 @@ void subpage_prot_free(pgd_t *pgd)
 	spt->maxaddr = 0;
 }
 
+void subpage_prot_init_new_context(struct mm_struct *mm)
+{
+	struct subpage_prot_table *spt = &mm->context.spt;
+
+	memset(spt, 0, sizeof(*spt));
+}
+
 static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 			     int npages)
 {
@@ -72,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
 	for (; npages > 0; --npages) {
-		pte_update(mm, addr, pte, 0, 0);
+		pte_update(mm, addr, pte, 0, 0, 0);
 		addr += PAGE_SIZE;
 		++pte;
 	}
@@ -87,9 +93,10 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 static void subpage_prot_clear(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 **spm, *spp;
-	int i, nw;
+	unsigned long i;
+	size_t nw;
 	unsigned long next, limit;
 
 	down_write(&mm->mmap_sem);
@@ -98,7 +105,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 		limit = spt->maxaddr;
 	for (; addr < limit; addr = next) {
 		next = pmd_addr_end(addr, limit);
-		if (addr < 0x100000000) {
+		if (addr < 0x100000000UL) {
 			spm = spt->low_prot;
 		} else {
 			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
@@ -123,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 	up_write(&mm->mmap_sem);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->private;
+	split_huge_page_pmd(vma, addr, pmd);
+	return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+				    unsigned long len)
+{
+	struct vm_area_struct *vma;
+	struct mm_walk subpage_proto_walk = {
+		.mm = mm,
+		.pmd_entry = subpage_walk_pmd_entry,
+	};
+
+	/*
+	 * We don't try too hard, we just mark all the vma in that range
+	 * VM_NOHUGEPAGE and split them.
+	 */
+	vma = find_vma(mm, addr);
+	/*
+	 * If the range is in unmapped range, just return
+	 */
+	if (vma && ((addr + len) <= vma->vm_start))
+		return;
+
+	while (vma) {
+		if (vma->vm_start >= (addr + len))
+			break;
+		vma->vm_flags |= VM_NOHUGEPAGE;
+		subpage_proto_walk.private = vma;
+		walk_page_range(vma->vm_start, vma->vm_end,
+				&subpage_proto_walk);
+		vma = vma->vm_next;
+	}
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+				    unsigned long len)
+{
+	return;
+}
+#endif
+
 /*
  * Copy in a subpage protection map for an address range.
  * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -136,9 +190,10 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	struct subpage_prot_table *spt = &mm->context.spt;
 	u32 **spm, *spp;
-	int i, nw;
+	unsigned long i;
+	size_t nw;
 	unsigned long next, limit;
 	int err;
 
@@ -160,10 +215,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 		return -EFAULT;
 
 	down_write(&mm->mmap_sem);
+	subpage_mark_vma_nohuge(mm, addr, len);
 	for (limit = addr + len; addr < limit; addr = next) {
 		next = pmd_addr_end(addr, limit);
 		err = -ENOMEM;
-		if (addr < 0x100000000) {
+		if (addr < 0x100000000UL) {
 			spm = spt->low_prot;
 		} else {
 			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_hash32.c
index eb4b512d65f..558e30cce33 100644
--- a/arch/powerpc/mm/tlb_32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/export.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -45,6 +46,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
 		flush_hash_pages(mm->context.id, addr, ptephys, 1);
 	}
 }
+EXPORT_SYMBOL(flush_hash_entry);
 
 /*
  * Called by ptep_set_access_flags, must flush on CPUs for which the
@@ -85,17 +87,6 @@ void tlb_flush(struct mmu_gather *tlb)
  *    -- Cort
  */
 
-/*
- * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
- * the cache operations on the bus.  Hence we need to use an IPI
- * to get the other CPU(s) to invalidate their TLBs.
- */
-#ifdef CONFIG_SMP_750
-#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
-#else
-#define FINISH_FLUSH	do { } while (0)
-#endif
-
 static void flush_range(struct mm_struct *mm, unsigned long start,
 			unsigned long end)
 {
@@ -134,8 +125,8 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
 	flush_range(&init_mm, start, end);
-	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_kernel_range);
 
 /*
  * Flush all the (user) entries for the address space described by mm.
@@ -157,8 +148,8 @@ void flush_tlb_mm(struct mm_struct *mm)
 	 */
 	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
-	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_mm);
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
@@ -173,8 +164,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
 	if (!pmd_none(*pmd))
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
-	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_page);
 
 /*
  * For each address in the range, find the pte for the address
@@ -185,5 +176,9 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
 {
 	flush_range(vma->vm_mm, start, end);
-	FINISH_FLUSH;
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void __init early_init_mmu(void)
+{
 }
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_hash64.c
index e2d867ce1c7..c99f6510a0b 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -23,7 +23,6 @@
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <asm/pgalloc.h>
@@ -33,102 +32,18 @@
 
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
-/* This is declared as we are using the more or less generic
- * include/asm-powerpc/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-unsigned long pte_freelist_forced_free;
-
-struct pte_freelist_batch
-{
-	struct rcu_head	rcu;
-	unsigned int	index;
-	pgtable_free_t	tables[0];
-};
-
-DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-unsigned long pte_freelist_forced_free;
-
-#define PTE_FREELIST_SIZE \
-	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-	  / sizeof(pgtable_free_t))
-
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-static void pgtable_free_now(pgtable_free_t pgf)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-	pgtable_free(pgf);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pgtable_free(batch->tables[i]);
-
-	free_page((unsigned long)batch);
-}
-
-static void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
-{
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
-	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-	if (atomic_read(&tlb->mm->mm_users) < 2 ||
-	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pgtable_free(pgf);
-		return;
-	}
-
-	if (*batchp == NULL) {
-		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
-		if (*batchp == NULL) {
-			pgtable_free_now(pgf);
-			return;
-		}
-		(*batchp)->index = 0;
-	}
-	(*batchp)->tables[(*batchp)->index++] = pgf;
-	if ((*batchp)->index == PTE_FREELIST_SIZE) {
-		pte_free_submit(*batchp);
-		*batchp = NULL;
-	}
-}
-
 /*
  * A linux PTE was changed and the corresponding hash table entry
  * neesd to be flushed. This function will either perform the flush
  * immediately or will batch it up if the current CPU has an active
  * batch on it.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
  */
 void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, unsigned long pte, int huge)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid, vaddr;
+	unsigned long vpn;
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+	unsigned long vsid;
 	unsigned int psize;
 	int ssize;
 	real_pte_t rpte;
@@ -136,11 +51,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 
 	i = batch->index;
 
-	/* We mask the address for the base page size. Huge pages will
-	 * have applied their own masking already
-	 */
-	addr &= PAGE_MASK;
-
 	/* Get page size (maybe move back to caller).
 	 *
 	 * NOTE: when using special 64K mappings in 4K environment like
@@ -150,24 +60,33 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	 */
 	if (huge) {
 #ifdef CONFIG_HUGETLB_PAGE
-		psize = mmu_huge_psize;
+		psize = get_slice_psize(mm, addr);
+		/* Mask the address for the correct page size */
+		addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
 #else
 		BUG();
 		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
 #endif
-	} else
+	} else {
 		psize = pte_pagesize_index(mm, addr, pte);
+		/* Mask the address for the standard page size.  If we
+		 * have a 64k page kernel, but the hardware does not
+		 * support 64k pages, this might be different from the
+		 * hardware page size encoded in the slice table. */
+		addr &= PAGE_MASK;
+	}
+
 
 	/* Build full vaddr */
 	if (!is_kernel_addr(addr)) {
 		ssize = user_segment_size(addr);
 		vsid = get_vsid(mm->context.id, addr, ssize);
-		WARN_ON(vsid == 0);
 	} else {
 		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
 		ssize = mmu_kernel_ssize;
 	}
-	vaddr = hpt_va(addr, vsid, ssize);
+	WARN_ON(vsid == 0);
+	vpn = hpt_vpn(addr, vsid, ssize);
 	rpte = __real_pte(__pte(pte), ptep);
 
 	/*
@@ -177,7 +96,8 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	 * and decide to use local invalidates instead...
 	 */
 	if (!batch->active) {
-		flush_hash_page(vaddr, rpte, psize, ssize, 0);
+		flush_hash_page(vpn, rpte, psize, ssize, 0);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 
@@ -202,10 +122,11 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		batch->ssize = ssize;
 	}
 	batch->pte[i] = rpte;
-	batch->vaddr[i] = vaddr;
+	batch->vpn[i] = vpn;
 	batch->index = ++i;
 	if (i >= PPC64_TLB_BATCH_NR)
 		__flush_tlb_pending(batch);
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 /*
@@ -217,30 +138,33 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
  */
 void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 {
-	cpumask_t tmp;
+	const struct cpumask *tmp;
 	int i, local = 0;
 
 	i = batch->index;
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+	tmp = cpumask_of(smp_processor_id());
+	if (cpumask_equal(mm_cpumask(batch->mm), tmp))
 		local = 1;
 	if (i == 1)
-		flush_hash_page(batch->vaddr[0], batch->pte[0],
+		flush_hash_page(batch->vpn[0], batch->pte[0],
 				batch->psize, batch->ssize, local);
 	else
 		flush_hash_range(i, local);
 	batch->index = 0;
 }
 
-void pte_free_finish(void)
+void tlb_flush(struct mmu_gather *tlb)
 {
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+	struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
 
-	if (*batchp == NULL)
-		return;
-	pte_free_submit(*batchp);
-	*batchp = NULL;
+	/* If there's a TLB batch pending, then we must flush it because the
+	 * pages are going to be freed and we really don't want to have a CPU
+	 * access a freed page because it has a stale TLB
+	 */
+	if (tlbbatch->index)
+		__flush_tlb_pending(tlbbatch);
+
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 /**
@@ -258,14 +182,13 @@ void pte_free_finish(void)
  * since 64K pages may overlap with other bridges when using 64K pages
  * with 4K HW pages on IO space.
  *
- * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
- * and is implemented for small size rather than speed.
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
  */
-#ifdef CONFIG_HOTPLUG
-
 void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 			      unsigned long end)
 {
+	int hugepage_shift;
 	unsigned long flags;
 
 	start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -283,7 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte(mm->pgd, start);
+		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+							&hugepage_shift);
 		unsigned long pte;
 
 		if (ptep == NULL)
@@ -291,10 +215,38 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 		pte = pte_val(*ptep);
 		if (!(pte & _PAGE_HASHPTE))
 			continue;
-		hpte_need_flush(mm, start, ptep, pte, 0);
+		if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+			hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+		else
+			hpte_need_flush(mm, start, ptep, pte, 0);
 	}
 	arch_leave_lazy_mmu_mode();
 	local_irq_restore(flags);
 }
 
-#endif /* CONFIG_HOTPLUG */
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	pte_t *pte;
+	pte_t *start_pte;
+	unsigned long flags;
+
+	addr = _ALIGN_DOWN(addr, PMD_SIZE);
+	/* Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+	 */
+	local_irq_save(flags);
+	arch_enter_lazy_mmu_mode();
+	start_pte = pte_offset_map(pmd, addr);
+	for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+		unsigned long pteval = pte_val(*pte);
+		if (pteval & _PAGE_HASHPTE)
+			hpte_need_flush(mm, addr, pte, pteval, 0);
+		addr += PAGE_SIZE;
+	}
+	arch_leave_lazy_mmu_mode();
+	local_irq_restore(flags);
+}
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
new file mode 100644
index 00000000000..356e8b41fb0
--- /dev/null
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -0,0 +1,1171 @@
+/*
+ *  Low level TLB miss handlers for Book3E
+ *
+ *  Copyright (C) 2008-2009
+ *      Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/pgtable.h>
+#include <asm/exception-64e.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE+1)
+#else
+#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE)
+#endif
+#define VPTE_PUD_SHIFT	(VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
+#define VPTE_PGD_SHIFT	(VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
+#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
+
+/**********************************************************************
+ *                                                                    *
+ * TLB miss handling for Book3E with a bolted linear mapping          *
+ * No virtual page table, no nested TLB misses                        *
+ *                                                                    *
+ **********************************************************************/
+
+/*
+ * Note that, unlike non-bolted handlers, TLB_EXFRAME is not
+ * modified by the TLB miss handlers themselves, since the TLB miss
+ * handler code will not itself cause a recursive TLB miss.
+ *
+ * TLB_EXFRAME will be modified when crit/mc/debug exceptions are
+ * entered/exited.
+ */
+.macro tlb_prolog_bolted intnum addr
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r12
+	mfspr	r12,SPRN_SPRG_TLB_EXFRAME
+	std	r13,EX_TLB_R13(r12)
+	std	r10,EX_TLB_R10(r12)
+	mfspr	r13,SPRN_SPRG_PACA
+
+	mfcr	r10
+	std	r11,EX_TLB_R11(r12)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+	DO_KVM	\intnum, SPRN_SRR1
+	std	r16,EX_TLB_R16(r12)
+	mfspr	r16,\addr		/* get faulting address */
+	std	r14,EX_TLB_R14(r12)
+	ld	r14,PACAPGD(r13)
+	std	r15,EX_TLB_R15(r12)
+	std	r10,EX_TLB_CR(r12)
+	TLB_MISS_PROLOG_STATS
+.endm
+
+.macro tlb_epilog_bolted
+	ld	r14,EX_TLB_CR(r12)
+	ld	r10,EX_TLB_R10(r12)
+	ld	r11,EX_TLB_R11(r12)
+	ld	r13,EX_TLB_R13(r12)
+	mtcr	r14
+	ld	r14,EX_TLB_R14(r12)
+	ld	r15,EX_TLB_R15(r12)
+	TLB_MISS_RESTORE_STATS
+	ld	r16,EX_TLB_R16(r12)
+	mfspr	r12,SPRN_SPRG_GEN_SCRATCH
+.endm
+
+/* Data TLB miss */
+	START_EXCEPTION(data_tlb_miss_bolted)
+	tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+	/* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	/* We pre-test some combination of permissions to avoid double
+	 * faults:
+	 *
+	 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+	 * ESR_ST   is 0x00800000
+	 * _PAGE_BAP_SW is 0x00000010
+	 * So the shift is >> 19. This tests for supervisor writeability.
+	 * If the page happens to be supervisor writeable and not user
+	 * writeable, we will take a new fault later, but that should be
+	 * a rare enough case.
+	 *
+	 * We also move ESR_ST in _PAGE_DIRTY position
+	 * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
+	 *
+	 * MAS1 is preset for all we need except for TID that needs to
+	 * be cleared for kernel translations
+	 */
+
+	mfspr	r11,SPRN_ESR
+
+	srdi	r15,r16,60		/* get region */
+	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+	bne-	dtlb_miss_fault_bolted	/* Bail if fault addr is invalid */
+
+	rlwinm	r10,r11,32-19,27,27
+	rlwimi	r10,r11,32-16,19,19
+	cmpwi	r15,0			/* user vs kernel check */
+	ori	r10,r10,_PAGE_PRESENT
+	oris	r11,r10,_PAGE_ACCESSED@h
+
+	TLB_MISS_STATS_SAVE_INFO_BOLTED
+	bne	tlb_miss_kernel_bolted
+
+tlb_miss_common_bolted:
+/*
+ * This is the guts of the TLB miss handler for bolted-linear.
+ * We are entered with:
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = PTE permission mask
+ * r10 = crap (free to use)
+ */
+	rldicl	r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+	cmpldi	cr0,r14,0
+	clrrdi	r15,r15,3
+	beq	tlb_miss_fault_bolted	/* No PGDIR, bail */
+
+BEGIN_MMU_FTR_SECTION
+	/* Set the TLB reservation and search for existing entry. Then load
+	 * the entry.
+	 */
+	PPC_TLBSRX_DOT(0,R16)
+	ldx	r14,r14,r15		/* grab pgd entry */
+	beq	tlb_miss_done_bolted	/* tlb exists already, bail */
+MMU_FTR_SECTION_ELSE
+	ldx	r14,r14,r15		/* grab pgd entry */
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
+
+#ifndef CONFIG_PPC_64K_PAGES
+	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_bolted	/* Bad pgd entry or hugepage; bail */
+	ldx	r14,r14,r15		/* grab pud entry */
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_bolted
+	ldx	r14,r14,r15		/* Grab pmd entry */
+
+	rldicl	r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_bolted
+	ldx	r14,r14,r15		/* Grab PTE, normal (!huge) page */
+
+	/* Check if required permissions are met */
+	andc.	r15,r11,r14
+	rldicr	r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+	bne-	tlb_miss_fault_bolted
+
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE need change if !base page size, not
+	 *                 yet implemented for now
+	 * MAS 2   :	Defaults not useful, need to be redone
+	 * MAS 3+7 :	Needs to be done
+	 */
+	clrrdi	r11,r16,12		/* Clear low crap in EA */
+	clrldi	r15,r15,12		/* Clear crap at the top */
+	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
+	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	mtspr	SPRN_MAS2,r11
+	andi.	r11,r14,_PAGE_DIRTY
+	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
+
+	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+	bne	1f
+	li	r11,MAS3_SW|MAS3_UW
+	andc	r15,r15,r11
+1:
+	mtspr	SPRN_MAS7_MAS3,r15
+	tlbwe
+
+tlb_miss_done_bolted:
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+	tlb_epilog_bolted
+	rfi
+
+itlb_miss_kernel_bolted:
+	li	r11,_PAGE_PRESENT|_PAGE_BAP_SX	/* Base perm */
+	oris	r11,r11,_PAGE_ACCESSED@h
+tlb_miss_kernel_bolted:
+	mfspr	r10,SPRN_MAS1
+	ld	r14,PACA_KERNELPGD(r13)
+	cmpldi	cr0,r15,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	tlb_miss_common_bolted
+
+tlb_miss_fault_bolted:
+	/* We need to check if it was an instruction miss */
+	andi.	r10,r11,_PAGE_EXEC|_PAGE_BAP_SX
+	bne	itlb_miss_fault_bolted
+dtlb_miss_fault_bolted:
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	tlb_epilog_bolted
+	b	exc_data_storage_book3e
+itlb_miss_fault_bolted:
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	tlb_epilog_bolted
+	b	exc_instruction_storage_book3e
+
+/* Instruction TLB miss */
+	START_EXCEPTION(instruction_tlb_miss_bolted)
+	tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+	srdi	r15,r16,60		/* get region */
+	TLB_MISS_STATS_SAVE_INFO_BOLTED
+	bne-	itlb_miss_fault_bolted
+
+	li	r11,_PAGE_PRESENT|_PAGE_EXEC	/* Base perm */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+
+	cmpldi	cr0,r15,0			/* Check for user region */
+	oris	r11,r11,_PAGE_ACCESSED@h
+	beq	tlb_miss_common_bolted
+	b	itlb_miss_kernel_bolted
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/*
+ * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
+ *
+ * Linear mapping is bolted: no virtual page table or nested TLB misses
+ * Indirect entries in TLB1, hardware loads resulting direct entries
+ *    into TLB0
+ * No HES or NV hint on TLB1, so we need to do software round-robin
+ * No tlbsrx. so we need a spinlock, and we have to deal
+ *    with MAS-damage caused by tlbsx
+ * 4K pages only
+ */
+
+	START_EXCEPTION(instruction_tlb_miss_e6500)
+	tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+	ld	r11,PACA_TCD_PTR(r13)
+	srdi.	r15,r16,60		/* get region */
+	ori	r16,r16,1
+
+	TLB_MISS_STATS_SAVE_INFO_BOLTED
+	bne	tlb_miss_kernel_e6500	/* user/kernel test */
+
+	b	tlb_miss_common_e6500
+
+	START_EXCEPTION(data_tlb_miss_e6500)
+	tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+	ld	r11,PACA_TCD_PTR(r13)
+	srdi.	r15,r16,60		/* get region */
+	rldicr	r16,r16,0,62
+
+	TLB_MISS_STATS_SAVE_INFO_BOLTED
+	bne	tlb_miss_kernel_e6500	/* user vs kernel check */
+
+/*
+ * This is the guts of the TLB miss handler for e6500 and derivatives.
+ * We are entered with:
+ *
+ * r16 = page of faulting address (low bit 0 if data, 1 if instruction)
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = tlb_per_core ptr
+ * r10 = cpu number
+ */
+tlb_miss_common_e6500:
+	/*
+	 * Search if we already have an indirect entry for that virtual
+	 * address, and if we do, bail out.
+	 *
+	 * MAS6:IND should be already set based on MAS4
+	 */
+1:	lbarx	r15,0,r11
+	lhz	r10,PACAPACAINDEX(r13)
+	cmpdi	r15,0
+	cmpdi	cr1,r15,1	/* set cr1.eq = 0 for non-recursive */
+	bne	2f
+	stbcx.	r10,0,r11
+	bne	1b
+3:
+	.subsection 1
+2:	cmpd	cr1,r15,r10	/* recursive lock due to mcheck/crit/etc? */
+	beq	cr1,3b		/* unlock will happen if cr1.eq = 0 */
+	lbz	r15,0(r11)
+	cmpdi	r15,0
+	bne	2b
+	b	1b
+	.previous
+
+	mfspr	r15,SPRN_MAS2
+
+	tlbsx	0,r16
+	mfspr	r10,SPRN_MAS1
+	andis.	r10,r10,MAS1_VALID@h
+	bne	tlb_miss_done_e6500
+
+	/* Undo MAS-damage from the tlbsx */
+	mfspr	r10,SPRN_MAS1
+	oris	r10,r10,MAS1_VALID@h
+	mtspr	SPRN_MAS1,r10
+	mtspr	SPRN_MAS2,r15
+
+	/* Now, we need to walk the page tables. First check if we are in
+	 * range.
+	 */
+	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+	bne-	tlb_miss_fault_e6500
+
+	rldicl	r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+	cmpldi	cr0,r14,0
+	clrrdi	r15,r15,3
+	beq-	tlb_miss_fault_e6500 /* No PGDIR, bail */
+	ldx	r14,r14,r15		/* grab pgd entry */
+
+	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_e6500	/* Bad pgd entry or hugepage; bail */
+	ldx	r14,r14,r15		/* grab pud entry */
+
+	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_e6500
+	ldx	r14,r14,r15		/* Grab pmd entry */
+
+	mfspr	r10,SPRN_MAS0
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_e6500
+
+	/* Now we build the MAS for a 2M indirect page:
+	 *
+	 * MAS 0   :	ESEL needs to be filled by software round-robin
+	 * MAS 1   :	Fully set up
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE for now is base ind page size always
+	 *               - TID already cleared if necessary
+	 * MAS 2   :	Default not 2M-aligned, need to be redone
+	 * MAS 3+7 :	Needs to be done
+	 */
+
+	ori	r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+	mtspr	SPRN_MAS7_MAS3,r14
+
+	clrrdi	r15,r16,21		/* make EA 2M-aligned */
+	mtspr	SPRN_MAS2,r15
+
+	lbz	r15,TCD_ESEL_NEXT(r11)
+	lbz	r16,TCD_ESEL_MAX(r11)
+	lbz	r14,TCD_ESEL_FIRST(r11)
+	rlwimi	r10,r15,16,0x00ff0000	/* insert esel_next into MAS0 */
+	addi	r15,r15,1		/* increment esel_next */
+	mtspr	SPRN_MAS0,r10
+	cmpw	r15,r16
+	iseleq	r15,r14,r15		/* if next == last use first */
+	stb	r15,TCD_ESEL_NEXT(r11)
+
+	tlbwe
+
+tlb_miss_done_e6500:
+	.macro	tlb_unlock_e6500
+	beq	cr1,1f		/* no unlock if lock was recursively grabbed */
+	li	r15,0
+	isync
+	stb	r15,0(r11)
+1:
+	.endm
+
+	tlb_unlock_e6500
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+	tlb_epilog_bolted
+	rfi
+
+tlb_miss_kernel_e6500:
+	mfspr	r10,SPRN_MAS1
+	ld	r14,PACA_KERNELPGD(r13)
+	cmpldi	cr0,r15,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	tlb_miss_common_e6500
+
+tlb_miss_fault_e6500:
+	tlb_unlock_e6500
+	/* We need to check if it was an instruction miss */
+	andi.	r16,r16,1
+	bne	itlb_miss_fault_e6500
+dtlb_miss_fault_e6500:
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	tlb_epilog_bolted
+	b	exc_data_storage_book3e
+itlb_miss_fault_e6500:
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	tlb_epilog_bolted
+	b	exc_instruction_storage_book3e
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
+/**********************************************************************
+ *                                                                    *
+ * TLB miss handling for Book3E with TLB reservation and HES support  *
+ *                                                                    *
+ **********************************************************************/
+
+
+/* Data TLB miss */
+	START_EXCEPTION(data_tlb_miss)
+	TLB_MISS_PROLOG
+
+	/* Now we handle the fault proper. We only save DEAR in normal
+	 * fault case since that's the only interesting values here.
+	 * We could probably also optimize by not saving SRR0/1 in the
+	 * linear mapping case but I'll leave that for later
+	 */
+	mfspr	r14,SPRN_ESR
+	mfspr	r16,SPRN_DEAR		/* get faulting address */
+	srdi	r15,r16,60		/* get region */
+	cmpldi	cr0,r15,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* The page tables are mapped virtually linear. At this point, though,
+	 * we don't know whether we are trying to fault in a first level
+	 * virtual address or a virtual page table address. We can get that
+	 * from bit 0x1 of the region ID which we have set for a page table
+	 */
+	andi.	r10,r15,0x1
+	bne-	virt_page_table_tlb_miss
+
+	std	r14,EX_TLB_ESR(r12);	/* save ESR */
+	std	r16,EX_TLB_DEAR(r12);	/* save DEAR */
+
+	 /* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
+	li	r11,_PAGE_PRESENT
+	oris	r11,r11,_PAGE_ACCESSED@h
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	cmpldi	cr0,r15,0		/* Check for user region */
+
+	/* We pre-test some combination of permissions to avoid double
+	 * faults:
+	 *
+	 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+	 * ESR_ST   is 0x00800000
+	 * _PAGE_BAP_SW is 0x00000010
+	 * So the shift is >> 19. This tests for supervisor writeability.
+	 * If the page happens to be supervisor writeable and not user
+	 * writeable, we will take a new fault later, but that should be
+	 * a rare enough case.
+	 *
+	 * We also move ESR_ST in _PAGE_DIRTY position
+	 * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
+	 *
+	 * MAS1 is preset for all we need except for TID that needs to
+	 * be cleared for kernel translations
+	 */
+	rlwimi	r11,r14,32-19,27,27
+	rlwimi	r11,r14,32-16,19,19
+	beq	normal_tlb_miss
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r15,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	normal_tlb_miss
+
+	/* We got a crappy address, just fault with whatever DEAR and ESR
+	 * are here
+	 */
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+
+/* Instruction TLB miss */
+	START_EXCEPTION(instruction_tlb_miss)
+	TLB_MISS_PROLOG
+
+	/* If we take a recursive fault, the second level handler may need
+	 * to know whether we are handling a data or instruction fault in
+	 * order to get to the right store fault handler. We provide that
+	 * info by writing a crazy value in ESR in our exception frame
+	 */
+	li	r14,-1	/* store to exception frame is done later */
+
+	/* Now we handle the fault proper. We only save DEAR in the non
+	 * linear mapping case since we know the linear mapping case will
+	 * not re-enter. We could indeed optimize and also not save SRR0/1
+	 * in the linear mapping case but I'll leave that for later
+	 *
+	 * Faulting address is SRR0 which is already in r16
+	 */
+	srdi	r15,r16,60		/* get region */
+	cmpldi	cr0,r15,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	li	r11,_PAGE_PRESENT|_PAGE_EXEC	/* Base perm */
+	oris	r11,r11,_PAGE_ACCESSED@h
+
+	cmpldi	cr0,r15,0			/* Check for user region */
+	std	r14,EX_TLB_ESR(r12)		/* write crazy -1 to frame */
+	beq	normal_tlb_miss
+
+	li	r11,_PAGE_PRESENT|_PAGE_BAP_SX	/* Base perm */
+	oris	r11,r11,_PAGE_ACCESSED@h
+	/* XXX replace the RMW cycles with immediate loads + writes */
+	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r15,8			/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1			/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	normal_tlb_miss
+
+	/* We got a crappy address, just fault */
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+/*
+ * This is the guts of the first-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = faulting address
+ * r15 = region ID
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = PTE permission mask
+ * r10 = crap (free to use)
+ */
+normal_tlb_miss:
+	/* So we first construct the page table address. We do that by
+	 * shifting the bottom of the address (not the region ID) by
+	 * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
+	 * or'ing the fourth high bit.
+	 *
+	 * NOTE: For 64K pages, we do things slightly differently in
+	 * order to handle the weird page table format used by linux
+	 */
+	ori	r10,r15,0x1
+#ifdef CONFIG_PPC_64K_PAGES
+	/* For the top bits, 16 bytes per PTE */
+	rldicl	r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
+	/* Now create the bottom bits as 0 in position 0x8000 and
+	 * the rest calculated for 8 bytes per PTE
+	 */
+	rldicl	r15,r16,64-(PAGE_SHIFT-3),64-15
+	/* Insert the bottom bits in */
+	rlwimi	r14,r15,0,16,31
+#else
+	rldicl	r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
+#endif
+	sldi	r15,r10,60
+	clrrdi	r14,r14,3
+	or	r10,r15,r14
+
+BEGIN_MMU_FTR_SECTION
+	/* Set the TLB reservation and search for existing entry. Then load
+	 * the entry.
+	 */
+	PPC_TLBSRX_DOT(0,R16)
+	ld	r14,0(r10)
+	beq	normal_tlb_miss_done
+MMU_FTR_SECTION_ELSE
+	ld	r14,0(r10)
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
+
+finish_normal_tlb_miss:
+	/* Check if required permissions are met */
+	andc.	r15,r11,r14
+	bne-	normal_tlb_miss_access_fault
+
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE need change if !base page size, not
+	 *                 yet implemented for now
+	 * MAS 2   :	Defaults not useful, need to be redone
+	 * MAS 3+7 :	Needs to be done
+	 *
+	 * TODO: mix up code below for better scheduling
+	 */
+	clrrdi	r11,r16,12		/* Clear low crap in EA */
+	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
+	mtspr	SPRN_MAS2,r11
+
+	/* Check page size, if not standard, update MAS1 */
+	rldicl	r11,r14,64-8,64-8
+#ifdef CONFIG_PPC_64K_PAGES
+	cmpldi	cr0,r11,BOOK3E_PAGESZ_64K
+#else
+	cmpldi	cr0,r11,BOOK3E_PAGESZ_4K
+#endif
+	beq-	1f
+	mfspr	r11,SPRN_MAS1
+	rlwimi	r11,r14,31,21,24
+	rlwinm	r11,r11,0,21,19
+	mtspr	SPRN_MAS1,r11
+1:
+	/* Move RPN in position */
+	rldicr	r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+	clrldi	r15,r11,12		/* Clear crap at the top */
+	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
+
+	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+	andi.	r11,r14,_PAGE_DIRTY
+	bne	1f
+	li	r11,MAS3_SW|MAS3_UW
+	andc	r15,r15,r11
+1:
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r15,32
+	mtspr	SPRN_MAS3,r15
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r15
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+normal_tlb_miss_done:
+	/* We don't bother with restoring DEAR or ESR since we know we are
+	 * level 0 and just going back to userland. They are only needed
+	 * if you are going to take an access fault
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+	TLB_MISS_EPILOG_SUCCESS
+	rfi
+
+normal_tlb_miss_access_fault:
+	/* We need to check if it was an instruction miss */
+	andi.	r10,r11,_PAGE_EXEC
+	bne	1f
+	ld	r14,EX_TLB_DEAR(r12)
+	ld	r15,EX_TLB_ESR(r12)
+	mtspr	SPRN_DEAR,r14
+	mtspr	SPRN_ESR,r15
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = region (top 4 bits of address)
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * Note that this should only ever be called as a second level handler
+ * with the current scheme when using SW load.
+ * That means we can always get the original fault DEAR at
+ * EX_TLB_DEAR-EX_TLB_SIZE(r12)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will restart the whole fault at level
+ * 0 so we don't care too much about clobbers
+ *
+ * XXX That code was written back when we couldn't clobber r14. We can now,
+ * so we could probably optimize things a bit
+ */
+virt_page_table_tlb_miss:
+	/* Are we hitting a kernel page table ? */
+	andi.	r10,r15,0x8
+
+	/* The cool thing now is that r10 contains 0 for user and 8 for kernel,
+	 * and we happen to have the swapper_pg_dir at offset 8 from the user
+	 * pgdir in the PACA :-).
+	 */
+	add	r11,r10,r13
+
+	/* If kernel, we need to clear MAS1 TID */
+	beq	1f
+	/* XXX replace the RMW cycles with immediate loads + writes */
+	mfspr	r10,SPRN_MAS1
+	rlwinm	r10,r10,0,16,1			/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+1:
+BEGIN_MMU_FTR_SECTION
+	/* Search if we already have a TLB entry for that virtual address, and
+	 * if we do, bail out.
+	 */
+	PPC_TLBSRX_DOT(0,R16)
+	beq	virt_page_table_tlb_miss_done
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+
+	/* Now, we need to walk the page tables. First check if we are in
+	 * range.
+	 */
+	rldicl.	r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
+	bne-	virt_page_table_tlb_miss_fault
+
+	/* Get the PGD pointer */
+	ld	r15,PACAPGD(r11)
+	cmpldi	cr0,r15,0
+	beq-	virt_page_table_tlb_miss_fault
+
+	/* Get to PGD entry */
+	rldicl	r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
+
+#ifndef CONFIG_PPC_64K_PAGES
+	/* Get to PUD entry */
+	rldicl	r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	/* Get to PMD entry */
+	rldicl	r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
+
+	/* Ok, we're all right, we can now create a kernel translation for
+	 * a 4K or 64K page from r16 -> r15.
+	 */
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE for now is base page size always
+	 * MAS 2   :	Use defaults
+	 * MAS 3+7 :	Needs to be done
+	 *
+	 * So we only do MAS 2 and 3 for now...
+	 */
+	clrldi	r11,r15,4		/* remove region ID from RPN */
+	ori	r10,r11,1		/* Or-in SR */
+
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r10,32
+	mtspr	SPRN_MAS3,r10
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+BEGIN_MMU_FTR_SECTION
+virt_page_table_tlb_miss_done:
+
+	/* We have overriden MAS2:EPN but currently our primary TLB miss
+	 * handler will always restore it so that should not be an issue,
+	 * if we ever optimize the primary handler to not write MAS2 on
+	 * some cases, we'll have to restore MAS2:EPN here based on the
+	 * original fault's DEAR. If we do that we have to modify the
+	 * ITLB miss handler to also store SRR0 in the exception frame
+	 * as DEAR.
+	 *
+	 * However, one nasty thing we did is we cleared the reservation
+	 * (well, potentially we did). We do a trick here thus if we
+	 * are not a level 0 exception (we interrupted the TLB miss) we
+	 * offset the return address by -4 in order to replay the tlbsrx
+	 * instruction there
+	 */
+	subf	r10,r13,r12
+	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+	bne-	1f
+	ld	r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+	addi	r10,r11,-4
+	std	r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+1:
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+	/* Return to caller, normal case */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
+	TLB_MISS_EPILOG_SUCCESS
+	rfi
+
+virt_page_table_tlb_miss_fault:
+	/* If we fault here, things are a little bit tricky. We need to call
+	 * either data or instruction store fault, and we need to retrieve
+	 * the original fault address and ESR (for data).
+	 *
+	 * The thing is, we know that in normal circumstances, this is
+	 * always called as a second level tlb miss for SW load or as a first
+	 * level TLB miss for HW load, so we should be able to peek at the
+	 * relevant information in the first exception frame in the PACA.
+	 *
+	 * However, we do need to double check that, because we may just hit
+	 * a stray kernel pointer or a userland attack trying to hit those
+	 * areas. If that is the case, we do a data fault. (We can't get here
+	 * from an instruction tlb miss anyway).
+	 *
+	 * Note also that when going to a fault, we must unwind the previous
+	 * level as well. Since we are doing that, we don't need to clear or
+	 * restore the TLB reservation neither.
+	 */
+	subf	r10,r13,r12
+	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+	bne-	virt_page_table_tlb_miss_whacko_fault
+
+	/* We dig the original DEAR and ESR from slot 0 */
+	ld	r15,EX_TLB_DEAR+PACA_EXTLB(r13)
+	ld	r16,EX_TLB_ESR+PACA_EXTLB(r13)
+
+	/* We check for the "special" ESR value for instruction faults */
+	cmpdi	cr0,r16,-1
+	beq	1f
+	mtspr	SPRN_DEAR,r15
+	mtspr	SPRN_ESR,r16
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+virt_page_table_tlb_miss_whacko_fault:
+	/* The linear fault will restart everything so ESR and DEAR will
+	 * not have been clobbered, let's just fault with what we have
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+
+
+/**************************************************************
+ *                                                            *
+ * TLB miss handling for Book3E with hw page table support    *
+ *                                                            *
+ **************************************************************/
+
+
+/* Data TLB miss */
+	START_EXCEPTION(data_tlb_miss_htw)
+	TLB_MISS_PROLOG
+
+	/* Now we handle the fault proper. We only save DEAR in normal
+	 * fault case since that's the only interesting values here.
+	 * We could probably also optimize by not saving SRR0/1 in the
+	 * linear mapping case but I'll leave that for later
+	 */
+	mfspr	r14,SPRN_ESR
+	mfspr	r16,SPRN_DEAR		/* get faulting address */
+	srdi	r11,r16,60		/* get region */
+	cmpldi	cr0,r11,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	cmpldi	cr0,r11,0		/* Check for user region */
+	ld	r15,PACAPGD(r13)	/* Load user pgdir */
+	beq	htw_tlb_miss
+
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r11,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	ld	r15,PACA_KERNELPGD(r13)	/* Load kernel pgdir */
+	beq+	htw_tlb_miss
+
+	/* We got a crappy address, just fault with whatever DEAR and ESR
+	 * are here
+	 */
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+
+/* Instruction TLB miss */
+	START_EXCEPTION(instruction_tlb_miss_htw)
+	TLB_MISS_PROLOG
+
+	/* If we take a recursive fault, the second level handler may need
+	 * to know whether we are handling a data or instruction fault in
+	 * order to get to the right store fault handler. We provide that
+	 * info by keeping a crazy value for ESR in r14
+	 */
+	li	r14,-1	/* store to exception frame is done later */
+
+	/* Now we handle the fault proper. We only save DEAR in the non
+	 * linear mapping case since we know the linear mapping case will
+	 * not re-enter. We could indeed optimize and also not save SRR0/1
+	 * in the linear mapping case but I'll leave that for later
+	 *
+	 * Faulting address is SRR0 which is already in r16
+	 */
+	srdi	r11,r16,60		/* get region */
+	cmpldi	cr0,r11,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	cmpldi	cr0,r11,0			/* Check for user region */
+	ld	r15,PACAPGD(r13)		/* Load user pgdir */
+	beq	htw_tlb_miss
+
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r11,8			/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1			/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	ld	r15,PACA_KERNELPGD(r13)		/* Load kernel pgdir */
+	beq+	htw_tlb_miss
+
+	/* We got a crappy address, just fault */
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = PGD pointer
+ * r14 = ESR
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will save/restore things for us
+ */
+htw_tlb_miss:
+	/* Search if we already have a TLB entry for that virtual address, and
+	 * if we do, bail out.
+	 *
+	 * MAS1:IND should be already set based on MAS4
+	 */
+	PPC_TLBSRX_DOT(0,R16)
+	beq	htw_tlb_miss_done
+
+	/* Now, we need to walk the page tables. First check if we are in
+	 * range.
+	 */
+	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+	bne-	htw_tlb_miss_fault
+
+	/* Get the PGD pointer */
+	cmpldi	cr0,r15,0
+	beq-	htw_tlb_miss_fault
+
+	/* Get to PGD entry */
+	rldicl	r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
+
+#ifndef CONFIG_PPC_64K_PAGES
+	/* Get to PUD entry */
+	rldicl	r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	/* Get to PMD entry */
+	rldicl	r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
+
+	/* Ok, we're all right, we can now create an indirect entry for
+	 * a 1M or 256M page.
+	 *
+	 * The last trick is now that because we use "half" pages for
+	 * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
+	 * for an added LSB bit to the RPN. For 64K pages, there is no
+	 * problem as we already use 32K arrays (half PTE pages), but for
+	 * 4K page we need to extract a bit from the virtual address and
+	 * insert it into the "PA52" bit of the RPN.
+	 */
+#ifndef CONFIG_PPC_64K_PAGES
+	rlwimi	r15,r16,32-9,20,20
+#endif
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE for now is base ind page size always
+	 * MAS 2   :	Use defaults
+	 * MAS 3+7 :	Needs to be done
+	 */
+#ifdef CONFIG_PPC_64K_PAGES
+	ori	r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
+#else
+	ori	r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+#endif
+
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r10,32
+	mtspr	SPRN_MAS3,r10
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+htw_tlb_miss_done:
+	/* We don't bother with restoring DEAR or ESR since we know we are
+	 * level 0 and just going back to userland. They are only needed
+	 * if you are going to take an access fault
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
+	TLB_MISS_EPILOG_SUCCESS
+	rfi
+
+htw_tlb_miss_fault:
+	/* We need to check if it was an instruction miss. We know this
+	 * though because r14 would contain -1
+	 */
+	cmpdi	cr0,r14,-1
+	beq	1f
+	mtspr	SPRN_DEAR,r16
+	mtspr	SPRN_ESR,r14
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+/*
+ * This is the guts of "any" level TLB miss handler for kernel linear
+ * mapping misses. We are entered with:
+ *
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = ESR (data) or -1 (instruction)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * In addition we know that we will not re-enter, so in theory, we could
+ * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later.
+ *
+ * We also need to be careful about MAS registers here & TLB reservation,
+ * as we know we'll have clobbered them if we interrupt the main TLB miss
+ * handlers in which case we probably want to do a full restart at level
+ * 0 rather than saving / restoring the MAS.
+ *
+ * Note: If we care about performance of that core, we can easily shuffle
+ *       a few things around
+ */
+tlb_load_linear:
+	/* For now, we assume the linear mapping is contiguous and stops at
+	 * linear_map_top. We also assume the size is a multiple of 1G, thus
+	 * we only use 1G pages for now. That might have to be changed in a
+	 * final implementation, especially when dealing with hypervisors
+	 */
+	ld	r11,PACATOC(r13)
+	ld	r11,linear_map_top@got(r11)
+	ld	r10,0(r11)
+	tovirt(10,10)
+	cmpld	cr0,r16,r10
+	bge	tlb_load_linear_fault
+
+	/* MAS1 need whole new setup. */
+	li	r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
+	oris	r15,r15,MAS1_VALID@h	/* MAS1 needs V and TSIZE */
+	mtspr	SPRN_MAS1,r15
+
+	/* Already somebody there ? */
+	PPC_TLBSRX_DOT(0,R16)
+	beq	tlb_load_linear_done
+
+	/* Now we build the remaining MAS. MAS0 and 2 should be fine
+	 * with their defaults, which leaves us with MAS 3 and 7. The
+	 * mapping is linear, so we just take the address, clear the
+	 * region bits, and or in the permission bits which are currently
+	 * hard wired
+	 */
+	clrrdi	r10,r16,30		/* 1G page index */
+	clrldi	r10,r10,4		/* clear region bits */
+	ori	r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
+
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r10,32
+	mtspr	SPRN_MAS3,r10
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+tlb_load_linear_done:
+	/* We use the "error" epilog for success as we do want to
+	 * restore to the initial faulting context, whatever it was.
+	 * We do that because we can't resume a fault within a TLB
+	 * miss handler, due to MAS and TLB reservation being clobbered.
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
+	TLB_MISS_EPILOG_ERROR
+	rfi
+
+tlb_load_linear_fault:
+	/* We keep the DEAR and ESR around, this shouldn't have happened */
+	cmpdi	cr0,r14,-1
+	beq	1f
+	TLB_MISS_EPILOG_ERROR_SPECIAL
+	b	exc_data_storage_book3e
+1:	TLB_MISS_EPILOG_ERROR_SPECIAL
+	b	exc_instruction_storage_book3e
+
+
+#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
+.tlb_stat_inc:
+1:	ldarx	r8,0,r9
+	addi	r8,r8,1
+	stdcx.	r8,0,r9
+	bne-	1b
+	blr
+#endif
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
new file mode 100644
index 00000000000..92cb18d52ea
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -0,0 +1,732 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (ie, SW loaded TLBs or Book3E compilant processors,
+ * this does -not- include 603 however which shares the implementation with
+ * hash based processors)
+ *
+ *  -- BenH
+ *
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                     IBM Corp.
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/code-patching.h>
+#include <asm/hugetlb.h>
+#include <asm/paca.h>
+
+#include "mmu_decl.h"
+
+/*
+ * This struct lists the sw-supported page sizes.  The hardawre MMU may support
+ * other sizes not listed here.   The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_BOOK3E_MMU
+#ifdef CONFIG_PPC_FSL_BOOK3E
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_2M] = {
+		.shift	= 21,
+		.enc	= BOOK3E_PAGESZ_2M,
+	},
+	[MMU_PAGE_4M] = {
+		.shift	= 22,
+		.enc	= BOOK3E_PAGESZ_4M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_64M] = {
+		.shift	= 26,
+		.enc	= BOOK3E_PAGESZ_64M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+#else
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.ind	= 20,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_16K] = {
+		.shift	= 14,
+		.enc	= BOOK3E_PAGESZ_16K,
+	},
+	[MMU_PAGE_64K] = {
+		.shift	= 16,
+		.ind	= 28,
+		.enc	= BOOK3E_PAGESZ_64K,
+	},
+	[MMU_PAGE_1M] = {
+		.shift	= 20,
+		.enc	= BOOK3E_PAGESZ_1M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.ind	= 36,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+#endif /* CONFIG_FSL_BOOKE */
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+	/* This isn't used on !Book3E for now */
+	return 0;
+}
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+#ifdef CONFIG_PPC64
+
+int mmu_linear_psize;		/* Page size used for the linear mapping */
+int mmu_pte_psize;		/* Page size used for PTE pages */
+int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */
+int book3e_htw_mode;		/* HW tablewalk?  Value is PPC_HTW_* */
+unsigned long linear_map_top;	/* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions.  This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_pid(pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+			    int tsize, int ind)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm ? mm->context.id : 0;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_va(vmaddr, pid, tsize, ind);
+	preempt_enable();
+}
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			       mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_RAW_SPINLOCK(tlbivax_lock);
+
+static int mm_is_core_local(struct mm_struct *mm)
+{
+	return cpumask_subset(mm_cpumask(mm),
+			      topology_thread_cpumask(smp_processor_id()));
+}
+
+struct tlb_flush_param {
+	unsigned long addr;
+	unsigned int pid;
+	unsigned int tsize;
+	unsigned int ind;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invaliating some target that isn't currently running here
+ *   and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto no_context;
+	if (!mm_is_core_local(mm)) {
+		struct tlb_flush_param p = { .pid = pid };
+		/* Ignores smp_processor_id() even if set. */
+		smp_call_function_many(mm_cpumask(mm),
+				       do_flush_tlb_mm_ipi, &p, 1);
+	}
+	_tlbil_pid(pid);
+ no_context:
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+		      int tsize, int ind)
+{
+	struct cpumask *cpu_mask;
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm ? mm->context.id : 0;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto bail;
+	cpu_mask = mm_cpumask(mm);
+	if (!mm_is_core_local(mm)) {
+		/* If broadcast tlbivax is supported, use it */
+		if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
+			int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
+			if (lock)
+				raw_spin_lock(&tlbivax_lock);
+			_tlbivax_bcast(vmaddr, pid, tsize, ind);
+			if (lock)
+				raw_spin_unlock(&tlbivax_lock);
+			goto bail;
+		} else {
+			struct tlb_flush_param p = {
+				.pid = pid,
+				.addr = vmaddr,
+				.tsize = tsize,
+				.ind = ind,
+			};
+			/* Ignores smp_processor_id() even if set in cpu_mask */
+			smp_call_function_many(cpu_mask,
+					       do_flush_tlb_page_ipi, &p, 1);
+		}
+	}
+	_tlbil_va(vmaddr, pid, tsize, ind);
+ bail:
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	if (vma && is_vm_hugetlb_page(vma))
+		flush_hugetlb_page(vma, vmaddr);
+#endif
+
+	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			 mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PPC_47x
+void __init early_init_mmu_47x(void)
+{
+#ifdef CONFIG_SMP
+	unsigned long root = of_get_flat_dt_root();
+	if (of_get_flat_dt_prop(root, "cooperative-partition", NULL))
+		mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
+#endif /* CONFIG_SMP */
+}
+#endif /* CONFIG_PPC_47x */
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_SMP
+	preempt_disable();
+	smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
+	_tlbil_pid(0);
+	preempt_enable();
+#else
+	_tlbil_pid(0);
+#endif
+}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementation can stack multiple tlbivax before a tlbsync but
+ * for now, we keep it that way
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+
+{
+	flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void tlb_flush(struct mmu_gather *tlb)
+{
+	flush_tlb_mm(tlb->mm);
+}
+
+/*
+ * Below are functions specific to the 64-bit variant of Book3E though that
+ * may change in the future
+ */
+
+#ifdef CONFIG_PPC64
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+	int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+
+	if (book3e_htw_mode != PPC_HTW_NONE) {
+		unsigned long start = address & PMD_MASK;
+		unsigned long end = address + PMD_SIZE;
+		unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+		/* This isn't the most optimal, ideally we would factor out the
+		 * while preempt & CPU mask mucking around, or even the IPI but
+		 * it will do for now
+		 */
+		while (start < end) {
+			__flush_tlb_page(tlb->mm, start, tsize, 1);
+			start += size;
+		}
+	} else {
+		unsigned long rmask = 0xf000000000000000ul;
+		unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+		unsigned long vpte = address & ~rmask;
+
+#ifdef CONFIG_PPC_64K_PAGES
+		vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
+#else
+		vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+#endif
+		vpte |= rid;
+		__flush_tlb_page(tlb->mm, vpte, tsize, 0);
+	}
+}
+
+static void setup_page_sizes(void)
+{
+	unsigned int tlb0cfg;
+	unsigned int tlb0ps;
+	unsigned int eptcfg;
+	int i, psize;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+	int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
+
+	if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+		unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
+		unsigned int min_pg, max_pg;
+
+		min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
+		max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
+
+		for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+			struct mmu_psize_def *def;
+			unsigned int shift;
+
+			def = &mmu_psize_defs[psize];
+			shift = def->shift;
+
+			if (shift == 0 || shift & 1)
+				continue;
+
+			/* adjust to be in terms of 4^shift Kb */
+			shift = (shift - 10) >> 1;
+
+			if ((shift >= min_pg) && (shift <= max_pg))
+				def->flags |= MMU_PAGE_SIZE_DIRECT;
+		}
+
+		goto out;
+	}
+
+	if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+		u32 tlb1cfg, tlb1ps;
+
+		tlb0cfg = mfspr(SPRN_TLB0CFG);
+		tlb1cfg = mfspr(SPRN_TLB1CFG);
+		tlb1ps = mfspr(SPRN_TLB1PS);
+		eptcfg = mfspr(SPRN_EPTCFG);
+
+		if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+			book3e_htw_mode = PPC_HTW_E6500;
+
+		/*
+		 * We expect 4K subpage size and unrestricted indirect size.
+		 * The lack of a restriction on indirect size is a Freescale
+		 * extension, indicated by PSn = 0 but SPSn != 0.
+		 */
+		if (eptcfg != 2)
+			book3e_htw_mode = PPC_HTW_NONE;
+
+		for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+			struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+			if (tlb1ps & (1U << (def->shift - 10))) {
+				def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+				if (book3e_htw_mode && psize == MMU_PAGE_2M)
+					def->flags |= MMU_PAGE_SIZE_INDIRECT;
+			}
+		}
+
+		goto out;
+	}
+#endif
+
+	tlb0cfg = mfspr(SPRN_TLB0CFG);
+	tlb0ps = mfspr(SPRN_TLB0PS);
+	eptcfg = mfspr(SPRN_EPTCFG);
+
+	/* Look for supported direct sizes */
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+		if (tlb0ps & (1U << (def->shift - 10)))
+			def->flags |= MMU_PAGE_SIZE_DIRECT;
+	}
+
+	/* Indirect page sizes supported ? */
+	if ((tlb0cfg & TLBnCFG_IND) == 0 ||
+	    (tlb0cfg & TLBnCFG_PT) == 0)
+		goto out;
+
+	book3e_htw_mode = PPC_HTW_IBM;
+
+	/* Now, we only deal with one IND page size for each
+	 * direct size. Hopefully all implementations today are
+	 * unambiguous, but we might want to be careful in the
+	 * future.
+	 */
+	for (i = 0; i < 3; i++) {
+		unsigned int ps, sps;
+
+		sps = eptcfg & 0x1f;
+		eptcfg >>= 5;
+		ps = eptcfg & 0x1f;
+		eptcfg >>= 5;
+		if (!ps || !sps)
+			continue;
+		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+			struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+			if (ps == (def->shift - 10))
+				def->flags |= MMU_PAGE_SIZE_INDIRECT;
+			if (sps == (def->shift - 10))
+				def->ind = ps + 10;
+		}
+	}
+
+out:
+	/* Cleanup array and print summary */
+	pr_info("MMU: Supported page sizes\n");
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+		const char *__page_type_names[] = {
+			"unsupported",
+			"direct",
+			"indirect",
+			"direct & indirect"
+		};
+		if (def->flags == 0) {
+			def->shift = 0;	
+			continue;
+		}
+		pr_info("  %8ld KB as %s\n", 1ul << (def->shift - 10),
+			__page_type_names[def->flags & 0x3]);
+	}
+}
+
+static void setup_mmu_htw(void)
+{
+	/*
+	 * If we want to use HW tablewalk, enable it by patching the TLB miss
+	 * handlers to branch to the one dedicated to it.
+	 */
+
+	switch (book3e_htw_mode) {
+	case PPC_HTW_IBM:
+		patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
+		patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
+		break;
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	case PPC_HTW_E6500:
+		extlb_level_exc = EX_TLB_SIZE;
+		patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+		patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+		break;
+#endif
+	}
+	pr_info("MMU: Book3E HW tablewalk %s\n",
+		book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void __early_init_mmu(int boot_cpu)
+{
+	unsigned int mas4;
+
+	/* XXX This will have to be decided at runtime, but right
+	 * now our boot and TLB miss code hard wires it. Ideally
+	 * we should find out a suitable page size and patch the
+	 * TLB miss code (either that or use the PACA to store
+	 * the value we want)
+	 */
+	mmu_linear_psize = MMU_PAGE_1G;
+
+	/* XXX This should be decided at runtime based on supported
+	 * page sizes in the TLB, but for now let's assume 16M is
+	 * always there and a good fit (which it probably is)
+	 *
+	 * Freescale booke only supports 4K pages in TLB0, so use that.
+	 */
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+		mmu_vmemmap_psize = MMU_PAGE_4K;
+	else
+		mmu_vmemmap_psize = MMU_PAGE_16M;
+
+	/* XXX This code only checks for TLB 0 capabilities and doesn't
+	 *     check what page size combos are supported by the HW. It
+	 *     also doesn't handle the case where a separate array holds
+	 *     the IND entries from the array loaded by the PT.
+	 */
+	if (boot_cpu) {
+		/* Look for supported page sizes */
+		setup_page_sizes();
+
+		/* Look for HW tablewalk support */
+		setup_mmu_htw();
+	}
+
+	/* Set MAS4 based on page table setting */
+
+	mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+	switch (book3e_htw_mode) {
+	case PPC_HTW_E6500:
+		mas4 |= MAS4_INDD;
+		mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+		mas4 |= MAS4_TLBSELD(1);
+		mmu_pte_psize = MMU_PAGE_2M;
+		break;
+
+	case PPC_HTW_IBM:
+		mas4 |= MAS4_INDD;
+#ifdef CONFIG_PPC_64K_PAGES
+		mas4 |=	BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
+		mmu_pte_psize = MMU_PAGE_256M;
+#else
+		mas4 |=	BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
+		mmu_pte_psize = MMU_PAGE_1M;
+#endif
+		break;
+
+	case PPC_HTW_NONE:
+#ifdef CONFIG_PPC_64K_PAGES
+		mas4 |=	BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
+#else
+		mas4 |=	BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+#endif
+		mmu_pte_psize = mmu_virtual_psize;
+		break;
+	}
+	mtspr(SPRN_MAS4, mas4);
+
+	/* Set the global containing the top of the linear mapping
+	 * for use by the TLB miss code
+	 */
+	linear_map_top = memblock_end_of_DRAM();
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+		unsigned int num_cams;
+
+		/* use a quarter of the TLBCAM for bolted linear map */
+		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+		linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
+
+		/* limit memory so we dont have linear faults */
+		memblock_enforce_memory_limit(linear_map_top);
+
+		if (book3e_htw_mode == PPC_HTW_NONE) {
+			extlb_level_exc = EX_TLB_SIZE;
+			patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
+			patch_exception(0x1e0,
+				exc_instruction_tlb_miss_bolted_book3e);
+		}
+	}
+#endif
+
+	/* A sync won't hurt us after mucking around with
+	 * the MMU configuration
+	 */
+	mb();
+
+	memblock_set_current_limit(linear_map_top);
+}
+
+void __init early_init_mmu(void)
+{
+	__early_init_mmu(1);
+}
+
+void early_init_mmu_secondary(void)
+{
+	__early_init_mmu(0);
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
+{
+	/* On non-FSL Embedded 64-bit, we adjust the RMA size to match
+	 * the bolted TLB entry. We know for now that only 1G
+	 * entries are supported though that may eventually
+	 * change.
+	 *
+	 * on FSL Embedded 64-bit, we adjust the RMA size to match the
+	 * first bolted TLB entry size.  We still limit max to 1G even if
+	 * the TLB could cover more.  This is due to what the early init
+	 * code is setup to do.
+	 *
+	 * We crop it to the size of the first MEMBLOCK to
+	 * avoid going over total available memory just in case...
+	 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+		unsigned long linear_sz;
+		linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET,
+					first_memblock_base);
+		ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+	} else
+#endif
+		ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+
+	/* Finally limit subsequent allocations */
+	memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
+}
+#else /* ! CONFIG_PPC64 */
+void __init early_init_mmu(void)
+{
+#ifdef CONFIG_PPC_47x
+	early_init_mmu_47x();
+#endif
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
new file mode 100644
index 00000000000..43ff3c797fb
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -0,0 +1,426 @@
+/*
+ * This file contains low-level functions for performing various
+ * types of TLB invalidations on various processors with no hash
+ * table.
+ *
+ * This file implements the following functions for all no-hash
+ * processors. Some aren't implemented for some variants. Some
+ * are inline in tlbflush.h
+ *
+ *	- tlbil_va
+ *	- tlbil_pid
+ *	- tlbil_all
+ *	- tlbivax_bcast
+ *
+ * Code mostly moved over from misc_32.S
+ *
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+#include <asm/bug.h>
+
+#if defined(CONFIG_40x)
+
+/*
+ * 40x implementation needs only tlbil_va
+ */
+_GLOBAL(__tlbil_va)
+	/* We run the search with interrupts disabled because we have to change
+	 * the PID and I don't want to preempt when that happens.
+	 */
+	mfmsr	r5
+	mfspr	r6,SPRN_PID
+	wrteei	0
+	mtspr	SPRN_PID,r4
+	tlbsx.	r3, 0, r3
+	mtspr	SPRN_PID,r6
+	wrtee	r5
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
+	 * clear. Since 25 is the V bit in the TLB_TAG, loading this value
+	 * will invalidate the TLB entry. */
+	tlbwe	r3, r3, TLB_TAG
+	isync
+1:	blr
+
+#elif defined(CONFIG_8xx)
+
+/*
+ * Nothing to do for 8xx, everything is inline
+ */
+
+#elif defined(CONFIG_44x) /* Includes 47x */
+
+/*
+ * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
+ * of the TLB for everything else.
+ */
+_GLOBAL(__tlbil_va)
+	mfspr	r5,SPRN_MMUCR
+	mfmsr   r10
+
+	/*
+	 * We write 16 bits of STID since 47x supports that much, we
+	 * will never be passed out of bounds values on 440 (hopefully)
+	 */
+	rlwimi  r5,r4,0,16,31
+
+	/* We have to run the search with interrupts disabled, otherwise
+	 * an interrupt which causes a TLB miss can clobber the MMUCR
+	 * between the mtspr and the tlbsx.
+	 *
+	 * Critical and Machine Check interrupts take care of saving
+	 * and restoring MMUCR, so only normal interrupts have to be
+	 * taken care of.
+	 */
+	wrteei	0
+	mtspr	SPRN_MMUCR,r5
+	tlbsx.	r6,0,r3
+	bne	10f
+	sync
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	/* On 440 There are only 64 TLB entries, so r3 < 64, which means bit
+	 * 22, is clear.  Since 22 is the V bit in the TLB_PAGEID, loading this
+	 * value will invalidate the TLB entry.
+	 */
+	tlbwe	r6,r6,PPC44x_TLB_PAGEID
+	isync
+10:	wrtee	r10
+	blr
+2:
+#ifdef CONFIG_PPC_47x
+	oris	r7,r6,0x8000	/* specify way explicitely */
+	clrrwi	r4,r3,12	/* get an EPN for the hashing with V = 0 */
+	ori	r4,r4,PPC47x_TLBE_SIZE
+	tlbwe   r4,r7,0		/* write it */
+	isync
+	wrtee	r10
+	blr
+#else /* CONFIG_PPC_47x */
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+
+_GLOBAL(_tlbil_all)
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	li	r3,0
+	sync
+
+	/* Load high watermark */
+	lis	r4,tlb_44x_hwater@ha
+	lwz	r5,tlb_44x_hwater@l(r4)
+
+1:	tlbwe	r3,r3,PPC44x_TLB_PAGEID
+	addi	r3,r3,1
+	cmpw	0,r3,r5
+	ble	1b
+
+	isync
+	blr
+2:
+#ifdef CONFIG_PPC_47x
+	/* 476 variant. There's not simple way to do this, hopefully we'll
+	 * try to limit the amount of such full invalidates
+	 */
+	mfmsr	r11		/* Interrupts off */
+	wrteei	0
+	li	r3,-1		/* Current set */
+	lis	r10,tlb_47x_boltmap@h
+	ori	r10,r10,tlb_47x_boltmap@l
+	lis	r7,0x8000	/* Specify way explicitely */
+
+	b	9f		/* For each set */
+
+1:	li	r9,4		/* Number of ways */
+	li	r4,0		/* Current way */
+	li	r6,0		/* Default entry value 0 */
+	andi.	r0,r8,1		/* Check if way 0 is bolted */
+	mtctr	r9		/* Load way counter */
+	bne-	3f		/* Bolted, skip loading it */
+
+2:	/* For each way */
+	or	r5,r3,r4	/* Make way|index for tlbre */
+	rlwimi	r5,r5,16,8,15	/* Copy index into position */
+	tlbre	r6,r5,0		/* Read entry */
+3:	addis	r4,r4,0x2000	/* Next way */
+	andi.	r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */
+	beq	4f		/* Nope, skip it */
+	rlwimi	r7,r5,0,1,2	/* Insert way number */
+	rlwinm	r6,r6,0,21,19	/* Clear V */
+	tlbwe   r6,r7,0		/* Write it */
+4:	bdnz	2b		/* Loop for each way */
+	srwi	r8,r8,1		/* Next boltmap bit */
+9:	cmpwi	cr1,r3,255	/* Last set done ? */
+	addi	r3,r3,1		/* Next set */
+	beq	cr1,1f		/* End of loop */
+	andi.	r0,r3,0x1f	/* Need to load a new boltmap word ? */
+	bne	1b		/* No, loop */
+	lwz	r8,0(r10)	/* Load boltmap entry */
+	addi	r10,r10,4	/* Next word */
+	b	1b		/* Then loop */
+1:	isync			/* Sync shadows */
+	wrtee	r11
+#else /* CONFIG_PPC_47x */
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+	blr
+
+#ifdef CONFIG_PPC_47x
+
+/*
+ * _tlbivax_bcast is only on 47x. We don't bother doing a runtime
+ * check though, it will blow up soon enough if we mistakenly try
+ * to use it on a 440.
+ */
+_GLOBAL(_tlbivax_bcast)
+	mfspr	r5,SPRN_MMUCR
+	mfmsr	r10
+	rlwimi	r5,r4,0,16,31
+	wrteei	0
+	mtspr	SPRN_MMUCR,r5
+	isync
+	PPC_TLBIVAX(0, R3)
+	isync
+	eieio
+	tlbsync
+BEGIN_FTR_SECTION
+	b	1f
+END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
+	sync
+	wrtee	r10
+	blr
+/*
+ * DD2 HW could hang if in instruction fetch happens before msync completes.
+ * Touch enough instruction cache lines to ensure cache hits
+ */
+1:	mflr	r9
+	bl	2f
+2:	mflr	r6
+	li	r7,32
+	PPC_ICBT(0,R6,R7)		/* touch next cache line */
+	add	r6,r6,r7
+	PPC_ICBT(0,R6,R7)		/* touch next cache line */
+	add	r6,r6,r7
+	PPC_ICBT(0,R6,R7)		/* touch next cache line */
+	sync
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	mtlr	r9
+	wrtee	r10
+	blr
+#endif /* CONFIG_PPC_47x */
+
+#elif defined(CONFIG_FSL_BOOKE)
+/*
+ * FSL BookE implementations.
+ *
+ * Since feature sections are using _SECTION_ELSE we need
+ * to have the larger code path before the _SECTION_ELSE
+ */
+
+/*
+ * Flush MMU TLB on the local processor
+ */
+_GLOBAL(_tlbil_all)
+BEGIN_MMU_FTR_SECTION
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+MMU_FTR_SECTION_ELSE
+	PPC_TLBILX_ALL(0,R0)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	slwi	r3,r3,16
+	mfmsr	r10
+	wrteei	0
+	mfspr	r4,SPRN_MAS6	/* save MAS6 */
+	mtspr	SPRN_MAS6,r3
+	PPC_TLBILX_PID(0,R0)
+	mtspr	SPRN_MAS6,r4	/* restore MAS6 */
+	wrtee	r10
+MMU_FTR_SECTION_ELSE
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+/*
+ * Flush MMU TLB for a particular address, but only on the local processor
+ * (no broadcast)
+ */
+_GLOBAL(__tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	slwi	r4,r4,16
+	ori	r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l
+	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+BEGIN_MMU_FTR_SECTION
+	tlbsx	0,r3
+	mfspr	r4,SPRN_MAS1		/* check valid */
+	andis.	r3,r4,MAS1_VALID@h
+	beq	1f
+	rlwinm	r4,r4,0,1,31
+	mtspr	SPRN_MAS1,r4
+	tlbwe
+MMU_FTR_SECTION_ELSE
+	PPC_TLBILX_VA(0,R3)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+1:	wrtee	r10
+	blr
+#elif defined(CONFIG_PPC_BOOK3E)
+/*
+ * New Book3E (>= 2.06) implementation
+ *
+ * Note: We may be able to get away without the interrupt masking stuff
+ * if we save/restore MAS6 on exceptions that might modify it
+ */
+_GLOBAL(_tlbil_pid)
+	slwi	r4,r3,MAS6_SPID_SHIFT
+	mfmsr	r10
+	wrteei	0
+	mtspr	SPRN_MAS6,r4
+	PPC_TLBILX_PID(0,R0)
+	wrtee	r10
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_pid_noind)
+	slwi	r4,r3,MAS6_SPID_SHIFT
+	mfmsr	r10
+	ori	r4,r4,MAS6_SIND
+	wrteei	0
+	mtspr	SPRN_MAS6,r4
+	PPC_TLBILX_PID(0,R0)
+	wrtee	r10
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_all)
+	PPC_TLBILX_ALL(0,R0)
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	cmpwi	cr0,r6,0
+	slwi	r4,r4,MAS6_SPID_SHIFT
+	rlwimi	r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
+	beq	1f
+	rlwimi	r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
+1:	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+	PPC_TLBILX_VA(0,R3)
+	msync
+	isync
+	wrtee	r10
+	blr
+
+_GLOBAL(_tlbivax_bcast)
+	mfmsr	r10
+	wrteei	0
+	cmpwi	cr0,r6,0
+	slwi	r4,r4,MAS6_SPID_SHIFT
+	rlwimi	r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
+	beq	1f
+	rlwimi	r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
+1:	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+	PPC_TLBIVAX(0,R3)
+	eieio
+	tlbsync
+	sync
+	wrtee	r10
+	blr
+
+_GLOBAL(set_context)
+#ifdef CONFIG_BDI_SWITCH
+	/* Context switch the PTE pointer for the Abatron BDI2000.
+	 * The PGDIR is the second parameter.
+	 */
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r4, 0x4(r5)
+#endif
+	mtspr	SPRN_PID,r3
+	isync			/* Force context change */
+	blr
+#else
+#error Unsupported processor type !
+#endif
+
+#if defined(CONFIG_PPC_FSL_BOOK3E)
+/*
+ * extern void loadcam_entry(unsigned int index)
+ *
+ * Load TLBCAM[index] entry in to the L2 CAM MMU
+ */
+_GLOBAL(loadcam_entry)
+	mflr	r5
+	LOAD_REG_ADDR_PIC(r4, TLBCAM)
+	mtlr	r5
+	mulli	r5,r3,TLBCAM_SIZE
+	add	r3,r5,r4
+	lwz	r4,TLBCAM_MAS0(r3)
+	mtspr	SPRN_MAS0,r4
+	lwz	r4,TLBCAM_MAS1(r3)
+	mtspr	SPRN_MAS1,r4
+	PPC_LL	r4,TLBCAM_MAS2(r3)
+	mtspr	SPRN_MAS2,r4
+	lwz	r4,TLBCAM_MAS3(r3)
+	mtspr	SPRN_MAS3,r4
+BEGIN_MMU_FTR_SECTION
+	lwz	r4,TLBCAM_MAS7(r3)
+	mtspr	SPRN_MAS7,r4
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
+	isync
+	tlbwe
+	isync
+	blr
+#endif