1 files changed, 0 insertions, 442 deletions
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c
deleted file mode 100644
index 10fec736396..00000000000
--- a/arch/ppc/kernel/dma-mapping.c
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- *  PowerPC version derived from arch/arm/mm/consistent.c
- *    Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
- *
- *  Copyright (C) 2000 Russell King
- *
- * Consistent memory allocators.  Used for DMA devices that want to
- * share uncached memory with the processor core.  The function return
- * is the virtual address and 'dma_handle' is the physical address.
- * Mostly stolen from the ARM port, with some changes for PowerPC.
- *						-- Dan
- *
- * Reorganized to get rid of the arch-specific consistent_* functions
- * and provide non-coherent implementations for the DMA API. -Matt
- *
- * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
- * implementation. This is pulled straight from ARM and barely
- * modified. -Matt
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/stddef.h>
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/highmem.h>
-#include <linux/dma-mapping.h>
-#include <linux/hardirq.h>
-
-#include <asm/pgalloc.h>
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/machdep.h>
-
-int map_page(unsigned long va, phys_addr_t pa, int flags);
-
-#include <asm/tlbflush.h>
-
-/*
- * This address range defaults to a value that is safe for all
- * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
- * can be further configured for specific applications under
- * the "Advanced Setup" menu. -Matt
- */
-#define CONSISTENT_BASE	(CONFIG_CONSISTENT_START)
-#define CONSISTENT_END	(CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE)
-#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-
-/*
- * This is the page table (2MB) covering uncached, DMA consistent allocations
- */
-static pte_t *consistent_pte;
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vm_region	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- *  struct vm_region vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct vm_region {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-};
-
-static struct vm_region consistent_head = {
-	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
-	.vm_start	= CONSISTENT_BASE,
-	.vm_end		= CONSISTENT_END,
-};
-
-static struct vm_region *
-vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp)
-{
-	unsigned long addr = head->vm_start, end = head->vm_end - size;
-	unsigned long flags;
-	struct vm_region *c, *new;
-
-	new = kmalloc(sizeof(struct vm_region), gfp);
-	if (!new)
-		goto out;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if ((addr + size) < addr)
-			goto nospc;
-		if ((addr + size) <= c->vm_start)
-			goto found;
-		addr = c->vm_end;
-		if (addr > end)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry _before_ the one we found.
-	 */
-	list_add_tail(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct vm_region *vm_region_find(struct vm_region *head, unsigned long addr)
-{
-	struct vm_region *c;
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *
-__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
-{
-	struct page *page;
-	struct vm_region *c;
-	unsigned long order;
-	u64 mask = 0x00ffffff, limit; /* ISA default */
-
-	if (!consistent_pte) {
-		printk(KERN_ERR "%s: not initialised\n", __func__);
-		dump_stack();
-		return NULL;
-	}
-
-	size = PAGE_ALIGN(size);
-	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) {
-		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
-		       size, mask);
-		return NULL;
-	}
-
-	order = get_order(size);
-
-	if (mask != 0xffffffff)
-		gfp |= GFP_DMA;
-
-	page = alloc_pages(gfp, order);
-	if (!page)
-		goto no_page;
-
-	/*
-	 * Invalidate any data that might be lurking in the
-	 * kernel direct-mapped region for device DMA.
-	 */
-	{
-		unsigned long kaddr = (unsigned long)page_address(page);
-		memset(page_address(page), 0, size);
-		flush_dcache_range(kaddr, kaddr + size);
-	}
-
-	/*
-	 * Allocate a virtual address in the consistent mapping region.
-	 */
-	c = vm_region_alloc(&consistent_head, size,
-			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
-	if (c) {
-		unsigned long vaddr = c->vm_start;
-		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
-		struct page *end = page + (1 << order);
-
-		split_page(page, order);
-
-		/*
-		 * Set the "dma handle"
-		 */
-		*handle = page_to_bus(page);
-
-		do {
-			BUG_ON(!pte_none(*pte));
-
-			SetPageReserved(page);
-			set_pte_at(&init_mm, vaddr,
-				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
-			page++;
-			pte++;
-			vaddr += PAGE_SIZE;
-		} while (size -= PAGE_SIZE);
-
-		/*
-		 * Free the otherwise unused pages.
-		 */
-		while (page < end) {
-			__free_page(page);
-			page++;
-		}
-
-		return (void *)c->vm_start;
-	}
-
-	if (page)
-		__free_pages(page, order);
- no_page:
-	return NULL;
-}
-EXPORT_SYMBOL(__dma_alloc_coherent);
-
-/*
- * free a page as defined by the above mapping.
- */
-void __dma_free_coherent(size_t size, void *vaddr)
-{
-	struct vm_region *c;
-	unsigned long flags, addr;
-	pte_t *ptep;
-
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	c = vm_region_find(&consistent_head, (unsigned long)vaddr);
-	if (!c)
-		goto no_area;
-
-	if ((c->vm_end - c->vm_start) != size) {
-		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
-		       __func__, c->vm_end - c->vm_start, size);
-		dump_stack();
-		size = c->vm_end - c->vm_start;
-	}
-
-	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
-	addr = c->vm_start;
-	do {
-		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
-		unsigned long pfn;
-
-		ptep++;
-		addr += PAGE_SIZE;
-
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-				ClearPageReserved(page);
-
-				__free_page(page);
-				continue;
-			}
-		}
-
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
-	} while (size -= PAGE_SIZE);
-
-	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
-	list_del(&c->vm_list);
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	kfree(c);
-	return;
-
- no_area:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
-	       __func__, vaddr);
-	dump_stack();
-}
-EXPORT_SYMBOL(__dma_free_coherent);
-
-/*
- * Initialise the consistent memory allocation.
- */
-static int __init dma_alloc_init(void)
-{
-	pgd_t *pgd;
-	pmd_t *pmd;
-	pte_t *pte;
-	int ret = 0;
-
-	do {
-		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
-		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
-		if (!pmd) {
-			printk(KERN_ERR "%s: no pmd tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-		WARN_ON(!pmd_none(*pmd));
-
-		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
-		if (!pte) {
-			printk(KERN_ERR "%s: no pte tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		consistent_pte = pte;
-	} while (0);
-
-	return ret;
-}
-
-core_initcall(dma_alloc_init);
-
-/*
- * make an area consistent.
- */
-void __dma_sync(void *vaddr, size_t size, int direction)
-{
-	unsigned long start = (unsigned long)vaddr;
-	unsigned long end   = start + size;
-
-	switch (direction) {
-	case DMA_NONE:
-		BUG();
-	case DMA_FROM_DEVICE:	/* invalidate only */
-		invalidate_dcache_range(start, end);
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
-		clean_dcache_range(start, end);
-		break;
-	case DMA_BIDIRECTIONAL:	/* writeback and invalidate */
-		flush_dcache_range(start, end);
-		break;
-	}
-}
-EXPORT_SYMBOL(__dma_sync);
-
-#ifdef CONFIG_HIGHMEM
-/*
- * __dma_sync_page() implementation for systems using highmem.
- * In this case, each page of a buffer must be kmapped/kunmapped
- * in order to have a virtual address for __dma_sync(). This must
- * not sleep so kmap_atomic()/kunmap_atomic() are used.
- *
- * Note: yes, it is possible and correct to have a buffer extend
- * beyond the first page.
- */
-static inline void __dma_sync_page_highmem(struct page *page,
-		unsigned long offset, size_t size, int direction)
-{
-	size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
-	size_t cur_size = seg_size;
-	unsigned long flags, start, seg_offset = offset;
-	int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
-	int seg_nr = 0;
-
-	local_irq_save(flags);
-
-	do {
-		start = (unsigned long)kmap_atomic(page + seg_nr,
-				KM_PPC_SYNC_PAGE) + seg_offset;
-
-		/* Sync this buffer segment */
-		__dma_sync((void *)start, seg_size, direction);
-		kunmap_atomic((void *)start, KM_PPC_SYNC_PAGE);
-		seg_nr++;
-
-		/* Calculate next buffer segment size */
-		seg_size = min((size_t)PAGE_SIZE, size - cur_size);
-
-		/* Add the segment size to our running total */
-		cur_size += seg_size;
-		seg_offset = 0;
-	} while (seg_nr < nr_segs);
-
-	local_irq_restore(flags);
-}
-#endif /* CONFIG_HIGHMEM */
-
-/*
- * __dma_sync_page makes memory consistent. identical to __dma_sync, but
- * takes a struct page instead of a virtual address
- */
-void __dma_sync_page(struct page *page, unsigned long offset,
-	size_t size, int direction)
-{
-#ifdef CONFIG_HIGHMEM
-	__dma_sync_page_highmem(page, offset, size, direction);
-#else
-	unsigned long start = (unsigned long)page_address(page) + offset;
-	__dma_sync((void *)start, size, direction);
-#endif
-}
-EXPORT_SYMBOL(__dma_sync_page);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ee728e433aa..670c312d914 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -2,12 +2,13 @@
 # Makefile for the linux kernel.
 #
 
+CFLAGS_prom.o		= -I$(src)/../../../scripts/dtc/libfdt
 CFLAGS_ptrace.o		+= -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ifeq ($(CONFIG_PPC64),y)
-CFLAGS_prom_init.o	+= -mno-minimal-toc
+CFLAGS_prom_init.o	+= $(NO_MINIMAL_TOC)
 endif
 ifeq ($(CONFIG_PPC32),y)
 CFLAGS_prom_init.o      += -fPIC
@@ -28,7 +29,7 @@ endif
 
 obj-y				:= cputable.o ptrace.o syscalls.o \
 				   irq.o align.o signal_32.o pmc.o vdso.o \
-				   init_task.o process.o systbl.o idle.o \
+				   process.o systbl.o idle.o \
 				   signal.o sysfs.o cacheinfo.o time.o \
 				   prom.o traps.o setup-common.o \
 				   udbg.o misc.o io.o dma.o \
@@ -38,16 +39,15 @@ obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   paca.o nvram_64.o firmware.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
-obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power7.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
-obj-$(CONFIG_PPC_A2)		+= cpu_setup_a2.o
 obj-$(CONFIG_PPC64)		+= vdso64/
 obj-$(CONFIG_ALTIVEC)		+= vecemu.o
 obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)	+= idle_power7.o
 obj-$(CONFIG_PPC_OF)		+= of_platform.o prom_parse.o
-obj-$(CONFIG_PPC_CLOCK)		+= clock.o
 procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
@@ -55,11 +55,13 @@ obj-$(CONFIG_PPC_RTAS)		+= rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)	+= rtasd.o
 obj-$(CONFIG_RTAS_FLASH)	+= rtas_flash.o
 obj-$(CONFIG_RTAS_PROC)		+= rtas-proc.o
-obj-$(CONFIG_LPARCFG)		+= lparcfg.o
 obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)           += ibmebus.o
+obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+				  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
+obj-$(CONFIG_FA_DUMP)		+= fadump.o
 ifeq ($(CONFIG_PPC32),y)
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
@@ -74,8 +76,8 @@ endif
 obj64-$(CONFIG_HIBERNATION)	+= swsusp_asm64.o
 obj-$(CONFIG_MODULES)		+= module.o module_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_44x)		+= cpu_setup_44x.o
-obj-$(CONFIG_PPC_FSL_BOOK3E)	+= cpu_setup_fsl_booke.o dbell.o
-obj-$(CONFIG_PPC_BOOK3E_64)	+= dbell.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)	+= cpu_setup_fsl_booke.o
+obj-$(CONFIG_PPC_DOORBELL)	+= dbell.o
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 
 extra-y				:= head_$(CONFIG_WORD_SIZE).o
@@ -90,16 +92,16 @@ obj-$(CONFIG_RELOCATABLE_PPC32)	+= reloc_32.o
 obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o
 obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
-obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
 obj-$(CONFIG_MODULES)		+= ppc_ksyms.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
 
-pci64-$(CONFIG_PPC64)		+= pci_dn.o isa-bridge.o
+pci64-$(CONFIG_PPC64)		+= pci_dn.o pci-hotplug.o isa-bridge.o
 obj-$(CONFIG_PCI)		+= pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
 				   pci-common.o pci_of_scan.o
 obj-$(CONFIG_PCI_MSI)		+= msi.o
@@ -113,22 +115,13 @@ obj-$(CONFIG_PPC_IO_WORKAROUNDS)	+= io-workarounds.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
-obj-$(CONFIG_PERF_EVENTS)	+= perf_callchain.o
 
-obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_event.o
-obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
-				   power5+-pmu.o power6-pmu.o power7-pmu.o
-obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
-
-obj-$(CONFIG_FSL_EMB_PERF_EVENT) += perf_event_fsl_emb.o
-obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o
-
-obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
-
-ifneq ($(CONFIG_PPC_INDIRECT_IO),y)
+ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
 obj-y				+= iomap.o
 endif
 
+obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM)	+= tm.o
+
 obj-$(CONFIG_PPC64)		+= $(obj64-y)
 obj-$(CONFIG_PPC32)		+= $(obj32-y)
 
@@ -136,6 +129,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
 endif
 
+obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 
 # Disable GCOV in odd or sensitive code
@@ -148,6 +142,7 @@ GCOV_PROFILE_kprobes.o := n
 extra-$(CONFIG_PPC_FPU)		+= fpu.o
 extra-$(CONFIG_ALTIVEC)		+= vector.o
 extra-$(CONFIG_PPC64)		+= entry_64.o
+extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
 
 extra-y				+= systbl_chk.i
 $(obj)/systbl.o:		systbl_chk
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 8184ee97e48..34f55524d45 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -21,18 +21,17 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <asm/cache.h>
 #include <asm/cputable.h>
 #include <asm/emulated_ops.h>
+#include <asm/switch_to.h>
+#include <asm/disassemble.h>
 
 struct aligninfo {
 	unsigned char len;
 	unsigned char flags;
 };
 
-#define IS_XFORM(inst)	(((inst) >> 26) == 31)
-#define IS_DSFORM(inst)	(((inst) >> 26) >= 56)
 
 #define INVALID	{ 0, 0 }
 
@@ -54,8 +53,6 @@ struct aligninfo {
 /* DSISR bits reported for a DCBZ instruction: */
 #define DCBZ	0x5f	/* 8xx/82xx dcbz faults when cache not enabled */
 
-#define SWAP(a, b)	(t = (a), (a) = (b), (b) = t)
-
 /*
  * The PowerPC stores certain bits of the instruction that caused the
  * alignment exception in the DSISR register.  This array maps those
@@ -75,7 +72,7 @@ static struct aligninfo aligninfo[128] = {
 	{ 8, LD+F },		/* 00 0 1001: lfd */
 	{ 4, ST+F+S },		/* 00 0 1010: stfs */
 	{ 8, ST+F },		/* 00 0 1011: stfd */
-	INVALID,		/* 00 0 1100 */
+	{ 16, LD },		/* 00 0 1100: lq */
 	{ 8, LD },		/* 00 0 1101: ld/ldu/lwa */
 	INVALID,		/* 00 0 1110 */
 	{ 8, ST },		/* 00 0 1111: std/stdu */
@@ -142,7 +139,7 @@ static struct aligninfo aligninfo[128] = {
 	{ 2, LD+SW },		/* 10 0 1100: lhbrx */
 	{ 4, LD+SE },		/* 10 0 1101  lwa */
 	{ 2, ST+SW },		/* 10 0 1110: sthbrx */
-	INVALID,		/* 10 0 1111 */
+	{ 16, ST },		/* 10 0 1111: stq */
 	INVALID,		/* 10 1 0000 */
 	INVALID,		/* 10 1 0001 */
 	INVALID,		/* 10 1 0010 */
@@ -194,37 +191,6 @@ static struct aligninfo aligninfo[128] = {
 };
 
 /*
- * Create a DSISR value from the instruction
- */
-static inline unsigned make_dsisr(unsigned instr)
-{
-	unsigned dsisr;
-
-
-	/* bits  6:15 --> 22:31 */
-	dsisr = (instr & 0x03ff0000) >> 16;
-
-	if (IS_XFORM(instr)) {
-		/* bits 29:30 --> 15:16 */
-		dsisr |= (instr & 0x00000006) << 14;
-		/* bit     25 -->    17 */
-		dsisr |= (instr & 0x00000040) << 8;
-		/* bits 21:24 --> 18:21 */
-		dsisr |= (instr & 0x00000780) << 3;
-	} else {
-		/* bit      5 -->    17 */
-		dsisr |= (instr & 0x04000000) >> 12;
-		/* bits  1: 4 --> 18:21 */
-		dsisr |= (instr & 0x78000000) >> 17;
-		/* bits 30:31 --> 12:13 */
-		if (IS_DSFORM(instr))
-			dsisr |= (instr & 0x00000003) << 18;
-	}
-
-	return dsisr;
-}
-
-/*
  * The dcbz (data cache block zero) instruction
  * gives an alignment fault if used on non-cacheable
  * memory.  We handle the fault mainly for the
@@ -256,11 +222,17 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
  * bottom 4 bytes of each register, and the loads clear the
  * top 4 bytes of the affected register.
  */
+#ifdef __BIG_ENDIAN__
 #ifdef CONFIG_PPC64
 #define REG_BYTE(rp, i)		*((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4)
 #else
 #define REG_BYTE(rp, i)		*((u8 *)(rp) + (i))
 #endif
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define REG_BYTE(rp, i)		(*(((u8 *)((rp) + ((i)>>2)) + ((i)&3))))
+#endif
 
 #define SWIZ_PTR(p)		((unsigned char __user *)((p) ^ swiz))
 
@@ -305,6 +277,15 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
 			nb0 = nb + reg * 4 - 128;
 			nb = 128 - reg * 4;
 		}
+#ifdef __LITTLE_ENDIAN__
+		/*
+		 *  String instructions are endian neutral but the code
+		 *  below is not.  Force byte swapping on so that the
+		 *  effects of swizzling are undone in the load/store
+		 *  loops below.
+		 */
+		flags ^= SW;
+#endif
 	} else {
 		/* lwm, stmw */
 		nb = (32 - reg) * 4;
@@ -372,8 +353,6 @@ static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg,
 	char *ptr1 = (char *) &current->thread.TS_FPR(reg+1);
 	int i, ret, sw = 0;
 
-	if (!(flags & F))
-		return 0;
 	if (reg & 1)
 		return 0;	/* invalid form: FRS/FRT must be even */
 	if (flags & SW)
@@ -393,6 +372,34 @@ static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg,
 	return 1;	/* exception handled and fixed up */
 }
 
+#ifdef CONFIG_PPC64
+static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr,
+			  unsigned int reg, unsigned int flags)
+{
+	char *ptr0 = (char *)&regs->gpr[reg];
+	char *ptr1 = (char *)&regs->gpr[reg+1];
+	int i, ret, sw = 0;
+
+	if (reg & 1)
+		return 0;	/* invalid form: GPR must be even */
+	if (flags & SW)
+		sw = 7;
+	ret = 0;
+	for (i = 0; i < 8; ++i) {
+		if (!(flags & ST)) {
+			ret |= __get_user(ptr0[i^sw], addr + i);
+			ret |= __get_user(ptr1[i^sw], addr + i + 8);
+		} else {
+			ret |= __put_user(ptr0[i^sw], addr + i);
+			ret |= __put_user(ptr1[i^sw], addr + i + 8);
+		}
+	}
+	if (ret)
+		return -EFAULT;
+	return 1;	/* exception handled and fixed up */
+}
+#endif /* CONFIG_PPC64 */
+
 #ifdef CONFIG_SPE
 
 static struct aligninfo spe_aligninfo[32] = {
@@ -458,7 +465,7 @@ static struct aligninfo spe_aligninfo[32] = {
 static int emulate_spe(struct pt_regs *regs, unsigned int reg,
 		       unsigned int instr)
 {
-	int t, ret;
+	int ret;
 	union {
 		u64 ll;
 		u32 w[2];
@@ -581,24 +588,18 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
 	if (flags & SW) {
 		switch (flags & 0xf0) {
 		case E8:
-			SWAP(data.v[0], data.v[7]);
-			SWAP(data.v[1], data.v[6]);
-			SWAP(data.v[2], data.v[5]);
-			SWAP(data.v[3], data.v[4]);
+			data.ll = swab64(data.ll);
 			break;
 		case E4:
-
-			SWAP(data.v[0], data.v[3]);
-			SWAP(data.v[1], data.v[2]);
-			SWAP(data.v[4], data.v[7]);
-			SWAP(data.v[5], data.v[6]);
+			data.w[0] = swab32(data.w[0]);
+			data.w[1] = swab32(data.w[1]);
 			break;
 		/* Its half word endian */
 		default:
-			SWAP(data.v[0], data.v[1]);
-			SWAP(data.v[2], data.v[3]);
-			SWAP(data.v[4], data.v[5]);
-			SWAP(data.v[6], data.v[7]);
+			data.h[0] = swab16(data.h[0]);
+			data.h[1] = swab16(data.h[1]);
+			data.h[2] = swab16(data.h[2]);
+			data.h[3] = swab16(data.h[3]);
 			break;
 		}
 	}
@@ -651,17 +652,38 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
 	int sw = 0;
 	int i, j;
 
+	/* userland only */
+	if (unlikely(!user_mode(regs)))
+		return 0;
+
 	flush_vsx_to_thread(current);
 
 	if (reg < 32)
-		ptr = (char *) &current->thread.TS_FPR(reg);
+		ptr = (char *) &current->thread.fp_state.fpr[reg][0];
 	else
-		ptr = (char *) &current->thread.vr[reg - 32];
+		ptr = (char *) &current->thread.vr_state.vr[reg - 32];
 
 	lptr = (unsigned long *) ptr;
 
+#ifdef __LITTLE_ENDIAN__
+	if (flags & SW) {
+		elsize = length;
+		sw = length-1;
+	} else {
+		/*
+		 * The elements are BE ordered, even in LE mode, so process
+		 * them in reverse order.
+		 */
+		addr += length - elsize;
+
+		/* 8 byte memory accesses go in the top 8 bytes of the VR */
+		if (length == 8)
+			ptr += 8;
+	}
+#else
 	if (flags & SW)
 		sw = elsize-1;
+#endif
 
 	for (j = 0; j < length; j += elsize) {
 		for (i = 0; i < elsize; ++i) {
@@ -671,19 +693,31 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
 				ret |= __get_user(ptr[i^sw], addr + i);
 		}
 		ptr  += elsize;
+#ifdef __LITTLE_ENDIAN__
+		addr -= elsize;
+#else
 		addr += elsize;
+#endif
 	}
 
+#ifdef __BIG_ENDIAN__
+#define VSX_HI 0
+#define VSX_LO 1
+#else
+#define VSX_HI 1
+#define VSX_LO 0
+#endif
+
 	if (!ret) {
 		if (flags & U)
 			regs->gpr[areg] = regs->dar;
 
 		/* Splat load copies the same data to top and bottom 8 bytes */
 		if (flags & SPLT)
-			lptr[1] = lptr[0];
-		/* For 8 byte loads, zero the top 8 bytes */
+			lptr[VSX_LO] = lptr[VSX_HI];
+		/* For 8 byte loads, zero the low 8 bytes */
 		else if (!(flags & ST) && (8 == length))
-			lptr[1] = 0;
+			lptr[VSX_LO] = 0;
 	} else
 		return -EFAULT;
 
@@ -706,18 +740,28 @@ int fix_alignment(struct pt_regs *regs)
 	unsigned int dsisr;
 	unsigned char __user *addr;
 	unsigned long p, swiz;
-	int ret, t;
-	union {
+	int ret, i;
+	union data {
 		u64 ll;
 		double dd;
 		unsigned char v[8];
 		struct {
+#ifdef __LITTLE_ENDIAN__
+			int	 low32;
+			unsigned hi32;
+#else
 			unsigned hi32;
 			int	 low32;
+#endif
 		} x32;
 		struct {
+#ifdef __LITTLE_ENDIAN__
+			short	      low16;
+			unsigned char hi48[6];
+#else
 			unsigned char hi48[6];
 			short	      low16;
+#endif
 		} x16;
 	} data;
 
@@ -764,10 +808,21 @@ int fix_alignment(struct pt_regs *regs)
 	nb = aligninfo[instr].len;
 	flags = aligninfo[instr].flags;
 
+	/* ldbrx/stdbrx overlap lfs/stfs in the DSISR unfortunately */
+	if (IS_XFORM(instruction) && ((instruction >> 1) & 0x3ff) == 532) {
+		nb = 8;
+		flags = LD+SW;
+	} else if (IS_XFORM(instruction) &&
+		   ((instruction >> 1) & 0x3ff) == 660) {
+		nb = 8;
+		flags = ST+SW;
+	}
+
 	/* Byteswap little endian loads and stores */
 	swiz = 0;
-	if (regs->msr & MSR_LE) {
+	if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
 		flags ^= SW;
+#ifdef __BIG_ENDIAN__
 		/*
 		 * So-called "PowerPC little endian" mode works by
 		 * swizzling addresses rather than by actually doing
@@ -780,6 +835,7 @@ int fix_alignment(struct pt_regs *regs)
 		 */
 		if (cpu_has_feature(CPU_FTR_PPC_LE))
 			swiz = 7;
+#endif
 	}
 
 	/* DAR has the operand effective address */
@@ -804,7 +860,7 @@ int fix_alignment(struct pt_regs *regs)
 			elsize = 8;
 
 		flags = 0;
-		if (regs->msr & MSR_LE)
+		if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE))
 			flags |= SW;
 		if (instruction & 0x100)
 			flags |= ST;
@@ -852,10 +908,20 @@ int fix_alignment(struct pt_regs *regs)
 		flush_fp_to_thread(current);
 	}
 
-	/* Special case for 16-byte FP loads and stores */
-	if (nb == 16) {
-		PPC_WARN_ALIGNMENT(fp_pair, regs);
-		return emulate_fp_pair(addr, reg, flags);
+	if ((nb == 16)) {
+		if (flags & F) {
+			/* Special case for 16-byte FP loads and stores */
+			PPC_WARN_ALIGNMENT(fp_pair, regs);
+			return emulate_fp_pair(addr, reg, flags);
+		} else {
+#ifdef CONFIG_PPC64
+			/* Special case for 16-byte loads and stores */
+			PPC_WARN_ALIGNMENT(lq_stq, regs);
+			return emulate_lq_stq(regs, addr, reg, flags);
+#else
+			return 0;
+#endif
+		}
 	}
 
 	PPC_WARN_ALIGNMENT(unaligned, regs);
@@ -864,32 +930,36 @@ int fix_alignment(struct pt_regs *regs)
 	 * get it from register values
 	 */
 	if (!(flags & ST)) {
-		data.ll = 0;
-		ret = 0;
-		p = (unsigned long) addr;
+		unsigned int start = 0;
+
 		switch (nb) {
-		case 8:
-			ret |= __get_user_inatomic(data.v[0], SWIZ_PTR(p++));
-			ret |= __get_user_inatomic(data.v[1], SWIZ_PTR(p++));
-			ret |= __get_user_inatomic(data.v[2], SWIZ_PTR(p++));
-			ret |= __get_user_inatomic(data.v[3], SWIZ_PTR(p++));
 		case 4:
-			ret |= __get_user_inatomic(data.v[4], SWIZ_PTR(p++));
-			ret |= __get_user_inatomic(data.v[5], SWIZ_PTR(p++));
+			start = offsetof(union data, x32.low32);
+			break;
 		case 2:
-			ret |= __get_user_inatomic(data.v[6], SWIZ_PTR(p++));
-			ret |= __get_user_inatomic(data.v[7], SWIZ_PTR(p++));
-			if (unlikely(ret))
-				return -EFAULT;
+			start = offsetof(union data, x16.low16);
+			break;
 		}
+
+		data.ll = 0;
+		ret = 0;
+		p = (unsigned long)addr;
+
+		for (i = 0; i < nb; i++)
+			ret |= __get_user_inatomic(data.v[start + i],
+						   SWIZ_PTR(p++));
+
+		if (unlikely(ret))
+			return -EFAULT;
+
 	} else if (flags & F) {
-		data.dd = current->thread.TS_FPR(reg);
+		data.ll = current->thread.TS_FPR(reg);
 		if (flags & S) {
 			/* Single-precision FP store requires conversion... */
 #ifdef CONFIG_PPC_FPU
 			preempt_disable();
 			enable_kernel_fp();
-			cvt_df(&data.dd, (float *)&data.v[4]);
+			cvt_df(&data.dd, (float *)&data.x32.low32);
 			preempt_enable();
 #else
 			return 0;
@@ -901,17 +971,13 @@ int fix_alignment(struct pt_regs *regs)
 	if (flags & SW) {
 		switch (nb) {
 		case 8:
-			SWAP(data.v[0], data.v[7]);
-			SWAP(data.v[1], data.v[6]);
-			SWAP(data.v[2], data.v[5]);
-			SWAP(data.v[3], data.v[4]);
+			data.ll = swab64(data.ll);
 			break;
 		case 4:
-			SWAP(data.v[4], data.v[7]);
-			SWAP(data.v[5], data.v[6]);
+			data.x32.low32 = swab32(data.x32.low32);
 			break;
 		case 2:
-			SWAP(data.v[6], data.v[7]);
+			data.x16.low16 = swab16(data.x16.low16);
 			break;
 		}
 	}
@@ -933,7 +999,7 @@ int fix_alignment(struct pt_regs *regs)
 #ifdef CONFIG_PPC_FPU
 		preempt_disable();
 		enable_kernel_fp();
-		cvt_fd((float *)&data.v[4], &data.dd);
+		cvt_fd((float *)&data.x32.low32, &data.dd);
 		preempt_enable();
 #else
 		return 0;
@@ -943,25 +1009,28 @@ int fix_alignment(struct pt_regs *regs)
 
 	/* Store result to memory or update registers */
 	if (flags & ST) {
-		ret = 0;
-		p = (unsigned long) addr;
+		unsigned int start = 0;
+
 		switch (nb) {
-		case 8:
-			ret |= __put_user_inatomic(data.v[0], SWIZ_PTR(p++));
-			ret |= __put_user_inatomic(data.v[1], SWIZ_PTR(p++));
-			ret |= __put_user_inatomic(data.v[2], SWIZ_PTR(p++));
-			ret |= __put_user_inatomic(data.v[3], SWIZ_PTR(p++));
 		case 4:
-			ret |= __put_user_inatomic(data.v[4], SWIZ_PTR(p++));
-			ret |= __put_user_inatomic(data.v[5], SWIZ_PTR(p++));
+			start = offsetof(union data, x32.low32);
+			break;
 		case 2:
-			ret |= __put_user_inatomic(data.v[6], SWIZ_PTR(p++));
-			ret |= __put_user_inatomic(data.v[7], SWIZ_PTR(p++));
+			start = offsetof(union data, x16.low16);
+			break;
 		}
+
+		ret = 0;
+		p = (unsigned long)addr;
+
+		for (i = 0; i < nb; i++)
+			ret |= __put_user_inatomic(data.v[start + i],
+						   SWIZ_PTR(p++));
+
 		if (unlikely(ret))
 			return -EFAULT;
 	} else if (flags & F)
-		current->thread.TS_FPR(reg) = data.dd;
+		current->thread.TS_FPR(reg) = data.ll;
 	else
 		regs->gpr[reg] = data.ll;
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 04caee7d9bc..f5995a91221 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -46,9 +46,6 @@
 #include <asm/hvcall.h>
 #include <asm/xics.h>
 #endif
-#ifdef CONFIG_PPC_ISERIES
-#include <asm/iseries/alpaca.h>
-#endif
 #ifdef CONFIG_PPC_POWERNV
 #include <asm/opal.h>
 #endif
@@ -57,6 +54,7 @@
 #endif
 #if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
 #include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
 #endif
 
 #ifdef CONFIG_PPC32
@@ -79,36 +77,37 @@ int main(void)
 	DEFINE(SIGSEGV, SIGSEGV);
 	DEFINE(NMI_MASK, NMI_MASK);
 	DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr));
+	DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit));
+	DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr));
 #else
 	DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
+	DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
+	DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
 #endif /* CONFIG_PPC64 */
 
 	DEFINE(KSP, offsetof(struct thread_struct, ksp));
-	DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
 	DEFINE(PT_REGS, offsetof(struct thread_struct, regs));
 #ifdef CONFIG_BOOKE
 	DEFINE(THREAD_NORMSAVES, offsetof(struct thread_struct, normsave[0]));
 #endif
 	DEFINE(THREAD_FPEXC_MODE, offsetof(struct thread_struct, fpexc_mode));
-	DEFINE(THREAD_FPR0, offsetof(struct thread_struct, fpr[0]));
-	DEFINE(THREAD_FPSCR, offsetof(struct thread_struct, fpscr));
+	DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state));
+	DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area));
+	DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr));
 #ifdef CONFIG_ALTIVEC
-	DEFINE(THREAD_VR0, offsetof(struct thread_struct, vr[0]));
+	DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state));
+	DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area));
 	DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
-	DEFINE(THREAD_VSCR, offsetof(struct thread_struct, vscr));
 	DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
+	DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr));
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_VSX
-	DEFINE(THREAD_VSR0, offsetof(struct thread_struct, fpr));
 	DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr));
 #endif /* CONFIG_VSX */
 #ifdef CONFIG_PPC64
 	DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
 #else /* CONFIG_PPC64 */
 	DEFINE(PGDIR, offsetof(struct thread_struct, pgdir));
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-	DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
-#endif
 #ifdef CONFIG_SPE
 	DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0]));
 	DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc));
@@ -116,9 +115,46 @@ int main(void)
 	DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
 #endif /* CONFIG_SPE */
 #endif /* CONFIG_PPC64 */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+	DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, debug.dbcr0));
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
 #endif
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	DEFINE(THREAD_TAR, offsetof(struct thread_struct, tar));
+	DEFINE(THREAD_BESCR, offsetof(struct thread_struct, bescr));
+	DEFINE(THREAD_EBBHR, offsetof(struct thread_struct, ebbhr));
+	DEFINE(THREAD_EBBRR, offsetof(struct thread_struct, ebbrr));
+	DEFINE(THREAD_SIAR, offsetof(struct thread_struct, siar));
+	DEFINE(THREAD_SDAR, offsetof(struct thread_struct, sdar));
+	DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier));
+	DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0));
+	DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2));
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
+	DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar));
+	DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr));
+	DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar));
+	DEFINE(THREAD_TM_TAR, offsetof(struct thread_struct, tm_tar));
+	DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr));
+	DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr));
+	DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
+	DEFINE(THREAD_TRANSACT_VRSTATE, offsetof(struct thread_struct,
+						 transact_vr));
+	DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct,
+					    transact_vrsave));
+	DEFINE(THREAD_TRANSACT_FPSTATE, offsetof(struct thread_struct,
+						 transact_fp));
+	/* Local pt_regs on stack for Transactional Memory funcs. */
+	DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
+	       sizeof(struct pt_regs) + 16);
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 	DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
@@ -147,7 +183,7 @@ int main(void)
 	DEFINE(PACAKBASE, offsetof(struct paca_struct, kernelbase));
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
-	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+	DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_PPC_MM_SLICES
 	DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
@@ -168,6 +204,15 @@ int main(void)
 	DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack));
 	DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack));
 	DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack));
+	DEFINE(PACA_TCD_PTR, offsetof(struct paca_struct, tcd_ptr));
+
+	DEFINE(TCD_ESEL_NEXT,
+		offsetof(struct tlb_core_data, esel_next));
+	DEFINE(TCD_ESEL_MAX,
+		offsetof(struct tlb_core_data, esel_max));
+	DEFINE(TCD_ESEL_FIRST,
+		offsetof(struct tlb_core_data, esel_first));
+	DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock));
 #endif /* CONFIG_PPC_BOOK3E */
 
 #ifdef CONFIG_PPC_STD_MMU_64
@@ -191,24 +236,26 @@ int main(void)
 	DEFINE(SLBSHADOW_STACKESID,
 	       offsetof(struct slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid));
 	DEFINE(SLBSHADOW_SAVEAREA, offsetof(struct slb_shadow, save_area));
-	DEFINE(LPPACASRR0, offsetof(struct lppaca, saved_srr0));
-	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
-	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
-	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
 	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
 	DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
+#ifdef CONFIG_PPC_BOOK3S_64
+	DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp));
+	DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce));
+#endif
 	DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
 	DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
+	DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default));
 	DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime));
 	DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user));
 	DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
 	DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
 	DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost));
+	DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso));
 #endif /* CONFIG_PPC64 */
 
 	/* RTAS */
@@ -384,40 +431,28 @@ int main(void)
 	DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
 #endif
 
-#ifdef CONFIG_PPC_ISERIES
-	/* the assembler miscalculates the VSID values */
-	DEFINE(PAGE_OFFSET_ESID, GET_ESID(PAGE_OFFSET));
-	DEFINE(PAGE_OFFSET_VSID, KERNEL_VSID(PAGE_OFFSET));
-	DEFINE(VMALLOC_START_ESID, GET_ESID(VMALLOC_START));
-	DEFINE(VMALLOC_START_VSID, KERNEL_VSID(VMALLOC_START));
-
-	/* alpaca */
-	DEFINE(ALPACA_SIZE, sizeof(struct alpaca));
-#endif
-
 	DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
 	DEFINE(PTE_SIZE, sizeof(pte_t));
 
 #ifdef CONFIG_KVM
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
+	DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
-	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
-	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fp.fpr));
 #ifdef CONFIG_ALTIVEC
-	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
-	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
-#endif
-#ifdef CONFIG_VSX
-	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr.vr));
 #endif
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
 	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+#ifdef CONFIG_PPC_BOOK3S
+	DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar));
+#endif
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
 	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
 	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
@@ -426,71 +461,128 @@ int main(void)
 	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
 	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
-	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
-	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
-	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
-	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
+	DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
+	DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
+	DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
+	DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
+	DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7));
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 	DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
 	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
 	DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+	DEFINE(VCPU_SHAREDBE, offsetof(struct kvm_vcpu, arch.shared_big_endian));
+#endif
 
-	/* book3s */
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0));
+	DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1));
+	DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2));
+	DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3));
+	DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
+	DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
+
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
 	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+
+	/* book3s */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
 	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
 	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
+	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
+	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
-	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
 	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
+	DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
 	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
 	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
 	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr));
 	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
 	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
+	DEFINE(VCPU_DABRX, offsetof(struct kvm_vcpu, arch.dabrx));
+	DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr));
+	DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx));
+	DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
 	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
 	DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
 	DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
-	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SPMC, offsetof(struct kvm_vcpu, arch.spmc));
+	DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar));
+	DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar));
+	DEFINE(VCPU_SIER, offsetof(struct kvm_vcpu, arch.sier));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
-	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
-	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
+	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
+	DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
+	DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr));
+	DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
+	DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr));
+	DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr));
+	DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr));
+	DEFINE(VCPU_CSIGR, offsetof(struct kvm_vcpu, arch.csigr));
+	DEFINE(VCPU_TACR, offsetof(struct kvm_vcpu, arch.tacr));
+	DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr));
+	DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
+	DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
+	DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
 	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
-	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
-			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
+	DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
+	DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
+	DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
+	DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
+	DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
 	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
 	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
+	DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
+	DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr));
+	DEFINE(VCPU_GPR_TM, offsetof(struct kvm_vcpu, arch.gpr_tm));
+	DEFINE(VCPU_FPRS_TM, offsetof(struct kvm_vcpu, arch.fp_tm.fpr));
+	DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr));
+	DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm));
+	DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm));
+	DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm));
+	DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm));
+	DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm));
+	DEFINE(VCPU_PPR_TM, offsetof(struct kvm_vcpu, arch.ppr_tm));
+	DEFINE(VCPU_DSCR_TM, offsetof(struct kvm_vcpu, arch.dscr_tm));
+	DEFINE(VCPU_TAR_TM, offsetof(struct kvm_vcpu, arch.tar_tm));
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
 #else
 # define SVCPU_FIELD(x, f)
@@ -530,6 +622,7 @@ int main(void)
 #ifdef CONFIG_PPC64
 	SVCPU_FIELD(SVCPU_SLB, slb);
 	SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+	SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr);
 #endif
 
 	HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
@@ -538,14 +631,20 @@ int main(void)
 	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
 	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
 	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+	HSTATE_FIELD(HSTATE_SCRATCH2, scratch2);
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 	HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
 	HSTATE_FIELD(HSTATE_NAPPING, napping);
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
+	HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
 	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+	HSTATE_FIELD(HSTATE_PTID, ptid);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
@@ -554,7 +653,13 @@ int main(void)
 	HSTATE_FIELD(HSTATE_DABR, dabr);
 	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
 	DEFINE(IPI_PRIORITY, IPI_PRIORITY);
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	HSTATE_FIELD(HSTATE_CFAR, cfar);
+	HSTATE_FIELD(HSTATE_PPR, ppr);
+	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
@@ -565,6 +670,7 @@ int main(void)
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+	DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save));
 #endif /* CONFIG_PPC_BOOK3S */
 #endif /* CONFIG_KVM */
 
@@ -603,6 +709,12 @@ int main(void)
 	DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4));
+	DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6));
+	DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
 						arch.timing_exit.tv32.tbu));
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
index ac8f52732fd..41c011cb607 100644
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -25,11 +25,6 @@
 static void scrollscreen(void);
 #endif
 
-static void draw_byte(unsigned char c, long locX, long locY);
-static void draw_byte_32(unsigned char *bits, unsigned int *base, int rb);
-static void draw_byte_16(unsigned char *bits, unsigned int *base, int rb);
-static void draw_byte_8(unsigned char *bits, unsigned int *base, int rb);
-
 #define __force_data __attribute__((__section__(".data")))
 
 static int g_loc_X __force_data;
@@ -52,6 +47,26 @@ static unsigned char vga_font[cmapsz];
 int boot_text_mapped __force_data = 0;
 int force_printk_to_btext = 0;
 
+extern void rmci_on(void);
+extern void rmci_off(void);
+
+static inline void rmci_maybe_on(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64)
+	if (!(mfmsr() & MSR_DR))
+		rmci_on();
+#endif
+}
+
+static inline void rmci_maybe_off(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64)
+	if (!(mfmsr() & MSR_DR))
+		rmci_off();
+#endif
+}
+
+
 #ifdef CONFIG_PPC32
 /* Calc BAT values for mapping the display and store them
  * in disp_BAT.  Those values are then used from head.S to map
@@ -134,7 +149,7 @@ void __init btext_unmap(void)
  *    changes.
  */
 
-static void map_boot_text(void)
+void btext_map(void)
 {
 	unsigned long base, offset, size;
 	unsigned char *vbase;
@@ -209,7 +224,7 @@ int btext_initialize(struct device_node *np)
 	dispDeviceRect[2] = width;
 	dispDeviceRect[3] = height;
 
-	map_boot_text();
+	btext_map();
 
 	return 0;
 }
@@ -283,7 +298,7 @@ void btext_update_display(unsigned long phys, int width, int height,
 		iounmap(logicalDisplayBase);
 		boot_text_mapped = 0;
 	}
-	map_boot_text();
+	btext_map();
 	g_loc_X = 0;
 	g_loc_Y = 0;
 	g_max_loc_X = width / 8;
@@ -298,6 +313,7 @@ void btext_clearscreen(void)
 					(dispDeviceDepth >> 3)) >> 2;
 	int i,j;
 
+	rmci_maybe_on();
 	for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1]); i++)
 	{
 		unsigned int *ptr = base;
@@ -305,6 +321,7 @@ void btext_clearscreen(void)
 			*(ptr++) = 0;
 		base += (dispDeviceRowBytes >> 2);
 	}
+	rmci_maybe_off();
 }
 
 void btext_flushscreen(void)
@@ -355,6 +372,8 @@ static void scrollscreen(void)
 				   (dispDeviceDepth >> 3)) >> 2;
 	int i,j;
 
+	rmci_maybe_on();
+
 	for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++)
 	{
 		unsigned int *src_ptr = src;
@@ -371,9 +390,116 @@ static void scrollscreen(void)
 			*(dst_ptr++) = 0;
 		dst += (dispDeviceRowBytes >> 2);
 	}
+
+	rmci_maybe_off();
 }
 #endif /* ndef NO_SCROLL */
 
+static unsigned int expand_bits_8[16] = {
+	0x00000000,
+	0x000000ff,
+	0x0000ff00,
+	0x0000ffff,
+	0x00ff0000,
+	0x00ff00ff,
+	0x00ffff00,
+	0x00ffffff,
+	0xff000000,
+	0xff0000ff,
+	0xff00ff00,
+	0xff00ffff,
+	0xffff0000,
+	0xffff00ff,
+	0xffffff00,
+	0xffffffff
+};
+
+static unsigned int expand_bits_16[4] = {
+	0x00000000,
+	0x0000ffff,
+	0xffff0000,
+	0xffffffff
+};
+
+
+static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0xFFFFFFFFUL;
+	int bg = 0x00000000UL;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (-(bits >> 7) & fg) ^ bg;
+		base[1] = (-((bits >> 6) & 1) & fg) ^ bg;
+		base[2] = (-((bits >> 5) & 1) & fg) ^ bg;
+		base[3] = (-((bits >> 4) & 1) & fg) ^ bg;
+		base[4] = (-((bits >> 3) & 1) & fg) ^ bg;
+		base[5] = (-((bits >> 2) & 1) & fg) ^ bg;
+		base[6] = (-((bits >> 1) & 1) & fg) ^ bg;
+		base[7] = (-(bits & 1) & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0xFFFFFFFFUL;
+	int bg = 0x00000000UL;
+	unsigned int *eb = (int *)expand_bits_16;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (eb[bits >> 6] & fg) ^ bg;
+		base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg;
+		base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg;
+		base[3] = (eb[bits & 3] & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0x0F0F0F0FUL;
+	int bg = 0x00000000UL;
+	unsigned int *eb = (int *)expand_bits_8;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (eb[bits >> 4] & fg) ^ bg;
+		base[1] = (eb[bits & 0xf] & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static noinline void draw_byte(unsigned char c, long locX, long locY)
+{
+	unsigned char *base	= calc_base(locX << 3, locY << 4);
+	unsigned char *font	= &vga_font[((unsigned int)c) * 16];
+	int rb			= dispDeviceRowBytes;
+
+	rmci_maybe_on();
+	switch(dispDeviceDepth) {
+	case 24:
+	case 32:
+		draw_byte_32(font, (unsigned int *)base, rb);
+		break;
+	case 15:
+	case 16:
+		draw_byte_16(font, (unsigned int *)base, rb);
+		break;
+	case 8:
+		draw_byte_8(font, (unsigned int *)base, rb);
+		break;
+	}
+	rmci_maybe_off();
+}
+
 void btext_drawchar(char c)
 {
 	int cline = 0;
@@ -465,107 +591,12 @@ void btext_drawhex(unsigned long v)
 	btext_drawchar(' ');
 }
 
-static void draw_byte(unsigned char c, long locX, long locY)
-{
-	unsigned char *base	= calc_base(locX << 3, locY << 4);
-	unsigned char *font	= &vga_font[((unsigned int)c) * 16];
-	int rb			= dispDeviceRowBytes;
-
-	switch(dispDeviceDepth) {
-	case 24:
-	case 32:
-		draw_byte_32(font, (unsigned int *)base, rb);
-		break;
-	case 15:
-	case 16:
-		draw_byte_16(font, (unsigned int *)base, rb);
-		break;
-	case 8:
-		draw_byte_8(font, (unsigned int *)base, rb);
-		break;
-	}
-}
-
-static unsigned int expand_bits_8[16] = {
-	0x00000000,
-	0x000000ff,
-	0x0000ff00,
-	0x0000ffff,
-	0x00ff0000,
-	0x00ff00ff,
-	0x00ffff00,
-	0x00ffffff,
-	0xff000000,
-	0xff0000ff,
-	0xff00ff00,
-	0xff00ffff,
-	0xffff0000,
-	0xffff00ff,
-	0xffffff00,
-	0xffffffff
-};
-
-static unsigned int expand_bits_16[4] = {
-	0x00000000,
-	0x0000ffff,
-	0xffff0000,
-	0xffffffff
-};
-
-
-static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
-{
-	int l, bits;
-	int fg = 0xFFFFFFFFUL;
-	int bg = 0x00000000UL;
-
-	for (l = 0; l < 16; ++l)
-	{
-		bits = *font++;
-		base[0] = (-(bits >> 7) & fg) ^ bg;
-		base[1] = (-((bits >> 6) & 1) & fg) ^ bg;
-		base[2] = (-((bits >> 5) & 1) & fg) ^ bg;
-		base[3] = (-((bits >> 4) & 1) & fg) ^ bg;
-		base[4] = (-((bits >> 3) & 1) & fg) ^ bg;
-		base[5] = (-((bits >> 2) & 1) & fg) ^ bg;
-		base[6] = (-((bits >> 1) & 1) & fg) ^ bg;
-		base[7] = (-(bits & 1) & fg) ^ bg;
-		base = (unsigned int *) ((char *)base + rb);
-	}
-}
-
-static void draw_byte_16(unsigned char *font, unsigned int *base, int rb)
-{
-	int l, bits;
-	int fg = 0xFFFFFFFFUL;
-	int bg = 0x00000000UL;
-	unsigned int *eb = (int *)expand_bits_16;
-
-	for (l = 0; l < 16; ++l)
-	{
-		bits = *font++;
-		base[0] = (eb[bits >> 6] & fg) ^ bg;
-		base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg;
-		base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg;
-		base[3] = (eb[bits & 3] & fg) ^ bg;
-		base = (unsigned int *) ((char *)base + rb);
-	}
-}
-
-static void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+void __init udbg_init_btext(void)
 {
-	int l, bits;
-	int fg = 0x0F0F0F0FUL;
-	int bg = 0x00000000UL;
-	unsigned int *eb = (int *)expand_bits_8;
-
-	for (l = 0; l < 16; ++l)
-	{
-		bits = *font++;
-		base[0] = (eb[bits >> 4] & fg) ^ bg;
-		base[1] = (eb[bits & 0xf] & fg) ^ bg;
-		base = (unsigned int *) ((char *)base + rb);
-	}
+	/* If btext is enabled, we might have a BAT setup for early display,
+	 * thus we do enable some very basic udbg output
+	 */
+	udbg_putc = btext_drawchar;
 }
 
 static unsigned char vga_font[cmapsz] = {
@@ -913,10 +944,3 @@ static unsigned char vga_font[cmapsz] = {
 0x00, 0x00, 0x00, 0x00,
 };
 
-void __init udbg_init_btext(void)
-{
-	/* If btext is enabled, we might have a BAT setup for early display,
-	 * thus we do enable some very basic udbg output
-	 */
-	udbg_putc = btext_drawchar;
-}
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 92c6b008dd2..40198d50b4c 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -12,7 +12,6 @@
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
@@ -131,7 +130,8 @@ static const char *cache_type_string(const struct cache *cache)
 	return cache_type_info[cache->type].name;
 }
 
-static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode)
+static void cache_init(struct cache *cache, int type, int level,
+		       struct device_node *ofnode)
 {
 	cache->type = type;
 	cache->level = level;
@@ -140,7 +140,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc
 	list_add(&cache->list, &cache_list);
 }
 
-static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level, struct device_node *ofnode)
 {
 	struct cache *cache;
 
@@ -195,7 +195,7 @@ static void cache_cpu_set(struct cache *cache, int cpu)
 static int cache_size(const struct cache *cache, unsigned int *ret)
 {
 	const char *propname;
-	const u32 *cache_size;
+	const __be32 *cache_size;
 
 	propname = cache_type_info[cache->type].size_prop;
 
@@ -203,7 +203,7 @@ static int cache_size(const struct cache *cache, unsigned int *ret)
 	if (!cache_size)
 		return -ENODEV;
 
-	*ret = *cache_size;
+	*ret = of_read_number(cache_size, 1);
 	return 0;
 }
 
@@ -221,7 +221,7 @@ static int cache_size_kb(const struct cache *cache, unsigned int *ret)
 /* not cache_line_size() because that's a macro in include/linux/cache.h */
 static int cache_get_line_size(const struct cache *cache, unsigned int *ret)
 {
-	const u32 *line_size;
+	const __be32 *line_size;
 	int i, lim;
 
 	lim = ARRAY_SIZE(cache_type_info[cache->type].line_size_props);
@@ -238,14 +238,14 @@ static int cache_get_line_size(const struct cache *cache, unsigned int *ret)
 	if (!line_size)
 		return -ENODEV;
 
-	*ret = *line_size;
+	*ret = of_read_number(line_size, 1);
 	return 0;
 }
 
 static int cache_nr_sets(const struct cache *cache, unsigned int *ret)
 {
 	const char *propname;
-	const u32 *nr_sets;
+	const __be32 *nr_sets;
 
 	propname = cache_type_info[cache->type].nr_sets_prop;
 
@@ -253,7 +253,7 @@ static int cache_nr_sets(const struct cache *cache, unsigned int *ret)
 	if (!nr_sets)
 		return -ENODEV;
 
-	*ret = *nr_sets;
+	*ret = of_read_number(nr_sets, 1);
 	return 0;
 }
 
@@ -324,7 +324,8 @@ static bool cache_node_is_unified(const struct device_node *np)
 	return of_get_property(np, "cache-unified", NULL);
 }
 
-static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node,
+						  int level)
 {
 	struct cache *cache;
 
@@ -335,7 +336,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *
 	return cache;
 }
 
-static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_split(struct device_node *node,
+						int level)
 {
 	struct cache *dcache, *icache;
 
@@ -357,7 +359,7 @@ err:
 	return NULL;
 }
 
-static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int level)
 {
 	struct cache *cache;
 
@@ -369,7 +371,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in
 	return cache;
 }
 
-static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level)
+static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+						 int level)
 {
 	struct cache *cache;
 
@@ -385,7 +388,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n
 	return cache;
 }
 
-static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger)
+static void link_cache_lists(struct cache *smaller, struct cache *bigger)
 {
 	while (smaller->next_local) {
 		if (smaller->next_local == bigger)
@@ -396,13 +399,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg
 	smaller->next_local = bigger;
 }
 
-static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache)
+static void do_subsidiary_caches_debugcheck(struct cache *cache)
 {
 	WARN_ON_ONCE(cache->level != 1);
 	WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
 }
 
-static void __cpuinit do_subsidiary_caches(struct cache *cache)
+static void do_subsidiary_caches(struct cache *cache)
 {
 	struct device_node *subcache_node;
 	int level = cache->level;
@@ -423,7 +426,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache)
 	}
 }
 
-static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id)
+static struct cache *cache_chain_instantiate(unsigned int cpu_id)
 {
 	struct device_node *cpu_node;
 	struct cache *cpu_cache = NULL;
@@ -448,7 +451,7 @@ out:
 	return cpu_cache;
 }
 
-static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
+static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
 {
 	struct cache_dir *cache_dir;
 	struct device *dev;
@@ -653,7 +656,7 @@ static struct kobj_type cache_index_type = {
 	.default_attrs = cache_index_default_attrs,
 };
 
-static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
+static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
 {
 	const char *cache_name;
 	const char *cache_type;
@@ -696,7 +699,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d
 	kfree(buf);
 }
 
-static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir)
+static void cacheinfo_create_index_dir(struct cache *cache, int index,
+				       struct cache_dir *cache_dir)
 {
 	struct cache_index_dir *index_dir;
 	int rc;
@@ -722,7 +726,8 @@ err:
 	kfree(index_dir);
 }
 
-static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list)
+static void cacheinfo_sysfs_populate(unsigned int cpu_id,
+				     struct cache *cache_list)
 {
 	struct cache_dir *cache_dir;
 	struct cache *cache;
@@ -740,7 +745,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache
 	}
 }
 
-void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
+void cacheinfo_cpu_online(unsigned int cpu_id)
 {
 	struct cache *cache;
 
@@ -751,7 +756,10 @@ void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
 	cacheinfo_sysfs_populate(cpu_id, cache);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU /* functions needed for cpu offline */
+/* functions needed to remove cache entry for cpu offline or suspend/resume */
+
+#if (defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SUSPEND)) || \
+    defined(CONFIG_HOTPLUG_CPU)
 
 static struct cache *cache_lookup_by_cpu(unsigned int cpu_id)
 {
@@ -788,6 +796,9 @@ static void remove_cache_dir(struct cache_dir *cache_dir)
 {
 	remove_index_dirs(cache_dir);
 
+	/* Remove cache dir from sysfs */
+	kobject_del(cache_dir->kobj);
+
 	kobject_put(cache_dir->kobj);
 
 	kfree(cache_dir);
@@ -835,4 +846,4 @@ void cacheinfo_cpu_offline(unsigned int cpu_id)
 	if (cache)
 		cache_cpu_clear(cache, cpu_id);
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* (CONFIG_PPC_PSERIES && CONFIG_SUSPEND) || CONFIG_HOTPLUG_CPU */
diff --git a/arch/powerpc/kernel/clock.c b/arch/powerpc/kernel/clock.c
deleted file mode 100644
index a764b47791e..00000000000
--- a/arch/powerpc/kernel/clock.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Dummy clk implementations for powerpc.
- * These need to be overridden in platform code.
- */
-
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <asm/clk_interface.h>
-
-struct clk_interface clk_functions;
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
-	if (clk_functions.clk_get)
-		return clk_functions.clk_get(dev, id);
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-	if (clk_functions.clk_put)
-		clk_functions.clk_put(clk);
-}
-EXPORT_SYMBOL(clk_put);
-
-int clk_enable(struct clk *clk)
-{
-	if (clk_functions.clk_enable)
-		return clk_functions.clk_enable(clk);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_enable);
-
-void clk_disable(struct clk *clk)
-{
-	if (clk_functions.clk_disable)
-		clk_functions.clk_disable(clk);
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
-	if (clk_functions.clk_get_rate)
-		return clk_functions.clk_get_rate(clk);
-	return 0;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-long clk_round_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk_functions.clk_round_rate)
-		return clk_functions.clk_round_rate(clk, rate);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_round_rate);
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk_functions.clk_set_rate)
-		return clk_functions.clk_set_rate(clk, rate);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-struct clk *clk_get_parent(struct clk *clk)
-{
-	if (clk_functions.clk_get_parent)
-		return clk_functions.clk_get_parent(clk);
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get_parent);
-
-int clk_set_parent(struct clk *clk, struct clk *parent)
-{
-	if (clk_functions.clk_set_parent)
-		return clk_functions.clk_set_parent(clk, parent);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_parent);
diff --git a/arch/powerpc/kernel/cpu_setup_a2.S b/arch/powerpc/kernel/cpu_setup_a2.S
deleted file mode 100644
index ebc62f42a23..00000000000
--- a/arch/powerpc/kernel/cpu_setup_a2.S
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- *  A2 specific assembly support code
- *
- *  Copyright 2009 Ben Herrenschmidt, IBM Corp.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <asm/asm-offsets.h>
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-#include <asm/processor.h>
-#include <asm/reg_a2.h>
-#include <asm/reg.h>
-#include <asm/thread_info.h>
-
-/*
- * Disable thdid and class fields in ERATs to bump PID to full 14 bits capacity.
- * This also prevents external LPID accesses but that isn't a problem when not a
- * guest. Under PV, this setting will be ignored and MMUCR will return the right
- * number of PID bits we can use.
- */
-#define MMUCR1_EXTEND_PID \
-	(MMUCR1_ICTID | MMUCR1_ITTID | MMUCR1_DCTID | \
-	 MMUCR1_DTTID | MMUCR1_DCCD)
-
-/*
- * Use extended PIDs if enabled.
- * Don't clear the ERATs on context sync events and enable I & D LRU.
- * Enable ERAT back invalidate when tlbwe overwrites an entry.
- */
-#define INITIAL_MMUCR1 \
-	(MMUCR1_EXTEND_PID | MMUCR1_CSINV_NEVER | MMUCR1_IRRE | \
-	 MMUCR1_DRRE | MMUCR1_TLBWE_BINV)
-
-_GLOBAL(__setup_cpu_a2)
-	/* Some of these are actually thread local and some are
-	 * core local but doing it always won't hurt
-	 */
-
-#ifdef CONFIG_PPC_ICSWX
-	/* Make sure ACOP starts out as zero */
-	li	r3,0
-	mtspr   SPRN_ACOP,r3
-
-	/* Skip the following if we are in Guest mode */
-	mfmsr	r3
-	andis.	r0,r3,MSR_GS@h
-	bne	_icswx_skip_guest
-
-	/* Enable icswx instruction */
-	mfspr   r3,SPRN_A2_CCR2
-	ori     r3,r3,A2_CCR2_ENABLE_ICSWX
-	mtspr   SPRN_A2_CCR2,r3
-
-	/* Unmask all CTs in HACOP */
-	li      r3,-1
-	mtspr   SPRN_HACOP,r3
-_icswx_skip_guest:
-#endif /* CONFIG_PPC_ICSWX */
-
-	/* Enable doorbell */
-	mfspr   r3,SPRN_A2_CCR2
-	oris     r3,r3,A2_CCR2_ENABLE_PC@h
-	mtspr   SPRN_A2_CCR2,r3
-	isync
-
-	/* Setup CCR0 to disable power saving for now as it's busted
-	 * in the current implementations. Setup CCR1 to wake on
-	 * interrupts normally (we write the default value but who
-	 * knows what FW may have clobbered...)
-	 */
-	li	r3,0
-	mtspr	SPRN_A2_CCR0, r3
-	LOAD_REG_IMMEDIATE(r3,0x0f0f0f0f)
-	mtspr	SPRN_A2_CCR1, r3
-
-	/* Initialise MMUCR1 */
-	lis	r3,INITIAL_MMUCR1@h
-	ori	r3,r3,INITIAL_MMUCR1@l
-	mtspr	SPRN_MMUCR1,r3
-
-	/* Set MMUCR2 to enable 4K, 64K, 1M, 16M and 1G pages */
-	LOAD_REG_IMMEDIATE(r3, 0x000a7531)
-	mtspr	SPRN_MMUCR2,r3
-
-	/* Set MMUCR3 to write all thids bit to the TLB */
-	LOAD_REG_IMMEDIATE(r3, 0x0000000f)
-	mtspr	SPRN_MMUCR3,r3
-
-	/* Don't do ERAT stuff if running guest mode */
-	mfmsr	r3
-	andis.	r0,r3,MSR_GS@h
-	bne	1f
-
-	/* Now set the I-ERAT watermark to 15 */
-	lis	r4,(MMUCR0_TLBSEL_I|MMUCR0_ECL)@h
-	mtspr	SPRN_MMUCR0, r4
-	li	r4,A2_IERAT_SIZE-1
-	PPC_ERATWE(r4,r4,3)
-
-	/* Now set the D-ERAT watermark to 31 */
-	lis	r4,(MMUCR0_TLBSEL_D|MMUCR0_ECL)@h
-	mtspr	SPRN_MMUCR0, r4
-	li	r4,A2_DERAT_SIZE-1
-	PPC_ERATWE(r4,r4,3)
-
-	/* And invalidate the beast just in case. That won't get rid of
-	 * a bolted entry though it will be in LRU and so will go away eventually
-	 * but let's not bother for now
-	 */
-	PPC_ERATILX(0,0,0)
-1:
-	blr
-
-_GLOBAL(__restore_cpu_a2)
-	b	__setup_cpu_a2
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index 8053db02b85..4f1393d2007 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -16,6 +16,8 @@
 #include <asm/processor.h>
 #include <asm/cputable.h>
 #include <asm/ppc_asm.h>
+#include <asm/mmu-book3e.h>
+#include <asm/asm-offsets.h>
 
 _GLOBAL(__e500_icache_setup)
 	mfspr	r0, SPRN_L1CSR1
@@ -51,6 +53,61 @@ _GLOBAL(__e500_dcache_setup)
 	isync
 	blr
 
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for PW20_WAIT_IDLE_BIT.
+ */
+#define PW20_WAIT_IDLE_BIT		50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_pw20_idle)
+	mfspr	r3, SPRN_PWRMGTCR0
+
+	/* Set PW20_WAIT bit, enable pw20 state*/
+	ori	r3, r3, PWRMGTCR0_PW20_WAIT
+	li	r11, PW20_WAIT_IDLE_BIT
+
+	/* Set Automatic PW20 Core Idle Count */
+	rlwimi	r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT
+
+	mtspr	SPRN_PWRMGTCR0, r3
+
+	blr
+
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for AV_WAIT_IDLE_BIT.
+ */
+#define AV_WAIT_IDLE_BIT		50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_altivec_idle)
+	mfspr	r3, SPRN_PWRMGTCR0
+
+	/* Enable Altivec Idle */
+	oris	r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h
+	li	r11, AV_WAIT_IDLE_BIT
+
+	/* Set Automatic AltiVec Idle Count */
+	rlwimi	r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT
+
+	mtspr	SPRN_PWRMGTCR0, r3
+
+	blr
+
+_GLOBAL(__setup_cpu_e6500)
+	mflr	r6
+#ifdef CONFIG_PPC64
+	bl	setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_lrat_ivor
+1:
+#endif
+	bl	setup_pw20_idle
+	bl	setup_altivec_idle
+	bl	__setup_cpu_e5500
+	mtlr	r6
+	blr
+
 #ifdef CONFIG_PPC32
 _GLOBAL(__setup_cpu_e200)
 	/* enable dedicated debug exception handling resources (Debug APU) */
@@ -64,7 +121,7 @@ _GLOBAL(__setup_cpu_e500v2)
 	bl	__e500_icache_setup
 	bl	__e500_dcache_setup
 	bl	__setup_e500_ivors
-#ifdef CONFIG_FSL_RIO
+#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI)
 	/* Ensure that RFXE is set */
 	mfspr	r3,SPRN_HID1
 	oris	r3,r3,HID1_RFXE@h
@@ -73,26 +130,96 @@ _GLOBAL(__setup_cpu_e500v2)
 	mtlr	r4
 	blr
 _GLOBAL(__setup_cpu_e500mc)
-	mflr	r4
+_GLOBAL(__setup_cpu_e5500)
+	mflr	r5
 	bl	__e500_icache_setup
 	bl	__e500_dcache_setup
 	bl	__setup_e500mc_ivors
-	mtlr	r4
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r3, SPRN_MMUCFG
+	rlwinm.	r3, r3, 0, MMUCFG_LPIDSIZE
+	beq	1f
+	bl	__setup_ehv_ivors
+	b	2f
+1:
+	lwz	r3, CPU_SPEC_FEATURES(r4)
+	/* We need this check as cpu_setup is also called for
+	 * the secondary cores. So, if we have already cleared
+	 * the feature on the primary core, avoid doing it on the
+	 * secondary core.
+	 */
+	andis.	r6, r3, CPU_FTR_EMB_HV@h
+	beq	2f
+	rlwinm	r3, r3, 0, ~CPU_FTR_EMB_HV
+	stw	r3, CPU_SPEC_FEATURES(r4)
+2:
+	mtlr	r5
 	blr
 #endif
-/* Right now, restore and setup are the same thing */
+
+#ifdef CONFIG_PPC_BOOK3E_64
+_GLOBAL(__restore_cpu_e6500)
+	mflr	r5
+	bl	setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_lrat_ivor
+1:
+	bl	setup_pw20_idle
+	bl	setup_altivec_idle
+	bl	__restore_cpu_e5500
+	mtlr	r5
+	blr
+
 _GLOBAL(__restore_cpu_e5500)
-_GLOBAL(__setup_cpu_e5500)
 	mflr	r4
 	bl	__e500_icache_setup
 	bl	__e500_dcache_setup
-#ifdef CONFIG_PPC_BOOK3E_64
-	bl	.__setup_base_ivors
-	bl	.setup_perfmon_ivor
-	bl	.setup_doorbell_ivors
-	bl	.setup_ehv_ivors
-#else
-	bl	__setup_e500mc_ivors
-#endif
+	bl	__setup_base_ivors
+	bl	setup_perfmon_ivor
+	bl	setup_doorbell_ivors
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_ehv_ivors
+1:
 	mtlr	r4
 	blr
+
+_GLOBAL(__setup_cpu_e5500)
+	mflr	r5
+	bl	__e500_icache_setup
+	bl	__e500_dcache_setup
+	bl	__setup_base_ivors
+	bl	setup_perfmon_ivor
+	bl	setup_doorbell_ivors
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_ehv_ivors
+	b	2f
+1:
+	ld	r10,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r9,CPU_FTR_EMB_HV)
+	andc	r10,r10,r9
+	std	r10,CPU_SPEC_FEATURES(r4)
+2:
+	mtlr	r5
+	blr
+#endif
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power.S
index 76797c5105d..46733535cc0 100644
--- a/arch/powerpc/kernel/cpu_setup_power7.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -27,8 +27,9 @@ _GLOBAL(__setup_cpu_power7)
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	mfspr	r3,SPRN_LPCR
 	bl	__init_LPCR
-	bl	__init_TLB
+	bl	__init_tlb_power7
 	mtlr	r11
 	blr
 
@@ -39,8 +40,46 @@ _GLOBAL(__restore_cpu_power7)
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	mfspr	r3,SPRN_LPCR
+	bl	__init_LPCR
+	bl	__init_tlb_power7
+	mtlr	r11
+	blr
+
+_GLOBAL(__setup_cpu_power8)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_PMU
+	bl	__init_hvmode_206
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mfspr	r3,SPRN_LPCR
+	ori	r3, r3, LPCR_PECEDH
+	bl	__init_LPCR
+	bl	__init_HFSCR
+	bl	__init_tlb_power8
+	bl	__init_PMU_HV
+	mtlr	r11
+	blr
+
+_GLOBAL(__restore_cpu_power8)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_PMU
+	mfmsr	r3
+	rldicl.	r0,r3,4,63
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mfspr   r3,SPRN_LPCR
+	ori	r3, r3, LPCR_PECEDH
 	bl	__init_LPCR
-	bl	__init_TLB
+	bl	__init_HFSCR
+	bl	__init_tlb_power8
+	bl	__init_PMU_HV
 	mtlr	r11
 	blr
 
@@ -57,6 +96,7 @@ __init_hvmode_206:
 
 __init_LPCR:
 	/* Setup a sane LPCR:
+	 *   Called with initial LPCR in R3
 	 *
 	 *   LPES = 0b01 (HSRR0/1 used for 0x500)
 	 *   PECE = 0b111
@@ -67,7 +107,6 @@ __init_LPCR:
 	 *
 	 * Other bits untouched for now
 	 */
-	mfspr	r3,SPRN_LPCR
 	li	r5,1
 	rldimi	r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
 	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
@@ -82,14 +121,62 @@ __init_LPCR:
 	isync
 	blr
 
-__init_TLB:
-	/* Clear the TLB */
+__init_FSCR:
+	mfspr	r3,SPRN_FSCR
+	ori	r3,r3,FSCR_TAR|FSCR_DSCR|FSCR_EBB
+	mtspr	SPRN_FSCR,r3
+	blr
+
+__init_HFSCR:
+	mfspr	r3,SPRN_HFSCR
+	ori	r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
+		      HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB
+	mtspr	SPRN_HFSCR,r3
+	blr
+
+/*
+ * Clear the TLB using the specified IS form of tlbiel instruction
+ * (invalidate by congruence class). P7 has 128 CCs., P8 has 512.
+ *
+ * r3 = IS field
+ */
+__init_tlb_power7:
+	li	r3,0xc00	/* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power7)
 	li	r6,128
 	mtctr	r6
-	li	r7,0xc00	/* IS field = 0b11 */
+	mr	r7,r3		/* IS field */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:	blr
+
+__init_tlb_power8:
+	li	r3,0xc00	/* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power8)
+	li	r6,512
+	mtctr	r6
+	mr	r7,r3		/* IS field */
 	ptesync
 2:	tlbiel	r7
 	addi	r7,r7,0x1000
 	bdnz	2b
 	ptesync
 1:	blr
+
+__init_PMU_HV:
+	li	r5,0
+	mtspr	SPRN_MMCRC,r5
+	mtspr	SPRN_MMCRH,r5
+	blr
+
+__init_PMU:
+	li	r5,0
+	mtspr	SPRN_MMCRS,r5
+	mtspr	SPRN_MMCRA,r5
+	mtspr	SPRN_MMCR0,r5
+	mtspr	SPRN_MMCR1,r5
+	mtspr	SPRN_MMCR2,r5
+	blr
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 81db9e2a8a2..0c157642c2a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -20,6 +20,7 @@
 #include <asm/cputable.h>
 #include <asm/prom.h>		/* for PTRRELOC on ARCH=ppc */
 #include <asm/mmu.h>
+#include <asm/setup.h>
 
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
@@ -67,11 +68,19 @@ extern void __restore_cpu_pa6t(void);
 extern void __restore_cpu_ppc970(void);
 extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power7(void);
+extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_power8(void);
 extern void __restore_cpu_a2(void);
+extern void __flush_tlb_power7(unsigned long inval_selector);
+extern void __flush_tlb_power8(unsigned long inval_selector);
+extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_e5500(void);
+extern void __restore_cpu_e6500(void);
 #endif /* CONFIG_E500 */
 
 /* This table only contains "desktop" CPUs, it need to be filled with embedded
@@ -93,6 +102,15 @@ extern void __restore_cpu_e5500(void);
 				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER7	(PPC_FEATURE2_DSCR)
+#define COMMON_USER_POWER8	(COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\
+				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
+				 PPC_FEATURE_TRUE_LE | \
+				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER8	(PPC_FEATURE2_ARCH_2_07 | \
+				 PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \
+				 PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \
+				 PPC_FEATURE2_VEC_CRYPTO)
 #define COMMON_USER_PA6T	(COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_HAS_ALTIVEC_COMP)
@@ -268,7 +286,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC970,
 		.cpu_user_features	= COMMON_USER_POWER4 |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
-		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.mmu_features		= MMU_FTRS_PPC970,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -419,6 +437,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7 (architected)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
 		.mmu_features		= MMU_FTRS_POWER7,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -426,14 +445,35 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7",
 	},
+	{	/* 2.07-compliant processor, i.e. Power8 "architected" mode */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x0f000004,
+		.cpu_name		= "POWER8 (architected)",
+		.cpu_features		= CPU_FTRS_POWER8,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
 	{	/* Power7 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x003f0000,
 		.cpu_name		= "POWER7 (raw)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
 		.mmu_features		= MMU_FTRS_POWER7,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -443,6 +483,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_POWER4,
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7",
 	},
 	{	/* Power7+ */
@@ -451,6 +493,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7+ (raw)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
 		.mmu_features		= MMU_FTRS_POWER7,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -460,8 +503,70 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_POWER4,
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7+",
 	},
+	{	/* Power8E */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004b0000,
+		.cpu_name		= "POWER8E (raw)",
+		.cpu_features		= CPU_FTRS_POWER8E,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power8",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
+	{	/* Power8 DD1: Does not support doorbell IPIs */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x004d0100,
+		.cpu_name		= "POWER8 (raw)",
+		.cpu_features		= CPU_FTRS_POWER8_DD1,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power8",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
+	{	/* Power8 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004d0000,
+		.cpu_name		= "POWER8 (raw)",
+		.cpu_features		= CPU_FTRS_POWER8,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power8",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
 	{	/* Cell Broadband Engine */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00700000,
@@ -1816,7 +1921,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.platform		= "ppc440",
 	},
 	{ /* 464 in APM821xx */
-		.pvr_mask		= 0xffffff00,
+		.pvr_mask		= 0xfffffff0,
 		.pvr_value		= 0x12C41C80,
 		.cpu_name		= "APM821XX",
 		.cpu_features		= CPU_FTRS_44X,
@@ -1954,6 +2059,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_BOOKE |
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
@@ -1973,6 +2079,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP |
 			PPC_FEATURE_HAS_EFP_DOUBLE_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
@@ -1989,6 +2096,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e500mc",
 		.cpu_features		= CPU_FTRS_E500MC,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
 			MMU_FTR_USE_TLBILX,
 		.icache_bsize		= 64,
@@ -2007,6 +2115,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e5500",
 		.cpu_features		= CPU_FTRS_E5500,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
 			MMU_FTR_USE_TLBILX,
 		.icache_bsize		= 64,
@@ -2015,10 +2124,34 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_cpu_type	= "ppc/e500mc",
 		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
 		.cpu_setup		= __setup_cpu_e5500,
+#ifndef CONFIG_PPC32
 		.cpu_restore		= __restore_cpu_e5500,
+#endif
 		.machine_check		= machine_check_e500mc,
 		.platform		= "ppce5500",
 	},
+	{	/* e6500 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80400000,
+		.cpu_name		= "e6500",
+		.cpu_features		= CPU_FTRS_E6500,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
+			MMU_FTR_USE_TLBILX,
+		.icache_bsize		= 64,
+		.dcache_bsize		= 64,
+		.num_pmcs		= 6,
+		.oprofile_cpu_type	= "ppc/e6500",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.cpu_setup		= __setup_cpu_e6500,
+#ifndef CONFIG_PPC32
+		.cpu_restore		= __restore_cpu_e6500,
+#endif
+		.machine_check		= machine_check_e500mc,
+		.platform		= "ppce6500",
+	},
 #ifdef CONFIG_PPC32
 	{	/* default match */
 		.pvr_mask		= 0x00000000,
@@ -2036,44 +2169,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
 	}
 #endif /* CONFIG_PPC32 */
 #endif /* CONFIG_E500 */
-
-#ifdef CONFIG_PPC_A2
-	{	/* Standard A2 (>= DD2) + FPU core */
-		.pvr_mask		= 0xffff0000,
-		.pvr_value		= 0x00480000,
-		.cpu_name		= "A2 (>= DD2)",
-		.cpu_features		= CPU_FTRS_A2,
-		.cpu_user_features	= COMMON_USER_PPC64,
-		.mmu_features		= MMU_FTRS_A2,
-		.icache_bsize		= 64,
-		.dcache_bsize		= 64,
-		.num_pmcs		= 0,
-		.cpu_setup		= __setup_cpu_a2,
-		.cpu_restore		= __restore_cpu_a2,
-		.machine_check		= machine_check_generic,
-		.platform		= "ppca2",
-	},
-	{	/* This is a default entry to get going, to be replaced by
-		 * a real one at some stage
-		 */
-#define CPU_FTRS_BASE_BOOK3E	(CPU_FTR_USE_TB | \
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \
-	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
-		.pvr_mask		= 0x00000000,
-		.pvr_value		= 0x00000000,
-		.cpu_name		= "Book3E",
-		.cpu_features		= CPU_FTRS_BASE_BOOK3E,
-		.cpu_user_features	= COMMON_USER_PPC64,
-		.mmu_features		= MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX |
-					  MMU_FTR_USE_TLBIVAX_BCAST |
-					  MMU_FTR_LOCK_BCAST_INVAL,
-		.icache_bsize		= 64,
-		.dcache_bsize		= 64,
-		.num_pmcs		= 0,
-		.machine_check		= machine_check_generic,
-		.platform		= "power6",
-	},
-#endif /* CONFIG_PPC_A2 */
 };
 
 static struct cpu_spec the_cpu_spec;
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index abef75176c0..51dbace3269 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -17,7 +17,6 @@
 #include <linux/export.h>
 #include <linux/crash_dump.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/types.h>
 
@@ -27,8 +26,8 @@
 #include <asm/kdump.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
-#include <asm/system.h>
 #include <asm/setjmp.h>
+#include <asm/debug.h>
 
 /*
  * The primary CPU waits a while for all secondary CPUs to enter. This is to
@@ -82,7 +81,7 @@ void crash_ipi_callback(struct pt_regs *regs)
 	}
 
 	atomic_inc(&cpus_in_crash);
-	smp_mb__after_atomic_inc();
+	smp_mb__after_atomic();
 
 	/*
 	 * Starting the kdump boot.
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index b3ba5163eae..7a13f378ca2 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -69,16 +69,6 @@ void __init setup_kdump_trampoline(void)
 }
 #endif /* CONFIG_NONSTATIC_KERNEL */
 
-static int __init parse_savemaxmem(char *p)
-{
-	if (p)
-		saved_max_pfn = (memparse(p, &p) >> PAGE_SHIFT) - 1;
-
-	return 1;
-}
-__setup("savemaxmem=", parse_savemaxmem);
-
-
 static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize,
                                unsigned long offset, int userbuf)
 {
@@ -108,17 +98,19 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 			size_t csize, unsigned long offset, int userbuf)
 {
 	void  *vaddr;
+	phys_addr_t paddr;
 
 	if (!csize)
 		return 0;
 
 	csize = min_t(size_t, csize, PAGE_SIZE);
+	paddr = pfn << PAGE_SHIFT;
 
-	if ((min_low_pfn < pfn) && (pfn < max_pfn)) {
-		vaddr = __va(pfn << PAGE_SHIFT);
+	if (memblock_is_region_memory(paddr, csize)) {
+		vaddr = __va(paddr);
 		csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf);
 	} else {
-		vaddr = __ioremap(pfn << PAGE_SHIFT, PAGE_SIZE, 0);
+		vaddr = __ioremap(paddr, PAGE_SIZE, 0);
 		csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf);
 		iounmap(vaddr);
 	}
@@ -134,15 +126,15 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
 {
 	unsigned long addr;
-	const u32 *basep, *sizep;
+	const __be32 *basep, *sizep;
 	unsigned int rtas_start = 0, rtas_end = 0;
 
 	basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
 	sizep = of_get_property(rtas.dev, "rtas-size", NULL);
 
 	if (basep && sizep) {
-		rtas_start = *basep;
-		rtas_end = *basep + *sizep;
+		rtas_start = be32_to_cpup(basep);
+		rtas_end = rtas_start + be32_to_cpup(sizep);
 	}
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
@@ -150,10 +142,7 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
 		if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start))
 			continue;
 
-		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
-		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
-		free_page((unsigned long)__va(addr));
-		totalram_pages++;
+		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
 	}
 }
 #endif
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 2cc451aaaca..d55c76c571f 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -21,14 +21,16 @@
 #ifdef CONFIG_SMP
 void doorbell_setup_this_cpu(void)
 {
-	unsigned long tag = mfspr(SPRN_PIR) & 0x3fff;
+	unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK;
 
 	smp_muxed_ipi_set_data(smp_processor_id(), tag);
 }
 
 void doorbell_cause_ipi(int cpu, unsigned long data)
 {
-	ppc_msgsnd(PPC_DBELL, 0, data);
+	/* Order previous accesses vs. msgsnd, which is treated as a store */
+	mb();
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data);
 }
 
 void doorbell_exception(struct pt_regs *regs)
@@ -37,6 +39,10 @@ void doorbell_exception(struct pt_regs *regs)
 
 	irq_enter();
 
+	may_hard_irq_enable();
+
+	__get_cpu_var(irq_stat).doorbell_irqs++;
+
 	smp_ipi_demux();
 
 	irq_exit();
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 3f6464b4d97..54d0116256f 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -17,7 +17,8 @@
  * to the dma address (mapping) of the first page.
  */
 static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
-				      dma_addr_t *dma_handle, gfp_t flag)
+				      dma_addr_t *dma_handle, gfp_t flag,
+				      struct dma_attrs *attrs)
 {
 	return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
 				    dma_handle, dev->coherent_dma_mask, flag,
@@ -25,7 +26,8 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
 }
 
 static void dma_iommu_free_coherent(struct device *dev, size_t size,
-				    void *vaddr, dma_addr_t dma_handle)
+				    void *vaddr, dma_addr_t dma_handle,
+				    struct dma_attrs *attrs)
 {
 	iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
 }
@@ -81,11 +83,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask)
 		return 0;
 	}
 
-	if ((tbl->it_offset + tbl->it_size) > (mask >> IOMMU_PAGE_SHIFT)) {
-		dev_info(dev, "Warning: IOMMU window too big for device mask\n");
-		dev_info(dev, "mask: 0x%08llx, table end: 0x%08lx\n",
-				mask, (tbl->it_offset + tbl->it_size) <<
-				IOMMU_PAGE_SHIFT);
+	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
+		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
+		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
+				mask, tbl->it_offset << tbl->it_page_shift);
 		return 0;
 	} else
 		return 1;
@@ -105,8 +106,9 @@ static u64 dma_iommu_get_required_mask(struct device *dev)
 }
 
 struct dma_map_ops dma_iommu_ops = {
-	.alloc_coherent		= dma_iommu_alloc_coherent,
-	.free_coherent		= dma_iommu_free_coherent,
+	.alloc			= dma_iommu_alloc_coherent,
+	.free			= dma_iommu_free_coherent,
+	.mmap			= dma_direct_mmap_coherent,
 	.map_sg			= dma_iommu_map_sg,
 	.unmap_sg		= dma_iommu_unmap_sg,
 	.dma_supported		= dma_iommu_dma_supported,
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 1ebc9189aad..bd1a2aba599 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/dma-mapping.h>
+#include <linux/memblock.h>
 #include <linux/pfn.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
@@ -20,7 +21,6 @@
 #include <asm/machdep.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
-#include <asm/abs_addr.h>
 
 unsigned int ppc_swiotlb_enable;
 
@@ -47,8 +47,9 @@ static u64 swiotlb_powerpc_get_required(struct device *dev)
  * for everything else.
  */
 struct dma_map_ops swiotlb_dma_ops = {
-	.alloc_coherent = dma_direct_alloc_coherent,
-	.free_coherent = dma_direct_free_coherent,
+	.alloc = dma_direct_alloc_coherent,
+	.free = dma_direct_free_coherent,
+	.mmap = dma_direct_mmap_coherent,
 	.map_sg = swiotlb_map_sg_attrs,
 	.unmap_sg = swiotlb_unmap_sg_attrs,
 	.dma_supported = swiotlb_dma_supported,
@@ -104,3 +105,23 @@ int __init swiotlb_setup_bus_notifier(void)
 			      &ppc_swiotlb_plat_bus_notifier);
 	return 0;
 }
+
+void swiotlb_detect_4g(void)
+{
+	if ((memblock_end_of_DRAM() - 1) > 0xffffffff)
+		ppc_swiotlb_enable = 1;
+}
+
+static int __init swiotlb_late_init(void)
+{
+	if (ppc_swiotlb_enable) {
+		swiotlb_print_info();
+		set_pci_dma_ops(&swiotlb_dma_ops);
+		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
+	} else {
+		swiotlb_free();
+	}
+
+	return 0;
+}
+subsys_initcall(swiotlb_late_init);
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 7d0233c12ee..ee78f6e49d6 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -11,8 +11,9 @@
 #include <linux/gfp.h>
 #include <linux/memblock.h>
 #include <linux/export.h>
+#include <linux/pci.h>
+#include <asm/vio.h>
 #include <asm/bug.h>
-#include <asm/abs_addr.h>
 #include <asm/machdep.h>
 
 /*
@@ -26,7 +27,8 @@
 
 
 void *dma_direct_alloc_coherent(struct device *dev, size_t size,
-				dma_addr_t *dma_handle, gfp_t flag)
+				dma_addr_t *dma_handle, gfp_t flag,
+				struct dma_attrs *attrs)
 {
 	void *ret;
 #ifdef CONFIG_NOT_COHERENT_CACHE
@@ -47,14 +49,15 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 		return NULL;
 	ret = page_address(page);
 	memset(ret, 0, size);
-	*dma_handle = virt_to_abs(ret) + get_dma_offset(dev);
+	*dma_handle = __pa(ret) + get_dma_offset(dev);
 
 	return ret;
 #endif
 }
 
 void dma_direct_free_coherent(struct device *dev, size_t size,
-			      void *vaddr, dma_addr_t dma_handle)
+			      void *vaddr, dma_addr_t dma_handle,
+			      struct dma_attrs *attrs)
 {
 #ifdef CONFIG_NOT_COHERENT_CACHE
 	__dma_free_coherent(size, vaddr);
@@ -63,6 +66,24 @@ void dma_direct_free_coherent(struct device *dev, size_t size,
 #endif
 }
 
+int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
+			     void *cpu_addr, dma_addr_t handle, size_t size,
+			     struct dma_attrs *attrs)
+{
+	unsigned long pfn;
+
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
+#else
+	pfn = page_to_pfn(virt_to_page(cpu_addr));
+#endif
+	return remap_pfn_range(vma, vma->vm_start,
+			       pfn + vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
 static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 			     int nents, enum dma_data_direction direction,
 			     struct dma_attrs *attrs)
@@ -150,8 +171,9 @@ static inline void dma_direct_sync_single(struct device *dev,
 #endif
 
 struct dma_map_ops dma_direct_ops = {
-	.alloc_coherent			= dma_direct_alloc_coherent,
-	.free_coherent			= dma_direct_free_coherent,
+	.alloc				= dma_direct_alloc_coherent,
+	.free				= dma_direct_free_coherent,
+	.mmap				= dma_direct_mmap_coherent,
 	.map_sg				= dma_direct_map_sg,
 	.unmap_sg			= dma_direct_unmap_sg,
 	.dma_supported			= dma_direct_dma_supported,
@@ -169,12 +191,10 @@ EXPORT_SYMBOL(dma_direct_ops);
 
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
 
-int dma_set_mask(struct device *dev, u64 dma_mask)
+int __dma_set_mask(struct device *dev, u64 dma_mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	if (ppc_md.dma_set_mask)
-		return ppc_md.dma_set_mask(dev, dma_mask);
 	if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
 		return dma_ops->set_dma_mask(dev, dma_mask);
 	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
@@ -182,6 +202,12 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 	*dev->dma_mask = dma_mask;
 	return 0;
 }
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (ppc_md.dma_set_mask)
+		return ppc_md.dma_set_mask(dev, dma_mask);
+	return __dma_set_mask(dev, dma_mask);
+}
 EXPORT_SYMBOL(dma_set_mask);
 
 u64 dma_get_required_mask(struct device *dev)
@@ -203,26 +229,15 @@ EXPORT_SYMBOL_GPL(dma_get_required_mask);
 
 static int __init dma_init(void)
 {
-       dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+#ifdef CONFIG_PCI
+	dma_debug_add_bus(&pci_bus_type);
+#endif
+#ifdef CONFIG_IBMVIO
+	dma_debug_add_bus(&vio_bus_type);
+#endif
 
        return 0;
 }
 fs_initcall(dma_init);
 
-int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
-		      void *cpu_addr, dma_addr_t handle, size_t size)
-{
-	unsigned long pfn;
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
-#else
-	pfn = page_to_pfn(virt_to_page(cpu_addr));
-#endif
-	return remap_pfn_range(vma, vma->vm_start,
-			       pfn + vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start,
-			       vma->vm_page_prot);
-}
-EXPORT_SYMBOL_GPL(dma_mmap_coherent);
diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c
deleted file mode 100644
index cb2e2949c8d..00000000000
--- a/arch/powerpc/kernel/e500-pmu.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Performance counter support for e500 family processors.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- * Copyright 2010 Freescale Semiconductor, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/string.h>
-#include <linux/perf_event.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Map of generic hardware event types to hardware events
- * Zero if unsupported
- */
-static int e500_generic_events[] = {
-	[PERF_COUNT_HW_CPU_CYCLES] = 1,
-	[PERF_COUNT_HW_INSTRUCTIONS] = 2,
-	[PERF_COUNT_HW_CACHE_MISSES] = 41, /* Data L1 cache reloads */
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 12,
-	[PERF_COUNT_HW_BRANCH_MISSES] = 15,
-};
-
-#define C(x)	PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
-	/*
-	 * D-cache misses are not split into read/write/prefetch;
-	 * use raw event 41.
-	 */
-	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	27,		0	},
-		[C(OP_WRITE)] = {	28,		0	},
-		[C(OP_PREFETCH)] = {	29,		0	},
-	},
-	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	2,		60	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	0,		0	},
-	},
-	/*
-	 * Assuming LL means L2, it's not a good match for this model.
-	 * It allocates only on L1 castout or explicit prefetch, and
-	 * does not have separate read/write events (but it does have
-	 * separate instruction/data events).
-	 */
-	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0,		0	},
-		[C(OP_WRITE)] = {	0,		0	},
-		[C(OP_PREFETCH)] = {	0,		0	},
-	},
-	/*
-	 * There are data/instruction MMU misses, but that's a miss on
-	 * the chip's internal level-one TLB which is probably not
-	 * what the user wants.  Instead, unified level-two TLB misses
-	 * are reported here.
-	 */
-	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	26,		66	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	12,		15 	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	-1,		-1 	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-};
-
-static int num_events = 128;
-
-/* Upper half of event id is PMLCb, for threshold events */
-static u64 e500_xlate_event(u64 event_id)
-{
-	u32 event_low = (u32)event_id;
-	u64 ret;
-
-	if (event_low >= num_events)
-		return 0;
-
-	ret = FSL_EMB_EVENT_VALID;
-
-	if (event_low >= 76 && event_low <= 81) {
-		ret |= FSL_EMB_EVENT_RESTRICTED;
-		ret |= event_id &
-		       (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH);
-	} else if (event_id &
-	           (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH)) {
-		/* Threshold requested on non-threshold event */
-		return 0;
-	}
-
-	return ret;
-}
-
-static struct fsl_emb_pmu e500_pmu = {
-	.name			= "e500 family",
-	.n_counter		= 4,
-	.n_restricted		= 2,
-	.xlate_event		= e500_xlate_event,
-	.n_generic		= ARRAY_SIZE(e500_generic_events),
-	.generic_events		= e500_generic_events,
-	.cache_events		= &e500_cache_events,
-};
-
-static int init_e500_pmu(void)
-{
-	if (!cur_cpu_spec->oprofile_cpu_type)
-		return -ENODEV;
-
-	if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500mc"))
-		num_events = 256;
-	else if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500"))
-		return -ENODEV;
-
-	return register_fsl_emb_pmu(&e500_pmu);
-}
-
-early_initcall(init_e500_pmu);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
new file mode 100644
index 00000000000..86e25702aac
--- /dev/null
+++ b/arch/powerpc/kernel/eeh.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/reboot.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/debug.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+
+/** Overview:
+ *  EEH, or "Extended Error Handling" is a PCI bridge technology for
+ *  dealing with PCI bus errors that can't be dealt with within the
+ *  usual PCI framework, except by check-stopping the CPU.  Systems
+ *  that are designed for high-availability/reliability cannot afford
+ *  to crash due to a "mere" PCI error, thus the need for EEH.
+ *  An EEH-capable bridge operates by converting a detected error
+ *  into a "slot freeze", taking the PCI adapter off-line, making
+ *  the slot behave, from the OS'es point of view, as if the slot
+ *  were "empty": all reads return 0xff's and all writes are silently
+ *  ignored.  EEH slot isolation events can be triggered by parity
+ *  errors on the address or data busses (e.g. during posted writes),
+ *  which in turn might be caused by low voltage on the bus, dust,
+ *  vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ *  Note, however, that one of the leading causes of EEH slot
+ *  freeze events are buggy device drivers, buggy device microcode,
+ *  or buggy device hardware.  This is because any attempt by the
+ *  device to bus-master data to a memory address that is not
+ *  assigned to the device will trigger a slot freeze.   (The idea
+ *  is to prevent devices-gone-wild from corrupting system memory).
+ *  Buggy hardware/drivers will have a miserable time co-existing
+ *  with EEH.
+ *
+ *  Ideally, a PCI device driver, when suspecting that an isolation
+ *  event has occurred (e.g. by reading 0xff's), will then ask EEH
+ *  whether this is the case, and then take appropriate steps to
+ *  reset the PCI slot, the PCI device, and then resume operations.
+ *  However, until that day,  the checking is done here, with the
+ *  eeh_check_failure() routine embedded in the MMIO macros.  If
+ *  the slot is found to be isolated, an "EEH Event" is synthesized
+ *  and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS	2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000)
+
+/*
+ * EEH probe mode support, which is part of the flags,
+ * is to support multiple platforms for EEH. Some platforms
+ * like pSeries do PCI emunation based on device tree.
+ * However, other platforms like powernv probe PCI devices
+ * from hardware. The flag is used to distinguish that.
+ * In addition, struct eeh_ops::probe would be invoked for
+ * particular OF node or PCI device so that the corresponding
+ * PE would be created there.
+ */
+int eeh_subsystem_flags;
+EXPORT_SYMBOL(eeh_subsystem_flags);
+
+/* Platform dependent EEH operations */
+struct eeh_ops *eeh_ops = NULL;
+
+/* Lock to avoid races due to multiple reports of an error */
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting pci register dumps. Its here in BSS, and
+ * not dynamically alloced, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN 4096
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/*
+ * The struct is used to maintain the EEH global statistic
+ * information. Besides, the EEH global statistics will be
+ * exported to user space through procfs
+ */
+struct eeh_stats {
+	u64 no_device;		/* PCI device not found		*/
+	u64 no_dn;		/* OF node not found		*/
+	u64 no_cfg_addr;	/* Config address not found	*/
+	u64 ignored_check;	/* EEH check skipped		*/
+	u64 total_mmio_ffs;	/* Total EEH checks		*/
+	u64 false_positives;	/* Unnecessary EEH checks	*/
+	u64 slot_resets;	/* PE reset			*/
+};
+
+static struct eeh_stats eeh_stats;
+
+#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
+
+static int __init eeh_setup(char *str)
+{
+	if (!strcmp(str, "off"))
+		eeh_subsystem_flags |= EEH_FORCE_DISABLED;
+
+	return 1;
+}
+__setup("eeh=", eeh_setup);
+
+/**
+ * eeh_gather_pci_data - Copy assorted PCI config space registers to buff
+ * @edev: device to report data for
+ * @buf: point to buffer in which to log
+ * @len: amount of room in buffer
+ *
+ * This routine captures assorted PCI configuration space data,
+ * and puts them into a buffer for RTAS error logging.
+ */
+static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
+{
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+	u32 cfg;
+	int cap, i;
+	int n = 0;
+
+	n += scnprintf(buf+n, len-n, "%s\n", dn->full_name);
+	pr_warn("EEH: of node=%s\n", dn->full_name);
+
+	eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg);
+	n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
+	pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+
+	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg);
+	n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
+	pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+
+	/* Gather bridge-specific registers */
+	if (edev->mode & EEH_DEV_BRIDGE) {
+		eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg);
+		n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
+		pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+
+		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg);
+		n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
+		pr_warn("EEH: Bridge control: %04x\n", cfg);
+	}
+
+	/* Dump out the PCI-X command and status regs */
+	cap = edev->pcix_cap;
+	if (cap) {
+		eeh_ops->read_config(dn, cap, 4, &cfg);
+		n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
+		pr_warn("EEH: PCI-X cmd: %08x\n", cfg);
+
+		eeh_ops->read_config(dn, cap+4, 4, &cfg);
+		n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
+		pr_warn("EEH: PCI-X status: %08x\n", cfg);
+	}
+
+	/* If PCI-E capable, dump PCI-E cap 10 */
+	cap = edev->pcie_cap;
+	if (cap) {
+		n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
+		pr_warn("EEH: PCI-E capabilities and status follow:\n");
+
+		for (i=0; i<=8; i++) {
+			eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+			pr_warn("EEH: PCI-E %02x: %08x\n", i, cfg);
+		}
+	}
+
+	/* If AER capable, dump it */
+	cap = edev->aer_cap;
+	if (cap) {
+		n += scnprintf(buf+n, len-n, "pci-e AER:\n");
+		pr_warn("EEH: PCI-E AER capability register set follows:\n");
+
+		for (i=0; i<14; i++) {
+			eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+			pr_warn("EEH: PCI-E AER %02x: %08x\n", i, cfg);
+		}
+	}
+
+	return n;
+}
+
+/**
+ * eeh_slot_error_detail - Generate combined log including driver log and error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ *
+ * This routine should be called to generate the combined log, which
+ * is comprised of driver log and error log. The driver log is figured
+ * out from the config space of the corresponding PCI device, while
+ * the error log is fetched through platform dependent function call.
+ */
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
+{
+	size_t loglen = 0;
+	struct eeh_dev *edev, *tmp;
+
+	/*
+	 * When the PHB is fenced or dead, it's pointless to collect
+	 * the data from PCI config space because it should return
+	 * 0xFF's. For ER, we still retrieve the data from the PCI
+	 * config space.
+	 *
+	 * For pHyp, we have to enable IO for log retrieval. Otherwise,
+	 * 0xFF's is always returned from PCI config space.
+	 */
+	if (!(pe->type & EEH_PE_PHB)) {
+		if (eeh_probe_mode_devtree())
+			eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+		eeh_ops->configure_bridge(pe);
+		eeh_pe_restore_bars(pe);
+
+		pci_regs_buf[0] = 0;
+		eeh_pe_for_each_dev(pe, edev, tmp) {
+			loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
+						      EEH_PCI_REGS_LOG_LEN - loglen);
+		}
+	}
+
+	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+	pte_t *ptep;
+	unsigned long pa;
+	int hugepage_shift;
+
+	/*
+	 * We won't find hugepages here, iomem
+	 */
+	ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+	if (!ptep)
+		return token;
+	WARN_ON(hugepage_shift);
+	pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+	return pa | (token & (PAGE_SIZE-1));
+}
+
+/*
+ * On PowerNV platform, we might already have fenced PHB there.
+ * For that case, it's meaningless to recover frozen PE. Intead,
+ * We have to handle fenced PHB firstly.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+	struct eeh_pe *phb_pe;
+	unsigned long flags;
+	int ret;
+
+	if (!eeh_probe_mode_dev())
+		return -EPERM;
+
+	/* Find the PHB PE */
+	phb_pe = eeh_phb_pe_get(pe->phb);
+	if (!phb_pe) {
+		pr_warning("%s Can't find PE for PHB#%d\n",
+			   __func__, pe->phb->global_number);
+		return -EEXIST;
+	}
+
+	/* If the PHB has been in problematic state */
+	eeh_serialize_lock(&flags);
+	if (phb_pe->state & EEH_PE_ISOLATED) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Check PHB state */
+	ret = eeh_ops->get_state(phb_pe, NULL);
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) ||
+	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Isolate the PHB and send event */
+	eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+	eeh_serialize_unlock(flags);
+
+	pr_err("EEH: PHB#%x failure detected, location: %s\n",
+		phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
+	dump_stack();
+	eeh_send_failure_event(phb_pe);
+
+	return 1;
+out:
+	eeh_serialize_unlock(flags);
+	return ret;
+}
+
+/**
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
+ *
+ * Check for an EEH failure for the given device node.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dev_check_failure(struct eeh_dev *edev)
+{
+	int ret;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+	unsigned long flags;
+	struct device_node *dn;
+	struct pci_dev *dev;
+	struct eeh_pe *pe, *parent_pe, *phb_pe;
+	int rc = 0;
+	const char *location;
+
+	eeh_stats.total_mmio_ffs++;
+
+	if (!eeh_enabled())
+		return 0;
+
+	if (!edev) {
+		eeh_stats.no_dn++;
+		return 0;
+	}
+	dn = eeh_dev_to_of_node(edev);
+	dev = eeh_dev_to_pci_dev(edev);
+	pe = edev->pe;
+
+	/* Access to IO BARs might get this far and still not want checking. */
+	if (!pe) {
+		eeh_stats.ignored_check++;
+		pr_debug("EEH: Ignored check for %s %s\n",
+			eeh_pci_name(dev), dn->full_name);
+		return 0;
+	}
+
+	if (!pe->addr && !pe->config_addr) {
+		eeh_stats.no_cfg_addr++;
+		return 0;
+	}
+
+	/*
+	 * On PowerNV platform, we might already have fenced PHB
+	 * there and we need take care of that firstly.
+	 */
+	ret = eeh_phb_check_failure(pe);
+	if (ret > 0)
+		return ret;
+
+	/* If we already have a pending isolation event for this
+	 * slot, we know it's bad already, we don't need to check.
+	 * Do this checking under a lock; as multiple PCI devices
+	 * in one slot might report errors simultaneously, and we
+	 * only want one error recovery routine running.
+	 */
+	eeh_serialize_lock(&flags);
+	rc = 1;
+	if (pe->state & EEH_PE_ISOLATED) {
+		pe->check_count++;
+		if (pe->check_count % EEH_MAX_FAILS == 0) {
+			location = of_get_property(dn, "ibm,loc-code", NULL);
+			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
+				"location=%s driver=%s pci addr=%s\n",
+				pe->check_count, location,
+				eeh_driver_name(dev), eeh_pci_name(dev));
+			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+				eeh_driver_name(dev));
+			dump_stack();
+		}
+		goto dn_unlock;
+	}
+
+	/*
+	 * Now test for an EEH failure.  This is VERY expensive.
+	 * Note that the eeh_config_addr may be a parent device
+	 * in the case of a device behind a bridge, or it may be
+	 * function zero of a multi-function device.
+	 * In any case they must share a common PHB.
+	 */
+	ret = eeh_ops->get_state(pe, NULL);
+
+	/* Note that config-io to empty slots may fail;
+	 * they are empty when they don't have children.
+	 * We will punt with the following conditions: Failure to get
+	 * PE's state, EEH not support and Permanently unavailable
+	 * state, PE is in good state.
+	 */
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) ||
+	    ((ret & active_flags) == active_flags)) {
+		eeh_stats.false_positives++;
+		pe->false_positives++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	/*
+	 * It should be corner case that the parent PE has been
+	 * put into frozen state as well. We should take care
+	 * that at first.
+	 */
+	parent_pe = pe->parent;
+	while (parent_pe) {
+		/* Hit the ceiling ? */
+		if (parent_pe->type & EEH_PE_PHB)
+			break;
+
+		/* Frozen parent PE ? */
+		ret = eeh_ops->get_state(parent_pe, NULL);
+		if (ret > 0 &&
+		    (ret & active_flags) != active_flags)
+			pe = parent_pe;
+
+		/* Next parent level */
+		parent_pe = parent_pe->parent;
+	}
+
+	eeh_stats.slot_resets++;
+
+	/* Avoid repeated reports of this failure, including problems
+	 * with other functions on this device, and functions under
+	 * bridges.
+	 */
+	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+	eeh_serialize_unlock(flags);
+
+	/* Most EEH events are due to device driver bugs.  Having
+	 * a stack trace will help the device-driver authors figure
+	 * out what happened.  So print that out.
+	 */
+	phb_pe = eeh_phb_pe_get(pe->phb);
+	pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+	       pe->phb->global_number, pe->addr);
+	pr_err("EEH: PE location: %s, PHB location: %s\n",
+	       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+	dump_stack();
+
+	eeh_send_failure_event(pe);
+
+	return 1;
+
+dn_unlock:
+	eeh_serialize_unlock(flags);
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
+
+/**
+ * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @token: I/O token, should be address in the form 0xA....
+ * @val: value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an EEH failure at the given token address.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+	unsigned long addr;
+	struct eeh_dev *edev;
+
+	/* Finding the phys addr + pci device; this is pretty quick. */
+	addr = eeh_token_to_phys((unsigned long __force) token);
+	edev = eeh_addr_cache_get_dev(addr);
+	if (!edev) {
+		eeh_stats.no_device++;
+		return val;
+	}
+
+	eeh_dev_check_failure(edev);
+	return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+
+/**
+ * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
+ * @pe: EEH PE
+ *
+ * This routine should be called to reenable frozen MMIO or DMA
+ * so that it would work correctly again. It's useful while doing
+ * recovery or log collection on the indicated device.
+ */
+int eeh_pci_enable(struct eeh_pe *pe, int function)
+{
+	int rc, flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+
+	/*
+	 * pHyp doesn't allow to enable IO or DMA on unfrozen PE.
+	 * Also, it's pointless to enable them on unfrozen PE. So
+	 * we have the check here.
+	 */
+	if (function == EEH_OPT_THAW_MMIO ||
+	    function == EEH_OPT_THAW_DMA) {
+		rc = eeh_ops->get_state(pe, NULL);
+		if (rc < 0)
+			return rc;
+
+		/* Needn't to enable or already enabled */
+		if ((rc == EEH_STATE_NOT_SUPPORT) ||
+		    ((rc & flags) == flags))
+			return 0;
+	}
+
+	rc = eeh_ops->set_option(pe, function);
+	if (rc)
+		pr_warn("%s: Unexpected state change %d on "
+			"PHB#%d-PE#%x, err=%d\n",
+			__func__, function, pe->phb->global_number,
+			pe->addr, rc);
+
+	rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+	if (rc <= 0)
+		return rc;
+
+	if ((function == EEH_OPT_THAW_MMIO) &&
+	    (rc & EEH_STATE_MMIO_ENABLED))
+		return 0;
+
+	if ((function == EEH_OPT_THAW_DMA) &&
+	    (rc & EEH_STATE_DMA_ENABLED))
+		return 0;
+
+	return rc;
+}
+
+/**
+ * pcibios_set_pcie_slot_reset - Set PCI-E reset state
+ * @dev: pci device struct
+ * @state: reset state to enter
+ *
+ * Return value:
+ * 	0 if success
+ */
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+	struct eeh_pe *pe = edev->pe;
+
+	if (!pe) {
+		pr_err("%s: No PE found on PCI device %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
+
+	switch (state) {
+	case pcie_deassert_reset:
+		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+		break;
+	case pcie_hot_reset:
+		eeh_ops->reset(pe, EEH_RESET_HOT);
+		break;
+	case pcie_warm_reset:
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+		break;
+	default:
+		return -EINVAL;
+	};
+
+	return 0;
+}
+
+/**
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collected the information for
+ * the indicated device and its children so that the bunch of the
+ * devices could be reset properly.
+ */
+static void *eeh_set_dev_freset(void *data, void *flag)
+{
+	struct pci_dev *dev;
+	unsigned int *freset = (unsigned int *)flag;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+
+	dev = eeh_dev_to_pci_dev(edev);
+	if (dev)
+		*freset |= dev->needs_freset;
+
+	return NULL;
+}
+
+/**
+ * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
+ * @pe: EEH PE
+ *
+ * Assert the PCI #RST line for 1/4 second.
+ */
+static void eeh_reset_pe_once(struct eeh_pe *pe)
+{
+	unsigned int freset = 0;
+
+	/* Determine type of EEH reset required for
+	 * Partitionable Endpoint, a hot-reset (1)
+	 * or a fundamental reset (3).
+	 * A fundamental reset required by any device under
+	 * Partitionable Endpoint trumps hot-reset.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
+
+	if (freset)
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+	else
+		eeh_ops->reset(pe, EEH_RESET_HOT);
+
+	eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+}
+
+/**
+ * eeh_reset_pe - Reset the indicated PE
+ * @pe: EEH PE
+ *
+ * This routine should be called to reset indicated device, including
+ * PE. A PE might include multiple PCI devices and sometimes PCI bridges
+ * might be involved as well.
+ */
+int eeh_reset_pe(struct eeh_pe *pe)
+{
+	int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+	int i, rc;
+
+	/* Take three shots at resetting the bus */
+	for (i=0; i<3; i++) {
+		eeh_reset_pe_once(pe);
+
+		/*
+		 * EEH_PE_ISOLATED is expected to be removed after
+		 * BAR restore.
+		 */
+		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+		if ((rc & flags) == flags)
+			return 0;
+
+		if (rc < 0) {
+			pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+				__func__, pe->phb->global_number, pe->addr);
+			return -1;
+		}
+		pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
+			i+1, pe->phb->global_number, pe->addr, rc);
+	}
+
+	return -1;
+}
+
+/**
+ * eeh_save_bars - Save device bars
+ * @edev: PCI device associated EEH device
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ */
+void eeh_save_bars(struct eeh_dev *edev)
+{
+	int i;
+	struct device_node *dn;
+
+	if (!edev)
+		return;
+	dn = eeh_dev_to_of_node(edev);
+
+	for (i = 0; i < 16; i++)
+		eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
+
+	/*
+	 * For PCI bridges including root port, we need enable bus
+	 * master explicitly. Otherwise, it can't fetch IODA table
+	 * entries correctly. So we cache the bit in advance so that
+	 * we can restore it after reset, either PHB range or PE range.
+	 */
+	if (edev->mode & EEH_DEV_BRIDGE)
+		edev->config_space[1] |= PCI_COMMAND_MASTER;
+}
+
+/**
+ * eeh_ops_register - Register platform dependent EEH operations
+ * @ops: platform dependent EEH operations
+ *
+ * Register the platform dependent EEH operation callback
+ * functions. The platform should call this function before
+ * any other EEH operations.
+ */
+int __init eeh_ops_register(struct eeh_ops *ops)
+{
+	if (!ops->name) {
+		pr_warning("%s: Invalid EEH ops name for %p\n",
+			__func__, ops);
+		return -EINVAL;
+	}
+
+	if (eeh_ops && eeh_ops != ops) {
+		pr_warning("%s: EEH ops of platform %s already existing (%s)\n",
+			__func__, eeh_ops->name, ops->name);
+		return -EEXIST;
+	}
+
+	eeh_ops = ops;
+
+	return 0;
+}
+
+/**
+ * eeh_ops_unregister - Unreigster platform dependent EEH operations
+ * @name: name of EEH platform operations
+ *
+ * Unregister the platform dependent EEH operation callback
+ * functions.
+ */
+int __exit eeh_ops_unregister(const char *name)
+{
+	if (!name || !strlen(name)) {
+		pr_warning("%s: Invalid EEH ops name\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	if (eeh_ops && !strcmp(eeh_ops->name, name)) {
+		eeh_ops = NULL;
+		return 0;
+	}
+
+	return -EEXIST;
+}
+
+static int eeh_reboot_notifier(struct notifier_block *nb,
+			       unsigned long action, void *unused)
+{
+	eeh_set_enable(false);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_reboot_nb = {
+	.notifier_call = eeh_reboot_notifier,
+};
+
+/**
+ * eeh_init - EEH initialization
+ *
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check.  If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+int eeh_init(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct device_node *phb;
+	static int cnt = 0;
+	int ret = 0;
+
+	/*
+	 * We have to delay the initialization on PowerNV after
+	 * the PCI hierarchy tree has been built because the PEs
+	 * are figured out based on PCI devices instead of device
+	 * tree nodes
+	 */
+	if (machine_is(powernv) && cnt++ <= 0)
+		return ret;
+
+	/* Register reboot notifier */
+	ret = register_reboot_notifier(&eeh_reboot_nb);
+	if (ret) {
+		pr_warn("%s: Failed to register notifier (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	/* call platform initialization function */
+	if (!eeh_ops) {
+		pr_warning("%s: Platform EEH operation not found\n",
+			__func__);
+		return -EEXIST;
+	} else if ((ret = eeh_ops->init())) {
+		pr_warning("%s: Failed to call platform init function (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	/* Initialize EEH event */
+	ret = eeh_event_init();
+	if (ret)
+		return ret;
+
+	/* Enable EEH for all adapters */
+	if (eeh_probe_mode_devtree()) {
+		list_for_each_entry_safe(hose, tmp,
+			&hose_list, list_node) {
+			phb = hose->dn;
+			traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
+		}
+	} else if (eeh_probe_mode_dev()) {
+		list_for_each_entry_safe(hose, tmp,
+			&hose_list, list_node)
+			pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL);
+	} else {
+		pr_warn("%s: Invalid probe mode %x",
+			__func__, eeh_subsystem_flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * Call platform post-initialization. Actually, It's good chance
+	 * to inform platform that EEH is ready to supply service if the
+	 * I/O cache stuff has been built up.
+	 */
+	if (eeh_ops->post_init) {
+		ret = eeh_ops->post_init();
+		if (ret)
+			return ret;
+	}
+
+	if (eeh_enabled())
+		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
+	else
+		pr_warning("EEH: No capable adapters found\n");
+
+	return ret;
+}
+
+core_initcall_sync(eeh_init);
+
+/**
+ * eeh_add_device_early - Enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+void eeh_add_device_early(struct device_node *dn)
+{
+	struct pci_controller *phb;
+
+	/*
+	 * If we're doing EEH probe based on PCI device, we
+	 * would delay the probe until late stage because
+	 * the PCI device isn't available this moment.
+	 */
+	if (!eeh_probe_mode_devtree())
+		return;
+
+	if (!of_node_to_eeh_dev(dn))
+		return;
+	phb = of_node_to_eeh_dev(dn)->phb;
+
+	/* USB Bus children of PCI devices will not have BUID's */
+	if (NULL == phb || 0 == phb->buid)
+		return;
+
+	eeh_ops->of_probe(dn, NULL);
+}
+
+/**
+ * eeh_add_device_tree_early - Enable EEH for the indicated device
+ * @dn: device node
+ *
+ * This routine must be used to perform EEH initialization for the
+ * indicated PCI device that was added after system boot (e.g.
+ * hotplug, dlpar).
+ */
+void eeh_add_device_tree_early(struct device_node *dn)
+{
+	struct device_node *sib;
+
+	for_each_child_of_node(dn, sib)
+		eeh_add_device_tree_early(sib);
+	eeh_add_device_early(dn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
+
+/**
+ * eeh_add_device_late - Perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+void eeh_add_device_late(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+
+	if (!dev || !eeh_enabled())
+		return;
+
+	pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+	dn = pci_device_to_OF_node(dev);
+	edev = of_node_to_eeh_dev(dn);
+	if (edev->pdev == dev) {
+		pr_debug("EEH: Already referenced !\n");
+		return;
+	}
+
+	/*
+	 * The EEH cache might not be removed correctly because of
+	 * unbalanced kref to the device during unplug time, which
+	 * relies on pcibios_release_device(). So we have to remove
+	 * that here explicitly.
+	 */
+	if (edev->pdev) {
+		eeh_rmv_from_parent_pe(edev);
+		eeh_addr_cache_rmv_dev(edev->pdev);
+		eeh_sysfs_remove_device(edev->pdev);
+		edev->mode &= ~EEH_DEV_SYSFS;
+
+		/*
+		 * We definitely should have the PCI device removed
+		 * though it wasn't correctly. So we needn't call
+		 * into error handler afterwards.
+		 */
+		edev->mode |= EEH_DEV_NO_HANDLER;
+
+		edev->pdev = NULL;
+		dev->dev.archdata.edev = NULL;
+	}
+
+	edev->pdev = dev;
+	dev->dev.archdata.edev = edev;
+
+	/*
+	 * We have to do the EEH probe here because the PCI device
+	 * hasn't been created yet in the early stage.
+	 */
+	if (eeh_probe_mode_dev())
+		eeh_ops->dev_probe(dev, NULL);
+
+	eeh_addr_cache_insert_dev(dev);
+}
+
+/**
+ * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_device_tree_late(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_add_device_late(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_device_tree_late(subbus);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
+
+/**
+ * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to add EEH sysfs files for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_sysfs_files(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_sysfs_add_device(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_sysfs_files(subbus);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
+
+/**
+ * eeh_remove_device - Undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ *
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar).  It unregisters
+ * the PCI device from the EEH subsystem.  I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
+ */
+void eeh_remove_device(struct pci_dev *dev)
+{
+	struct eeh_dev *edev;
+
+	if (!dev || !eeh_enabled())
+		return;
+	edev = pci_dev_to_eeh_dev(dev);
+
+	/* Unregister the device with the EEH/PCI address search system */
+	pr_debug("EEH: Removing device %s\n", pci_name(dev));
+
+	if (!edev || !edev->pdev || !edev->pe) {
+		pr_debug("EEH: Not referenced !\n");
+		return;
+	}
+
+	/*
+	 * During the hotplug for EEH error recovery, we need the EEH
+	 * device attached to the parent PE in order for BAR restore
+	 * a bit later. So we keep it for BAR restore and remove it
+	 * from the parent PE during the BAR resotre.
+	 */
+	edev->pdev = NULL;
+	dev->dev.archdata.edev = NULL;
+	if (!(edev->pe->state & EEH_PE_KEEP))
+		eeh_rmv_from_parent_pe(edev);
+	else
+		edev->mode |= EEH_DEV_DISCONNECTED;
+
+	/*
+	 * We're removing from the PCI subsystem, that means
+	 * the PCI device driver can't support EEH or not
+	 * well. So we rely on hotplug completely to do recovery
+	 * for the specific PCI device.
+	 */
+	edev->mode |= EEH_DEV_NO_HANDLER;
+
+	eeh_addr_cache_rmv_dev(dev);
+	eeh_sysfs_remove_device(dev);
+	edev->mode &= ~EEH_DEV_SYSFS;
+}
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+	if (!eeh_enabled()) {
+		seq_printf(m, "EEH Subsystem is globally disabled\n");
+		seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
+	} else {
+		seq_printf(m, "EEH Subsystem is enabled\n");
+		seq_printf(m,
+				"no device=%llu\n"
+				"no device node=%llu\n"
+				"no config address=%llu\n"
+				"check not wanted=%llu\n"
+				"eeh_total_mmio_ffs=%llu\n"
+				"eeh_false_positives=%llu\n"
+				"eeh_slot_resets=%llu\n",
+				eeh_stats.no_device,
+				eeh_stats.no_dn,
+				eeh_stats.no_cfg_addr,
+				eeh_stats.ignored_check,
+				eeh_stats.total_mmio_ffs,
+				eeh_stats.false_positives,
+				eeh_stats.slot_resets);
+	}
+
+	return 0;
+}
+
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_eeh_show, NULL);
+}
+
+static const struct file_operations proc_eeh_operations = {
+	.open      = proc_eeh_open,
+	.read      = seq_read,
+	.llseek    = seq_lseek,
+	.release   = single_release,
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int eeh_enable_dbgfs_set(void *data, u64 val)
+{
+	if (val)
+		eeh_subsystem_flags &= ~EEH_FORCE_DISABLED;
+	else
+		eeh_subsystem_flags |= EEH_FORCE_DISABLED;
+
+	/* Notify the backend */
+	if (eeh_ops->post_init)
+		eeh_ops->post_init();
+
+	return 0;
+}
+
+static int eeh_enable_dbgfs_get(void *data, u64 *val)
+{
+	if (eeh_enabled())
+		*val = 0x1ul;
+	else
+		*val = 0x0ul;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
+			eeh_enable_dbgfs_set, "0x%llx\n");
+#endif
+
+static int __init eeh_init_proc(void)
+{
+	if (machine_is(pseries) || machine_is(powernv)) {
+		proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
+#ifdef CONFIG_DEBUG_FS
+		debugfs_create_file("eeh_enable", 0600,
+                                    powerpc_debugfs_root, NULL,
+                                    &eeh_enable_dbgfs_ops);
+#endif
+	}
+
+	return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
new file mode 100644
index 00000000000..e8c9fd546a5
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -0,0 +1,310 @@
+/*
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem.  This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range {
+	struct rb_node rb_node;
+	unsigned long addr_lo;
+	unsigned long addr_hi;
+	struct eeh_dev *edev;
+	struct pci_dev *pcidev;
+	unsigned int flags;
+};
+
+static struct pci_io_addr_cache {
+	struct rb_root rb_root;
+	spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr)
+{
+	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (addr < piar->addr_lo)
+			n = n->rb_left;
+		else if (addr > piar->addr_hi)
+			n = n->rb_right;
+		else
+			return piar->edev;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_addr_cache_get_dev - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address.  Be sure to pci_dev_put the device
+ * when finished.  I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr)
+{
+	struct eeh_dev *edev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	edev = __eeh_addr_cache_get_device(addr);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+	return edev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+	struct rb_node *n;
+	int cnt = 0;
+
+	n = rb_first(&cache->rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+		pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n",
+		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+		       piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+		cnt++;
+		n = rb_next(n);
+	}
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+		      unsigned long ahi, unsigned int flags)
+{
+	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pci_io_addr_range *piar;
+
+	/* Walk tree, find a place to insert into tree */
+	while (*p) {
+		parent = *p;
+		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+		if (ahi < piar->addr_lo) {
+			p = &parent->rb_left;
+		} else if (alo > piar->addr_hi) {
+			p = &parent->rb_right;
+		} else {
+			if (dev != piar->pcidev ||
+			    alo != piar->addr_lo || ahi != piar->addr_hi) {
+				pr_warning("PIAR: overlapping address range\n");
+			}
+			return piar;
+		}
+	}
+	piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	if (!piar)
+		return NULL;
+
+	piar->addr_lo = alo;
+	piar->addr_hi = ahi;
+	piar->edev = pci_dev_to_eeh_dev(dev);
+	piar->pcidev = dev;
+	piar->flags = flags;
+
+#ifdef DEBUG
+	pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n",
+	                  alo, ahi, pci_name(dev));
+#endif
+
+	rb_link_node(&piar->rb_node, parent, p);
+	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+	return piar;
+}
+
+static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	int i;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev));
+		return;
+	}
+
+	edev = of_node_to_eeh_dev(dn);
+	if (!edev) {
+		pr_warning("PCI: no EEH dev found for dn=%s\n",
+			dn->full_name);
+		return;
+	}
+
+	/* Skip any devices for which EEH is not enabled. */
+	if (!eeh_probe_mode_dev() && !edev->pe) {
+#ifdef DEBUG
+		pr_info("PCI: skip building address cache for=%s - %s\n",
+			pci_name(dev), dn->full_name);
+#endif
+		return;
+	}
+
+	/* Walk resources on this device, poke them into the tree */
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		unsigned long start = pci_resource_start(dev,i);
+		unsigned long end = pci_resource_end(dev,i);
+		unsigned int flags = pci_resource_flags(dev,i);
+
+		/* We are interested only bus addresses, not dma or other stuff */
+		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+			 continue;
+		eeh_addr_cache_insert(dev, start, end, flags);
+	}
+}
+
+/**
+ * eeh_addr_cache_insert_dev - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	/* Ignore PCI bridges */
+	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+		return;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_insert_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	struct rb_node *n;
+
+restart:
+	n = rb_first(&pci_io_addr_cache_root.rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (piar->pcidev == dev) {
+			rb_erase(n, &pci_io_addr_cache_root.rb_root);
+			kfree(piar);
+			goto restart;
+		}
+		n = rb_next(n);
+	}
+}
+
+/**
+ * eeh_addr_cache_rmv_dev - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_rmv_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * eeh_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void eeh_addr_cache_build(void)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	struct pci_dev *dev = NULL;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	for_each_pci_dev(dev) {
+		dn = pci_device_to_OF_node(dev);
+		if (!dn)
+			continue;
+
+		edev = of_node_to_eeh_dev(dn);
+		if (!edev)
+			continue;
+
+		dev->dev.archdata.edev = edev;
+		edev->pdev = dev;
+
+		eeh_addr_cache_insert_dev(dev);
+		eeh_sysfs_add_device(dev);
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	eeh_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
new file mode 100644
index 00000000000..1efa28f5fc5
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -0,0 +1,112 @@
+/*
+ * The file intends to implement dynamic creation of EEH device, which will
+ * be bound with OF node and PCI device simutaneously. The EEH devices would
+ * be foundamental information for EEH core components to work proerly. Besides,
+ * We have to support multiple situations where dynamic creation of EEH device
+ * is required:
+ *
+ * 1) Before PCI emunation starts, we need create EEH devices according to the
+ *    PCI sensitive OF nodes.
+ * 2) When PCI emunation is done, we need do the binding between PCI device and
+ *    the associated EEH device.
+ * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
+ *    will be created while PCI sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
+ *    PHB is newly inserted, we also need create EEH devices accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @dn: device node
+ * @data: PHB
+ *
+ * It will create EEH device according to the given OF node. The function
+ * might be called by PCI emunation, DR, PHB hotplug.
+ */
+void *eeh_dev_init(struct device_node *dn, void *data)
+{
+	struct pci_controller *phb = data;
+	struct eeh_dev *edev;
+
+	/* Allocate EEH device */
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+	if (!edev) {
+		pr_warning("%s: out of memory\n", __func__);
+		return NULL;
+	}
+
+	/* Associate EEH device with OF node */
+	PCI_DN(dn)->edev = edev;
+	edev->dn  = dn;
+	edev->phb = phb;
+	INIT_LIST_HEAD(&edev->list);
+
+	return NULL;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its child association, then create the
+ * EEH devices accordingly
+ */
+void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node *dn = phb->dn;
+
+	/* EEH PE for PHB */
+	eeh_phb_pe_create(phb);
+
+	/* EEH device for PHB */
+	eeh_dev_init(dn, phb);
+
+	/* EEH devices for children OF nodes */
+	traverse_pci_devices(dn, eeh_dev_init, phb);
+}
+
+/**
+ * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
+ *
+ * Scan all the existing PHBs and create EEH devices for their OF
+ * nodes and their children OF nodes
+ */
+static int __init eeh_dev_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		eeh_dev_phb_init_dynamic(phb);
+
+	pr_info("EEH: devices created\n");
+
+	return 0;
+}
+
+core_initcall(eeh_dev_phb_init);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
new file mode 100644
index 00000000000..420da61d4ce
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -0,0 +1,870 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+/**
+ * eeh_pcid_name - Retrieve name of PCI device driver
+ * @pdev: PCI device
+ *
+ * This routine is used to retrieve the name of PCI device driver
+ * if that's valid.
+ */
+static inline const char *eeh_pcid_name(struct pci_dev *pdev)
+{
+	if (pdev && pdev->dev.driver)
+		return pdev->dev.driver->name;
+	return "";
+}
+
+/**
+ * eeh_pcid_get - Get the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is used to retrieve the PCI device driver for
+ * the indicated PCI device. Besides, we will increase the reference
+ * of the PCI device driver to prevent that being unloaded on
+ * the fly. Otherwise, kernel crash would be seen.
+ */
+static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return NULL;
+
+	if (!try_module_get(pdev->driver->driver.owner))
+		return NULL;
+
+	return pdev->driver;
+}
+
+/**
+ * eeh_pcid_put - Dereference on the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is called to do dereference on the PCI device
+ * driver of the indicated PCI device.
+ */
+static inline void eeh_pcid_put(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return;
+
+	module_put(pdev->driver->driver.owner);
+}
+
+#if 0
+static void print_device_node_tree(struct pci_dn *pdn, int dent)
+{
+	int i;
+	struct device_node *pc;
+
+	if (!pdn)
+		return;
+	for (i = 0; i < dent; i++)
+		printk(" ");
+	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
+		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
+		pdn->eeh_pe_config_addr, pdn->node->full_name);
+	dent += 3;
+	pc = pdn->node->child;
+	while (pc) {
+		print_device_node_tree(PCI_DN(pc), dent);
+		pc = pc->sibling;
+	}
+}
+#endif
+
+/**
+ * eeh_disable_irq - Disable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called when reporting temporary or permanent
+ * error to the particular PCI device to disable interrupt of that
+ * device. If the device has enabled MSI or MSI-X interrupt, we needn't
+ * do real work because EEH should freeze DMA transfers for those PCI
+ * devices encountering EEH errors, which includes MSI or MSI-X.
+ */
+static void eeh_disable_irq(struct pci_dev *dev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+	/* Don't disable MSI and MSI-X interrupts. They are
+	 * effectively disabled by the DMA Stopped state
+	 * when an EEH error occurs.
+	 */
+	if (dev->msi_enabled || dev->msix_enabled)
+		return;
+
+	if (!irq_has_action(dev->irq))
+		return;
+
+	edev->mode |= EEH_DEV_IRQ_DISABLED;
+	disable_irq_nosync(dev->irq);
+}
+
+/**
+ * eeh_enable_irq - Enable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called to enable interrupt while failed
+ * device could be resumed.
+ */
+static void eeh_enable_irq(struct pci_dev *dev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
+		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
+		/*
+		 * FIXME !!!!!
+		 *
+		 * This is just ass backwards. This maze has
+		 * unbalanced irq_enable/disable calls. So instead of
+		 * finding the root cause it works around the warning
+		 * in the irq_enable code by conditionally calling
+		 * into it.
+		 *
+		 * That's just wrong.The warning in the core code is
+		 * there to tell people to fix their assymetries in
+		 * their own code, not by abusing the core information
+		 * to avoid it.
+		 *
+		 * I so wish that the assymetry would be the other way
+		 * round and a few more irq_disable calls render that
+		 * shit unusable forever.
+		 *
+		 *	tglx
+		 */
+		if (irqd_irq_disabled(irq_get_irq_data(dev->irq)))
+			enable_irq(dev->irq);
+	}
+}
+
+static bool eeh_dev_removed(struct eeh_dev *edev)
+{
+	/* EEH device removed ? */
+	if (!edev || (edev->mode & EEH_DEV_REMOVED))
+		return true;
+
+	return false;
+}
+
+/**
+ * eeh_report_error - Report pci error to each device driver
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
+ */
+static void *eeh_report_error(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver;
+
+	if (!dev || eeh_dev_removed(edev))
+		return NULL;
+	dev->error_state = pci_channel_io_frozen;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_disable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->error_detected) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+
+	/* A driver that needs a reset trumps all others */
+	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled. Collects up and merges the device driver responses.
+ * Cumulative response passed back in "userdata".
+ */
+static void *eeh_report_mmio_enabled(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver;
+
+	if (!dev || eeh_dev_removed(edev))
+		return NULL;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->mmio_enabled ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	rc = driver->err_handler->mmio_enabled(dev);
+
+	/* A driver that needs a reset trumps all others */
+	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_reset - Tell device that slot has been reset
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called while EEH tries to reset particular
+ * PCI device so that the associated PCI device driver could take
+ * some actions, usually to save data the driver needs so that the
+ * driver can work again while the device is recovered.
+ */
+static void *eeh_report_reset(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver;
+
+	if (!dev || eeh_dev_removed(edev))
+		return NULL;
+	dev->error_state = pci_channel_io_normal;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_enable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->slot_reset ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	rc = driver->err_handler->slot_reset(dev);
+	if ((*res == PCI_ERS_RESULT_NONE) ||
+	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
+	if (*res == PCI_ERS_RESULT_DISCONNECT &&
+	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_resume - Tell device to resume normal operations
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called to notify the device driver that it
+ * could resume so that the device driver can do some initialization
+ * to make the recovered device work again.
+ */
+static void *eeh_report_resume(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_driver *driver;
+
+	if (!dev || eeh_dev_removed(edev))
+		return NULL;
+	dev->error_state = pci_channel_io_normal;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_enable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->resume ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+		edev->mode &= ~EEH_DEV_NO_HANDLER;
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	driver->err_handler->resume(dev);
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_failure - Tell device driver that device is dead.
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+static void *eeh_report_failure(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_driver *driver;
+
+	if (!dev || eeh_dev_removed(edev))
+		return NULL;
+	dev->error_state = pci_channel_io_perm_failure;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_disable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->error_detected) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+static void *eeh_rmv_device(void *data, void *userdata)
+{
+	struct pci_driver *driver;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	int *removed = (int *)userdata;
+
+	/*
+	 * Actually, we should remove the PCI bridges as well.
+	 * However, that's lots of complexity to do that,
+	 * particularly some of devices under the bridge might
+	 * support EEH. So we just care about PCI devices for
+	 * simplicity here.
+	 */
+	if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
+		return NULL;
+
+	/*
+	 * We rely on count-based pcibios_release_device() to
+	 * detach permanently offlined PEs. Unfortunately, that's
+	 * not reliable enough. We might have the permanently
+	 * offlined PEs attached, but we needn't take care of
+	 * them and their child devices.
+	 */
+	if (eeh_dev_removed(edev))
+		return NULL;
+
+	driver = eeh_pcid_get(dev);
+	if (driver) {
+		eeh_pcid_put(dev);
+		if (driver->err_handler)
+			return NULL;
+	}
+
+	/* Remove it from PCI subsystem */
+	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
+		 pci_name(dev));
+	edev->bus = dev->bus;
+	edev->mode |= EEH_DEV_DISCONNECTED;
+	(*removed)++;
+
+	pci_lock_rescan_remove();
+	pci_stop_and_remove_bus_device(dev);
+	pci_unlock_rescan_remove();
+
+	return NULL;
+}
+
+static void *eeh_pe_detach_dev(void *data, void *userdata)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct eeh_dev *edev, *tmp;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (!(edev->mode & EEH_DEV_DISCONNECTED))
+			continue;
+
+		edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
+		eeh_rmv_from_parent_pe(edev);
+	}
+
+	return NULL;
+}
+
+/*
+ * Explicitly clear PE's frozen state for PowerNV where
+ * we have frozen PE until BAR restore is completed. It's
+ * harmless to clear it for pSeries. To be consistent with
+ * PE reset (for 3 times), we try to clear the frozen state
+ * for 3 times as well.
+ */
+static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int i, rc;
+
+	for (i = 0; i < 3; i++) {
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+		if (rc)
+			continue;
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+		if (!rc)
+			break;
+	}
+
+	/* The PE has been isolated, clear it */
+	if (rc) {
+		pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
+			__func__, pe->phb->global_number, pe->addr, rc);
+		return (void *)pe;
+	}
+
+	return NULL;
+}
+
+static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+{
+	void *rc;
+
+	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
+	if (!rc)
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+	return rc ? -EIO : 0;
+}
+
+/**
+ * eeh_reset_device - Perform actual reset of a pci slot
+ * @pe: EEH PE
+ * @bus: PCI bus corresponding to the isolcated slot
+ *
+ * This routine must be called to do reset on the indicated PE.
+ * During the reset, udev might be invoked because those affected
+ * PCI devices will be removed and then added.
+ */
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
+{
+	struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
+	struct timeval tstamp;
+	int cnt, rc, removed = 0;
+
+	/* pcibios will clear the counter; save the value */
+	cnt = pe->freeze_count;
+	tstamp = pe->tstamp;
+
+	/*
+	 * We don't remove the corresponding PE instances because
+	 * we need the information afterwords. The attached EEH
+	 * devices are expected to be attached soon when calling
+	 * into pcibios_add_pci_devices().
+	 */
+	eeh_pe_state_mark(pe, EEH_PE_KEEP);
+	if (bus) {
+		pci_lock_rescan_remove();
+		pcibios_remove_pci_devices(bus);
+		pci_unlock_rescan_remove();
+	} else if (frozen_bus) {
+		eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
+	}
+
+	/*
+	 * Reset the pci controller. (Asserts RST#; resets config space).
+	 * Reconfigure bridges and devices. Don't try to bring the system
+	 * up if the reset failed for some reason.
+	 *
+	 * During the reset, it's very dangerous to have uncontrolled PCI
+	 * config accesses. So we prefer to block them. However, controlled
+	 * PCI config accesses initiated from EEH itself are allowed.
+	 */
+	eeh_pe_state_mark(pe, EEH_PE_RESET);
+	rc = eeh_reset_pe(pe);
+	if (rc) {
+		eeh_pe_state_clear(pe, EEH_PE_RESET);
+		return rc;
+	}
+
+	pci_lock_rescan_remove();
+
+	/* Restore PE */
+	eeh_ops->configure_bridge(pe);
+	eeh_pe_restore_bars(pe);
+	eeh_pe_state_clear(pe, EEH_PE_RESET);
+
+	/* Clear frozen state */
+	rc = eeh_clear_pe_frozen_state(pe);
+	if (rc)
+		return rc;
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
+	 * this is a hack, but if we don't do this, and try to bring
+	 * the device up before the scripts have taken it down,
+	 * potentially weird things happen.
+	 */
+	if (bus) {
+		pr_info("EEH: Sleep 5s ahead of complete hotplug\n");
+		ssleep(5);
+
+		/*
+		 * The EEH device is still connected with its parent
+		 * PE. We should disconnect it so the binding can be
+		 * rebuilt when adding PCI devices.
+		 */
+		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
+		pcibios_add_pci_devices(bus);
+	} else if (frozen_bus && removed) {
+		pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
+		ssleep(5);
+
+		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
+		pcibios_add_pci_devices(frozen_bus);
+	}
+	eeh_pe_state_clear(pe, EEH_PE_KEEP);
+
+	pe->tstamp = tstamp;
+	pe->freeze_count = cnt;
+
+	pci_unlock_rescan_remove();
+	return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 300
+
+static void eeh_handle_normal_event(struct eeh_pe *pe)
+{
+	struct pci_bus *frozen_bus;
+	int rc = 0;
+	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+
+	frozen_bus = eeh_pe_bus_get(pe);
+	if (!frozen_bus) {
+		pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+			__func__, pe->phb->global_number, pe->addr);
+		return;
+	}
+
+	eeh_pe_update_time_stamp(pe);
+	pe->freeze_count++;
+	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
+		goto excess_failures;
+	pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
+		pe->freeze_count);
+
+	/* Walk the various device drivers attached to this slot through
+	 * a reset sequence, giving each an opportunity to do what it needs
+	 * to accomplish the reset.  Each child gets a report of the
+	 * status ... if any child can't handle the reset, then the entire
+	 * slot is dlpar removed and added.
+	 */
+	pr_info("EEH: Notify device drivers to shutdown\n");
+	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
+
+	/* Get the current PCI slot state. This can take a long time,
+	 * sometimes over 3 seconds for certain systems.
+	 */
+	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
+	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+		pr_warning("EEH: Permanent failure\n");
+		goto hard_fail;
+	}
+
+	/* Since rtas may enable MMIO when posting the error log,
+	 * don't post the error log until after all dev drivers
+	 * have been informed.
+	 */
+	pr_info("EEH: Collect temporary log\n");
+	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
+
+	/* If all device drivers were EEH-unaware, then shut
+	 * down all of the device drivers, and hope they
+	 * go down willingly, without panicing the system.
+	 */
+	if (result == PCI_ERS_RESULT_NONE) {
+		pr_info("EEH: Reset with hotplug activity\n");
+		rc = eeh_reset_device(pe, frozen_bus);
+		if (rc) {
+			pr_warning("%s: Unable to reset, err=%d\n",
+				   __func__, rc);
+			goto hard_fail;
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable MMIO */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		pr_info("EEH: Enable I/O for affected devices\n");
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc) {
+			result = PCI_ERS_RESULT_NEED_RESET;
+		} else {
+			pr_info("EEH: Notify device drivers to resume I/O\n");
+			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable DMA */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		pr_info("EEH: Enabled DMA for affected devices\n");
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc) {
+			result = PCI_ERS_RESULT_NEED_RESET;
+		} else {
+			/*
+			 * We didn't do PE reset for the case. The PE
+			 * is still in frozen state. Clear it before
+			 * resuming the PE.
+			 */
+			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+			result = PCI_ERS_RESULT_RECOVERED;
+		}
+	}
+
+	/* If any device has a hard failure, then shut off everything. */
+	if (result == PCI_ERS_RESULT_DISCONNECT) {
+		pr_warning("EEH: Device driver gave up\n");
+		goto hard_fail;
+	}
+
+	/* If any device called out for a reset, then reset the slot */
+	if (result == PCI_ERS_RESULT_NEED_RESET) {
+		pr_info("EEH: Reset without hotplug activity\n");
+		rc = eeh_reset_device(pe, NULL);
+		if (rc) {
+			pr_warning("%s: Cannot reset, err=%d\n",
+				   __func__, rc);
+			goto hard_fail;
+		}
+
+		pr_info("EEH: Notify device drivers "
+			"the completion of reset\n");
+		result = PCI_ERS_RESULT_NONE;
+		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
+	}
+
+	/* All devices should claim they have recovered by now. */
+	if ((result != PCI_ERS_RESULT_RECOVERED) &&
+	    (result != PCI_ERS_RESULT_NONE)) {
+		pr_warning("EEH: Not recovered\n");
+		goto hard_fail;
+	}
+
+	/* Tell all device drivers that they can resume operations */
+	pr_info("EEH: Notify device driver to resume\n");
+	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
+
+	return;
+
+excess_failures:
+	/*
+	 * About 90% of all real-life EEH failures in the field
+	 * are due to poorly seated PCI cards. Only 10% or so are
+	 * due to actual, failed cards.
+	 */
+	pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
+	       "last hour and has been permanently disabled.\n"
+	       "Please try reseating or replacing it.\n",
+		pe->phb->global_number, pe->addr,
+		pe->freeze_count);
+	goto perm_error;
+
+hard_fail:
+	pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
+	       "Please try reseating or replacing it\n",
+		pe->phb->global_number, pe->addr);
+
+perm_error:
+	eeh_slot_error_detail(pe, EEH_LOG_PERM);
+
+	/* Notify all devices that they're about to go down. */
+	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+
+	/* Mark the PE to be removed permanently */
+	pe->freeze_count = EEH_MAX_ALLOWED_FREEZES + 1;
+
+	/*
+	 * Shut down the device drivers for good. We mark
+	 * all removed devices correctly to avoid access
+	 * the their PCI config any more.
+	 */
+	if (frozen_bus) {
+		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+
+		pci_lock_rescan_remove();
+		pcibios_remove_pci_devices(frozen_bus);
+		pci_unlock_rescan_remove();
+	}
+}
+
+static void eeh_handle_special_event(void)
+{
+	struct eeh_pe *pe, *phb_pe;
+	struct pci_bus *bus;
+	struct pci_controller *hose;
+	unsigned long flags;
+	int rc;
+
+
+	do {
+		rc = eeh_ops->next_error(&pe);
+
+		switch (rc) {
+		case EEH_NEXT_ERR_DEAD_IOC:
+			/* Mark all PHBs in dead state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events */
+			eeh_remove_event(NULL, true);
+
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe) continue;
+
+				eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+			}
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_FROZEN_PE:
+		case EEH_NEXT_ERR_FENCED_PHB:
+		case EEH_NEXT_ERR_DEAD_PHB:
+			/* Mark the PE in fenced state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events of the PHB */
+			eeh_remove_event(pe, true);
+
+			if (rc == EEH_NEXT_ERR_DEAD_PHB)
+				eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+			else
+				eeh_pe_state_mark(pe,
+					EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_NONE:
+			return;
+		default:
+			pr_warn("%s: Invalid value %d from next_error()\n",
+				__func__, rc);
+			return;
+		}
+
+		/*
+		 * For fenced PHB and frozen PE, it's handled as normal
+		 * event. We have to remove the affected PHBs for dead
+		 * PHB and IOC
+		 */
+		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
+		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_handle_normal_event(pe);
+			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+		} else {
+			pci_lock_rescan_remove();
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe ||
+				    !(phb_pe->state & EEH_PE_ISOLATED) ||
+				    (phb_pe->state & EEH_PE_RECOVERING))
+					continue;
+
+				/* Notify all devices to be down */
+				bus = eeh_pe_bus_get(phb_pe);
+				eeh_pe_dev_traverse(pe,
+					eeh_report_failure, NULL);
+				pcibios_remove_pci_devices(bus);
+			}
+			pci_unlock_rescan_remove();
+		}
+
+		/*
+		 * If we have detected dead IOC, we needn't proceed
+		 * any more since all PHBs would have been removed
+		 */
+		if (rc == EEH_NEXT_ERR_DEAD_IOC)
+			break;
+	} while (rc != EEH_NEXT_ERR_NONE);
+}
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @pe: EEH PE
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+	if (pe)
+		eeh_handle_normal_event(pe);
+	else
+		eeh_handle_special_event();
+}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
new file mode 100644
index 00000000000..4eefb6e34db
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -0,0 +1,196 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ *  EEH error states may be detected within exception handlers;
+ *  however, the recovery processing needs to occur asynchronously
+ *  in a normal kernel context and not an interrupt context.
+ *  This pair of routines creates an event and queues it onto a
+ *  work-queue, where a worker thread can drive recovery.
+ */
+
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+static struct semaphore eeh_eventlist_sem;
+LIST_HEAD(eeh_eventlist);
+
+/**
+ * eeh_event_handler - Dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it.  The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+	struct eeh_pe *pe;
+
+	while (!kthread_should_stop()) {
+		if (down_interruptible(&eeh_eventlist_sem))
+			break;
+
+		/* Fetch EEH event from the queue */
+		spin_lock_irqsave(&eeh_eventlist_lock, flags);
+		event = NULL;
+		if (!list_empty(&eeh_eventlist)) {
+			event = list_entry(eeh_eventlist.next,
+					   struct eeh_event, list);
+			list_del(&event->list);
+		}
+		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+		if (!event)
+			continue;
+
+		/* We might have event without binding PE */
+		pe = event->pe;
+		if (pe) {
+			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+			if (pe->type & EEH_PE_PHB)
+				pr_info("EEH: Detected error on PHB#%d\n",
+					 pe->phb->global_number);
+			else
+				pr_info("EEH: Detected PCI bus error on "
+					"PHB#%d-PE#%x\n",
+					pe->phb->global_number, pe->addr);
+			eeh_handle_event(pe);
+			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+		} else {
+			eeh_handle_event(NULL);
+		}
+
+		kfree(event);
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_event_init - Start kernel thread to handle EEH events
+ *
+ * This routine is called to start the kernel thread for processing
+ * EEH event.
+ */
+int eeh_event_init(void)
+{
+	struct task_struct *t;
+	int ret = 0;
+
+	/* Initialize semaphore */
+	sema_init(&eeh_eventlist_sem, 0);
+
+	t = kthread_run(eeh_event_handler, NULL, "eehd");
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		pr_err("%s: Failed to start EEH daemon (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_send_failure_event - Generate a PCI error event
+ * @pe: EEH PE
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event(struct eeh_pe *pe)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
+	if (!event) {
+		pr_err("EEH: out of memory, event not handled\n");
+		return -ENOMEM;
+	}
+	event->pe = pe;
+
+	/* We may or may not be called in an interrupt context */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_add(&event->list, &eeh_eventlist);
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	/* For EEH deamon to knick in */
+	up(&eeh_eventlist_sem);
+
+	return 0;
+}
+
+/**
+ * eeh_remove_event - Remove EEH event from the queue
+ * @pe: Event binding to the PE
+ * @force: Event will be removed unconditionally
+ *
+ * On PowerNV platform, we might have subsequent coming events
+ * is part of the former one. For that case, those subsequent
+ * coming events are totally duplicated and unnecessary, thus
+ * they should be removed.
+ */
+void eeh_remove_event(struct eeh_pe *pe, bool force)
+{
+	unsigned long flags;
+	struct eeh_event *event, *tmp;
+
+	/*
+	 * If we have NULL PE passed in, we have dead IOC
+	 * or we're sure we can report all existing errors
+	 * by the caller.
+	 *
+	 * With "force", the event with associated PE that
+	 * have been isolated, the event won't be removed
+	 * to avoid event lost.
+	 */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
+		if (!force && event->pe &&
+		    (event->pe->state & EEH_PE_ISOLATED))
+			continue;
+
+		if (!pe) {
+			list_del(&event->list);
+			kfree(event);
+		} else if (pe->type & EEH_PE_PHB) {
+			if (event->pe && event->pe->phb == pe->phb) {
+				list_del(&event->list);
+				kfree(event);
+			}
+		} else if (event->pe == pe) {
+			list_del(&event->list);
+			kfree(event);
+		}
+	}
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
new file mode 100644
index 00000000000..fbd01eba447
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -0,0 +1,887 @@
+/*
+ * The file intends to implement PE based on the information from
+ * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level
+ * of the tree will be associated to existing PHBs since the particular
+ * PE is only meaningful in one PHB domain.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+static LIST_HEAD(eeh_phb_pe);
+
+/**
+ * eeh_pe_alloc - Allocate PE
+ * @phb: PCI controller
+ * @type: PE type
+ *
+ * Allocate PE instance dynamically.
+ */
+static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate PHB PE */
+	pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL);
+	if (!pe) return NULL;
+
+	/* Initialize PHB PE */
+	pe->type = type;
+	pe->phb = phb;
+	INIT_LIST_HEAD(&pe->child_list);
+	INIT_LIST_HEAD(&pe->child);
+	INIT_LIST_HEAD(&pe->edevs);
+
+	return pe;
+}
+
+/**
+ * eeh_phb_pe_create - Create PHB PE
+ * @phb: PCI controller
+ *
+ * The function should be called while the PHB is detected during
+ * system boot or PCI hotplug in order to create PHB PE.
+ */
+int eeh_phb_pe_create(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate PHB PE */
+	pe = eeh_pe_alloc(phb, EEH_PE_PHB);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* Put it into the list */
+	list_add_tail(&pe->child, &eeh_phb_pe);
+
+	pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
+
+	return 0;
+}
+
+/**
+ * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
+ * @phb: PCI controller
+ *
+ * The overall PEs form hierarchy tree. The first layer of the
+ * hierarchy tree is composed of PHB PEs. The function is used
+ * to retrieve the corresponding PHB PE according to the given PHB.
+ */
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	list_for_each_entry(pe, &eeh_phb_pe, child) {
+		/*
+		 * Actually, we needn't check the type since
+		 * the PE for PHB has been determined when that
+		 * was created.
+		 */
+		if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
+			return pe;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_next - Retrieve the next PE in the tree
+ * @pe: current PE
+ * @root: root PE
+ *
+ * The function is used to retrieve the next PE in the
+ * hierarchy PE tree.
+ */
+static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe,
+				  struct eeh_pe *root)
+{
+	struct list_head *next = pe->child_list.next;
+
+	if (next == &pe->child_list) {
+		while (1) {
+			if (pe == root)
+				return NULL;
+			next = pe->child.next;
+			if (next != &pe->parent->child_list)
+				break;
+			pe = pe->parent;
+		}
+	}
+
+	return list_entry(next, struct eeh_pe, child);
+}
+
+/**
+ * eeh_pe_traverse - Traverse PEs in the specified PHB
+ * @root: root PE
+ * @fn: callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the specified PE and its
+ * child PEs. The traversing is to be terminated once the
+ * callback returns something other than NULL, or no more PEs
+ * to be traversed.
+ */
+void *eeh_pe_traverse(struct eeh_pe *root,
+		      eeh_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	void *ret;
+
+	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+		ret = fn(pe, flag);
+		if (ret) return ret;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_dev_traverse - Traverse the devices from the PE
+ * @root: EEH PE
+ * @fn: function callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the devices of the specified
+ * PE and its child PEs.
+ */
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+		eeh_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev, *tmp;
+	void *ret;
+
+	if (!root) {
+		pr_warning("%s: Invalid PE %p\n", __func__, root);
+		return NULL;
+	}
+
+	/* Traverse root PE */
+	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+		eeh_pe_for_each_dev(pe, edev, tmp) {
+			ret = fn(edev, flag);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * __eeh_pe_get - Check the PE address
+ * @data: EEH PE
+ * @flag: EEH device
+ *
+ * For one particular PE, it can be identified by PE address
+ * or tranditional BDF address. BDF address is composed of
+ * Bus/Device/Function number. The extra data referred by flag
+ * indicates which type of address should be used.
+ */
+static void *__eeh_pe_get(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct eeh_dev *edev = (struct eeh_dev *)flag;
+
+	/* Unexpected PHB PE */
+	if (pe->type & EEH_PE_PHB)
+		return NULL;
+
+	/* We prefer PE address */
+	if (edev->pe_config_addr &&
+	   (edev->pe_config_addr == pe->addr))
+		return pe;
+
+	/* Try BDF address */
+	if (edev->config_addr &&
+	   (edev->config_addr == pe->config_addr))
+		return pe;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_get - Search PE based on the given address
+ * @edev: EEH device
+ *
+ * Search the corresponding PE based on the specified address which
+ * is included in the eeh device. The function is used to check if
+ * the associated PE has been created against the PE address. It's
+ * notable that the PE address has 2 format: traditional PE address
+ * which is composed of PCI bus/device/function number, or unified
+ * PE address.
+ */
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+{
+	struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+	struct eeh_pe *pe;
+
+	pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+
+	return pe;
+}
+
+/**
+ * eeh_pe_get_parent - Retrieve the parent PE
+ * @edev: EEH device
+ *
+ * The whole PEs existing in the system are organized as hierarchy
+ * tree. The function is used to retrieve the parent PE according
+ * to the parent EEH device.
+ */
+static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
+{
+	struct device_node *dn;
+	struct eeh_dev *parent;
+
+	/*
+	 * It might have the case for the indirect parent
+	 * EEH device already having associated PE, but
+	 * the direct parent EEH device doesn't have yet.
+	 */
+	dn = edev->dn->parent;
+	while (dn) {
+		/* We're poking out of PCI territory */
+		if (!PCI_DN(dn)) return NULL;
+
+		parent = of_node_to_eeh_dev(dn);
+		/* We're poking out of PCI territory */
+		if (!parent) return NULL;
+
+		if (parent->pe)
+			return parent->pe;
+
+		dn = dn->parent;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @edev: EEH device
+ *
+ * Add EEH device to the parent PE. If the parent PE already
+ * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
+ * we have to create new PE to hold the EEH device and the new
+ * PE will be linked to its parent PE as well.
+ */
+int eeh_add_to_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent;
+
+	/*
+	 * Search the PE has been existing or not according
+	 * to the PE address. If that has been existing, the
+	 * PE should be composed of PCI bus and its subordinate
+	 * components.
+	 */
+	pe = eeh_pe_get(edev);
+	if (pe && !(pe->type & EEH_PE_INVALID)) {
+		if (!edev->pe_config_addr) {
+			pr_err("%s: PE with addr 0x%x already exists\n",
+				__func__, edev->config_addr);
+			return -EEXIST;
+		}
+
+		/* Mark the PE as type of PCI bus */
+		pe->type = EEH_PE_BUS;
+		edev->pe = pe;
+
+		/* Put the edev to PE */
+		list_add_tail(&edev->list, &pe->edevs);
+		pr_debug("EEH: Add %s to Bus PE#%x\n",
+			edev->dn->full_name, pe->addr);
+
+		return 0;
+	} else if (pe && (pe->type & EEH_PE_INVALID)) {
+		list_add_tail(&edev->list, &pe->edevs);
+		edev->pe = pe;
+		/*
+		 * We're running to here because of PCI hotplug caused by
+		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
+		 */
+		parent = pe;
+		while (parent) {
+			if (!(parent->type & EEH_PE_INVALID))
+				break;
+			parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP);
+			parent = parent->parent;
+		}
+		pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+			edev->dn->full_name, pe->addr, pe->parent->addr);
+
+		return 0;
+	}
+
+	/* Create a new EEH PE */
+	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+	pe->addr	= edev->pe_config_addr;
+	pe->config_addr	= edev->config_addr;
+
+	/*
+	 * While doing PE reset, we probably hot-reset the
+	 * upstream bridge. However, the PCI devices including
+	 * the associated EEH devices might be removed when EEH
+	 * core is doing recovery. So that won't safe to retrieve
+	 * the bridge through downstream EEH device. We have to
+	 * trace the parent PCI bus, then the upstream bridge.
+	 */
+	if (eeh_probe_mode_dev())
+		pe->bus = eeh_dev_to_pci_dev(edev)->bus;
+
+	/*
+	 * Put the new EEH PE into hierarchy tree. If the parent
+	 * can't be found, the newly created PE will be attached
+	 * to PHB directly. Otherwise, we have to associate the
+	 * PE with its parent.
+	 */
+	parent = eeh_pe_get_parent(edev);
+	if (!parent) {
+		parent = eeh_phb_pe_get(edev->phb);
+		if (!parent) {
+			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
+				__func__, edev->phb->global_number);
+			edev->pe = NULL;
+			kfree(pe);
+			return -EEXIST;
+		}
+	}
+	pe->parent = parent;
+
+	/*
+	 * Put the newly created PE into the child list and
+	 * link the EEH device accordingly.
+	 */
+	list_add_tail(&pe->child, &parent->child_list);
+	list_add_tail(&edev->list, &pe->edevs);
+	edev->pe = pe;
+	pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+		edev->dn->full_name, pe->addr, pe->parent->addr);
+
+	return 0;
+}
+
+/**
+ * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * @edev: EEH device
+ *
+ * The PE hierarchy tree might be changed when doing PCI hotplug.
+ * Also, the PCI devices or buses could be removed from the system
+ * during EEH recovery. So we have to call the function remove the
+ * corresponding PE accordingly if necessary.
+ */
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent, *child;
+	int cnt;
+
+	if (!edev->pe) {
+		pr_debug("%s: No PE found for EEH device %s\n",
+			 __func__, edev->dn->full_name);
+		return -EEXIST;
+	}
+
+	/* Remove the EEH device */
+	pe = edev->pe;
+	edev->pe = NULL;
+	list_del(&edev->list);
+
+	/*
+	 * Check if the parent PE includes any EEH devices.
+	 * If not, we should delete that. Also, we should
+	 * delete the parent PE if it doesn't have associated
+	 * child PEs and EEH devices.
+	 */
+	while (1) {
+		parent = pe->parent;
+		if (pe->type & EEH_PE_PHB)
+			break;
+
+		if (!(pe->state & EEH_PE_KEEP)) {
+			if (list_empty(&pe->edevs) &&
+			    list_empty(&pe->child_list)) {
+				list_del(&pe->child);
+				kfree(pe);
+			} else {
+				break;
+			}
+		} else {
+			if (list_empty(&pe->edevs)) {
+				cnt = 0;
+				list_for_each_entry(child, &pe->child_list, child) {
+					if (!(child->type & EEH_PE_INVALID)) {
+						cnt++;
+						break;
+					}
+				}
+
+				if (!cnt)
+					pe->type |= EEH_PE_INVALID;
+				else
+					break;
+			}
+		}
+
+		pe = parent;
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We have time stamp for each PE to trace its time of getting
+ * frozen in last hour. The function should be called to update
+ * the time stamp on first error of the specific PE. On the other
+ * handle, we needn't account for errors happened in last hour.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+	struct timeval tstamp;
+
+	if (!pe) return;
+
+	if (pe->freeze_count <= 0) {
+		pe->freeze_count = 0;
+		do_gettimeofday(&pe->tstamp);
+	} else {
+		do_gettimeofday(&tstamp);
+		if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+			pe->tstamp = tstamp;
+			pe->freeze_count = 0;
+		}
+	}
+}
+
+/**
+ * __eeh_pe_state_mark - Mark the state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to mark the indicated state for the given
+ * PE. Also, the associated PCI devices will be put into IO frozen
+ * state as well.
+ */
+static void *__eeh_pe_state_mark(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int state = *((int *)flag);
+	struct eeh_dev *edev, *tmp;
+	struct pci_dev *pdev;
+
+	/* Keep the state of permanently removed PE intact */
+	if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
+	    (state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
+		return NULL;
+
+	pe->state |= state;
+
+	/* Offline PCI devices if applicable */
+	if (state != EEH_PE_ISOLATED)
+		return NULL;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (pdev)
+			pdev->error_state = pci_channel_io_frozen;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_mark - Mark specified state for PE and its associated device
+ * @pe: EEH PE
+ *
+ * EEH error affects the current PE and its child PEs. The function
+ * is used to mark appropriate state for the affected PEs and the
+ * associated devices.
+ */
+void eeh_pe_state_mark(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+}
+
+static void *__eeh_pe_dev_mode_mark(void *data, void *flag)
+{
+	struct eeh_dev *edev = data;
+	int mode = *((int *)flag);
+
+	edev->mode |= mode;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_dev_state_mark - Mark state for all device under the PE
+ * @pe: EEH PE
+ *
+ * Mark specific state for all child devices of the PE.
+ */
+void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
+{
+	eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode);
+}
+
+/**
+ * __eeh_pe_state_clear - Clear state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to clear the indicated state from the
+ * given PE. Besides, we also clear the check count of the PE
+ * as well.
+ */
+static void *__eeh_pe_state_clear(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int state = *((int *)flag);
+
+	/* Keep the state of permanently removed PE intact */
+	if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
+	    (state & EEH_PE_ISOLATED))
+		return NULL;
+
+	pe->state &= ~state;
+
+	/* Clear check count since last isolation */
+	if (state & EEH_PE_ISOLATED)
+		pe->check_count = 0;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_clear - Clear state for the PE and its children
+ * @pe: PE
+ * @state: state to be cleared
+ *
+ * When the PE and its children has been recovered from error,
+ * we need clear the error state for that. The function is used
+ * for the purpose.
+ */
+void eeh_pe_state_clear(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+}
+
+/*
+ * Some PCI bridges (e.g. PLX bridges) have primary/secondary
+ * buses assigned explicitly by firmware, and we probably have
+ * lost that after reset. So we have to delay the check until
+ * the PCI-CFG registers have been restored for the parent
+ * bridge.
+ *
+ * Don't use normal PCI-CFG accessors, which probably has been
+ * blocked on normal path during the stage. So we need utilize
+ * eeh operations, which is always permitted.
+ */
+static void eeh_bridge_check_link(struct eeh_dev *edev,
+				  struct device_node *dn)
+{
+	int cap;
+	uint32_t val;
+	int timeout = 0;
+
+	/*
+	 * We only check root port and downstream ports of
+	 * PCIe switches
+	 */
+	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
+		return;
+
+	pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
+		 __func__, edev->phb->global_number,
+		 edev->config_addr >> 8,
+		 PCI_SLOT(edev->config_addr & 0xFF),
+		 PCI_FUNC(edev->config_addr & 0xFF));
+
+	/* Check slot status */
+	cap = edev->pcie_cap;
+	eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val);
+	if (!(val & PCI_EXP_SLTSTA_PDS)) {
+		pr_debug("  No card in the slot (0x%04x) !\n", val);
+		return;
+	}
+
+	/* Check power status if we have the capability */
+	eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val);
+	if (val & PCI_EXP_SLTCAP_PCP) {
+		eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val);
+		if (val & PCI_EXP_SLTCTL_PCC) {
+			pr_debug("  In power-off state, power it on ...\n");
+			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
+			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
+			eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val);
+			msleep(2 * 1000);
+		}
+	}
+
+	/* Enable link */
+	eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val);
+	val &= ~PCI_EXP_LNKCTL_LD;
+	eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val);
+
+	/* Check link */
+	eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val);
+	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
+		pr_debug("  No link reporting capability (0x%08x) \n", val);
+		msleep(1000);
+		return;
+	}
+
+	/* Wait the link is up until timeout (5s) */
+	timeout = 0;
+	while (timeout < 5000) {
+		msleep(20);
+		timeout += 20;
+
+		eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val);
+		if (val & PCI_EXP_LNKSTA_DLLLA)
+			break;
+	}
+
+	if (val & PCI_EXP_LNKSTA_DLLLA)
+		pr_debug("  Link up (%s)\n",
+			 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
+	else
+		pr_debug("  Link not ready (0x%04x)\n", val);
+}
+
+#define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF)	(((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+static void eeh_restore_bridge_bars(struct eeh_dev *edev,
+				    struct device_node *dn)
+{
+	int i;
+
+	/*
+	 * Device BARs: 0x10 - 0x18
+	 * Bus numbers and windows: 0x18 - 0x30
+	 */
+	for (i = 4; i < 13; i++)
+		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+	/* Rom: 0x38 */
+	eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]);
+
+	/* Cache line & Latency timer: 0xC 0xD */
+	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+                SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+        eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+                SAVED_BYTE(PCI_LATENCY_TIMER));
+	/* Max latency, min grant, interrupt ping and line: 0x3C */
+	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+	/* PCI Command: 0x4 */
+	eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
+
+	/* Check the PCIe link is ready */
+	eeh_bridge_check_link(edev, dn);
+}
+
+static void eeh_restore_device_bars(struct eeh_dev *edev,
+				    struct device_node *dn)
+{
+	int i;
+	u32 cmd;
+
+	for (i = 4; i < 10; i++)
+		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+	/* 12 == Expansion ROM Address */
+	eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
+
+	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+		SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+	eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+		SAVED_BYTE(PCI_LATENCY_TIMER));
+
+	/* max latency, min grant, interrupt pin and line */
+	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+	/*
+	 * Restore PERR & SERR bits, some devices require it,
+	 * don't touch the other command bits
+	 */
+	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
+	if (edev->config_space[1] & PCI_COMMAND_PARITY)
+		cmd |= PCI_COMMAND_PARITY;
+	else
+		cmd &= ~PCI_COMMAND_PARITY;
+	if (edev->config_space[1] & PCI_COMMAND_SERR)
+		cmd |= PCI_COMMAND_SERR;
+	else
+		cmd &= ~PCI_COMMAND_SERR;
+	eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+
+	/* Do special restore for bridges */
+	if (edev->mode & EEH_DEV_BRIDGE)
+		eeh_restore_bridge_bars(edev, dn);
+	else
+		eeh_restore_device_bars(edev, dn);
+
+	if (eeh_ops->restore_config)
+		eeh_ops->restore_config(dn);
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_restore_bars - Restore the PCI config space info
+ * @pe: EEH PE
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_pe_restore_bars(struct eeh_pe *pe)
+{
+	/*
+	 * We needn't take the EEH lock since eeh_pe_dev_traverse()
+	 * will take that.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
+}
+
+/**
+ * eeh_pe_loc_get - Retrieve location code binding to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the location code of the given PE. If the primary PE bus
+ * is root bus, we will grab location code from PHB device tree node
+ * or root port. Otherwise, the upstream bridge's device tree node
+ * of the primary PE bus will be checked for the location code.
+ */
+const char *eeh_pe_loc_get(struct eeh_pe *pe)
+{
+	struct pci_controller *hose;
+	struct pci_bus *bus = eeh_pe_bus_get(pe);
+	struct pci_dev *pdev;
+	struct device_node *dn;
+	const char *loc;
+
+	if (!bus)
+		return "N/A";
+
+	/* PHB PE or root PE ? */
+	if (pci_is_root_bus(bus)) {
+		hose = pci_bus_to_host(bus);
+		loc = of_get_property(hose->dn,
+				"ibm,loc-code", NULL);
+		if (loc)
+			return loc;
+		loc = of_get_property(hose->dn,
+				"ibm,io-base-loc-code", NULL);
+		if (loc)
+			return loc;
+
+		pdev = pci_get_slot(bus, 0x0);
+	} else {
+		pdev = bus->self;
+	}
+
+	if (!pdev) {
+		loc = "N/A";
+		goto out;
+	}
+
+	dn = pci_device_to_OF_node(pdev);
+	if (!dn) {
+		loc = "N/A";
+		goto out;
+	}
+
+	loc = of_get_property(dn, "ibm,loc-code", NULL);
+	if (!loc)
+		loc = of_get_property(dn, "ibm,slot-location-code", NULL);
+	if (!loc)
+		loc = "N/A";
+
+out:
+	if (pci_is_root_bus(bus) && pdev)
+		pci_dev_put(pdev);
+	return loc;
+}
+
+/**
+ * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the PCI bus according to the given PE. Basically,
+ * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
+ * primary PCI bus will be retrieved. The parent bus will be
+ * returned for BUS PE. However, we don't have associated PCI
+ * bus for DEVICE PE.
+ */
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
+{
+	struct pci_bus *bus = NULL;
+	struct eeh_dev *edev;
+	struct pci_dev *pdev;
+
+	if (pe->type & EEH_PE_PHB) {
+		bus = pe->phb->bus;
+	} else if (pe->type & EEH_PE_BUS ||
+		   pe->type & EEH_PE_DEVICE) {
+		if (pe->bus) {
+			bus = pe->bus;
+			goto out;
+		}
+
+		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (pdev)
+			bus = pdev->bus;
+	}
+
+out:
+	return bus;
+}
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
new file mode 100644
index 00000000000..e2595ba4b72
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -0,0 +1,98 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format)               \
+static ssize_t eeh_show_##_name(struct device *dev,      \
+		struct device_attribute *attr, char *buf)          \
+{                                                        \
+	struct pci_dev *pdev = to_pci_dev(dev);               \
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);      \
+	                                                      \
+	if (!edev)                                            \
+		return 0;                                     \
+	                                                      \
+	return sprintf(buf, _format "\n", edev->_memb);       \
+}                                                        \
+static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
+
+EEH_SHOW_ATTR(eeh_mode,            mode,            "0x%x");
+EEH_SHOW_ATTR(eeh_config_addr,     config_addr,     "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr,  pe_config_addr,  "0x%x");
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+	int rc=0;
+
+	if (!eeh_enabled())
+		return;
+
+	if (edev && (edev->mode & EEH_DEV_SYSFS))
+		return;
+
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+
+	if (rc)
+		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
+	else if (edev)
+		edev->mode |= EEH_DEV_SYSFS;
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+
+	/*
+	 * The parent directory might have been removed. We needn't
+	 * continue for that case.
+	 */
+	if (!pdev->dev.kobj.sd) {
+		if (edev)
+			edev->mode &= ~EEH_DEV_SYSFS;
+		return;
+	}
+
+	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+
+	if (edev)
+		edev->mode &= ~EEH_DEV_SYSFS;
+}
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 4f80cf1ce77..22b45a4955c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -89,6 +89,10 @@ crit_transfer_to_handler:
 	mfspr	r0,SPRN_SRR1
 	stw	r0,_SRR1(r11)
 
+	/* set the stack limit to the current stack
+	 * and set the limit to protect the thread_info
+	 * struct
+	 */
 	mfspr	r8,SPRN_SPRG_THREAD
 	lwz	r0,KSP_LIMIT(r8)
 	stw	r0,SAVED_KSP_LIMIT(r11)
@@ -109,6 +113,10 @@ crit_transfer_to_handler:
 	mfspr	r0,SPRN_SRR1
 	stw	r0,crit_srr1@l(0)
 
+	/* set the stack limit to the current stack
+	 * and set the limit to protect the thread_info
+	 * struct
+	 */
 	mfspr	r8,SPRN_SPRG_THREAD
 	lwz	r0,KSP_LIMIT(r8)
 	stw	r0,saved_ksp_limit@l(0)
@@ -158,7 +166,7 @@ transfer_to_handler:
 	tophys(r11,r11)
 	addi	r11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
-	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r9, r1)
 	lwz	r9,TI_CPU(r9)
 	slwi	r9,r9,3
 	add	r11,r11,r9
@@ -179,7 +187,7 @@ transfer_to_handler:
 	ble-	stack_ovf		/* then the kernel stack overflowed */
 5:
 #if defined(CONFIG_6xx) || defined(CONFIG_E500)
-	rlwinm	r9,r1,0,0,31-THREAD_SHIFT
+	CURRENT_THREAD_INFO(r9, r1)
 	tophys(r9,r9)			/* check local flags */
 	lwz	r12,TI_LOCAL_FLAGS(r9)
 	mtcrf	0x01,r12
@@ -206,40 +214,37 @@ reenable_mmu:				/* re-enable mmu so we can */
 	andi.	r10,r10,MSR_EE		/* Did EE change? */
 	beq	1f
 
-	/* Save handler and return address into the 2 unused words
-	 * of the STACK_FRAME_OVERHEAD (sneak sneak sneak). Everything
-	 * else can be recovered from the pt_regs except r3 which for
-	 * normal interrupts has been set to pt_regs and for syscalls
-	 * is an argument, so we temporarily use ORIG_GPR3 to save it
-	 */
-	stw	r9,8(r1)
-	stw	r11,12(r1)
-	stw	r3,ORIG_GPR3(r1)
 	/*
 	 * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1.
 	 * If from user mode there is only one stack frame on the stack, and
 	 * accessing CALLER_ADDR1 will cause oops. So we need create a dummy
 	 * stack frame to make trace_hardirqs_off happy.
+	 *
+	 * This is handy because we also need to save a bunch of GPRs,
+	 * r3 can be different from GPR3(r1) at this point, r9 and r11
+	 * contains the old MSR and handler address respectively,
+	 * r4 & r5 can contain page fault arguments that need to be passed
+	 * along as well. r12, CCR, CTR, XER etc... are left clobbered as
+	 * they aren't useful past this point (aren't syscall arguments),
+	 * the rest is restored from the exception frame.
 	 */
-	andi.	r12,r12,MSR_PR
-	beq	11f
-	stwu	r1,-16(r1)
-	bl	trace_hardirqs_off
-	addi	r1,r1,16
-	b	12f
-
-11:
+	stwu	r1,-32(r1)
+	stw	r9,8(r1)
+	stw	r11,12(r1)
+	stw	r3,16(r1)
+	stw	r4,20(r1)
+	stw	r5,24(r1)
 	bl	trace_hardirqs_off
-12:
+	lwz	r5,24(r1)
+	lwz	r4,20(r1)
+	lwz	r3,16(r1)
+	lwz	r11,12(r1)
+	lwz	r9,8(r1)
+	addi	r1,r1,32
 	lwz	r0,GPR0(r1)
-	lwz	r3,ORIG_GPR3(r1)
-	lwz	r4,GPR4(r1)
-	lwz	r5,GPR5(r1)
 	lwz	r6,GPR6(r1)
 	lwz	r7,GPR7(r1)
 	lwz	r8,GPR8(r1)
-	lwz	r9,8(r1)
-	lwz	r11,12(r1)
 1:	mtctr	r11
 	mtlr	r9
 	bctr				/* jump to handler */
@@ -330,7 +335,7 @@ _GLOBAL(DoSyscall)
 	mtmsr	r11
 1:
 #endif /* CONFIG_TRACE_IRQFLAGS */
-	rlwinm	r10,r1,0,0,(31-THREAD_SHIFT)	/* current_thread_info() */
+	CURRENT_THREAD_INFO(r10, r1)
 	lwz	r11,TI_FLAGS(r10)
 	andi.	r11,r11,_TIF_SYSCALL_T_OR_A
 	bne-	syscall_dotrace
@@ -351,7 +356,7 @@ ret_from_syscall:
 	bl	do_show_syscall_exit
 #endif
 	mr	r6,r3
-	rlwinm	r12,r1,0,0,(31-THREAD_SHIFT)	/* current_thread_info() */
+	CURRENT_THREAD_INFO(r12, r1)
 	/* disable interrupts so current_thread_info()->flags can't change */
 	LOAD_MSR_KERNEL(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
 	/* Note: We don't bother telling lockdep about it */
@@ -430,6 +435,17 @@ ret_from_fork:
 	li	r3,0
 	b	ret_from_syscall
 
+	.globl	ret_from_kernel_thread
+ret_from_kernel_thread:
+	REST_NVGPRS(r1)
+	bl	schedule_tail
+	mtlr	r14
+	mr	r3,r15
+	PPC440EP_ERR42
+	blrl
+	li	r3,0
+	b	ret_from_syscall
+
 /* Traced system call support */
 syscall_dotrace:
 	SAVE_NVGPRS(r1)
@@ -812,7 +828,7 @@ ret_from_except:
 
 user_exc_return:		/* r10 contains MSR_KERNEL here */
 	/* Check current_thread_info()->flags */
-	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r9, r1)
 	lwz	r9,TI_FLAGS(r9)
 	andi.	r0,r9,_TIF_USER_WORK_MASK
 	bne	do_work
@@ -826,19 +842,56 @@ restore_user:
 	bnel-	load_dbcr0
 #endif
 
-#ifdef CONFIG_PREEMPT
 	b	restore
 
 /* N.B. the only way to get here is from the beq following ret_from_except. */
 resume_kernel:
+	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r8,TI_FLAGS(r9)
+	andis.	r0,r8,_TIF_EMULATE_STACK_STORE@h
+	beq+	1f
+
+	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */
+
+	lwz	r3,GPR1(r1)
+	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
+	mr	r4,r1			/* src:  current exception frame */
+	mr	r1,r3			/* Reroute the trampoline frame to r1 */
+
+	/* Copy from the original to the trampoline. */
+	li	r5,INT_FRAME_SIZE/4	/* size: INT_FRAME_SIZE */
+	li	r6,0			/* start offset: 0 */
+	mtctr	r5
+2:	lwzx	r0,r6,r4
+	stwx	r0,r6,r3
+	addi	r6,r6,4
+	bdnz	2b
+
+	/* Do real store operation to complete stwu */
+	lwz	r5,GPR1(r1)
+	stw	r8,0(r5)
+
+	/* Clear _TIF_EMULATE_STACK_STORE flag */
+	lis	r11,_TIF_EMULATE_STACK_STORE@h
+	addi	r5,r9,TI_FLAGS
+0:	lwarx	r8,0,r5
+	andc	r8,r8,r11
+#ifdef CONFIG_IBM405_ERR77
+	dcbt	0,r5
+#endif
+	stwcx.	r8,0,r5
+	bne-	0b
+1:
+
+#ifdef CONFIG_PREEMPT
 	/* check current_thread_info->preempt_count */
-	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
 	lwz	r0,TI_PREEMPT(r9)
 	cmpwi	0,r0,0		/* if non-zero, just restore regs and return */
 	bne	restore
-	lwz	r0,TI_FLAGS(r9)
-	andi.	r0,r0,_TIF_NEED_RESCHED
+	andi.	r8,r8,_TIF_NEED_RESCHED
 	beq+	restore
+	lwz	r3,_MSR(r1)
 	andi.	r0,r3,MSR_EE	/* interrupts off? */
 	beq	restore		/* don't schedule if so */
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -849,7 +902,7 @@ resume_kernel:
 	bl	trace_hardirqs_off
 #endif
 1:	bl	preempt_schedule_irq
-	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r9, r1)
 	lwz	r3,TI_FLAGS(r9)
 	andi.	r0,r3,_TIF_NEED_RESCHED
 	bne-	1b
@@ -859,8 +912,6 @@ resume_kernel:
 	 */
 	bl	trace_hardirqs_on
 #endif
-#else
-resume_kernel:
 #endif /* CONFIG_PREEMPT */
 
 	/* interrupts are hard-disabled at this point */
@@ -1119,7 +1170,7 @@ ret_from_debug_exc:
 	lwz	r10,SAVED_KSP_LIMIT(r1)
 	stw	r10,KSP_LIMIT(r9)
 	lwz	r9,THREAD_INFO-THREAD(r9)
-	rlwinm	r10,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r10, r1)
 	lwz	r10,TI_PREEMPT(r10)
 	stw	r10,TI_PREEMPT(r9)
 	RESTORE_xSRR(SRR0,SRR1);
@@ -1153,7 +1204,7 @@ load_dbcr0:
 	lis	r11,global_dbcr0@ha
 	addi	r11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
-	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r9, r1)
 	lwz	r9,TI_CPU(r9)
 	slwi	r9,r9,3
 	add	r11,r11,r9
@@ -1194,7 +1245,7 @@ recheck:
 	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
 	SYNC
 	MTMSRD(r10)		/* disable interrupts */
-	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r9, r1)
 	lwz	r9,TI_FLAGS(r9)
 	andi.	r0,r9,_TIF_NEED_RESCHED
 	bne-	do_resched
@@ -1213,7 +1264,7 @@ do_user_signal:			/* r10 contains MSR_KERNEL here */
 	stw	r3,_TRAP(r1)
 2:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	mr	r4,r9
-	bl	do_signal
+	bl	do_notify_resume
 	REST_NVGPRS(r1)
 	b	recheck
 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index d834425186a..6528c5e2cc4 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -32,13 +32,15 @@
 #include <asm/ptrace.h>
 #include <asm/irqflags.h>
 #include <asm/ftrace.h>
+#include <asm/hw_irq.h>
+#include <asm/context_tracking.h>
 
 /*
  * System calls.
  */
 	.section	".toc","aw"
-.SYS_CALL_TABLE:
-	.tc .sys_call_table[TC],.sys_call_table
+SYS_CALL_TABLE:
+	.tc sys_call_table[TC],sys_call_table
 
 /* This value is used to mark exception frames on the stack. */
 exception_marker:
@@ -61,16 +63,11 @@ system_call_common:
 	std	r12,_MSR(r1)
 	std	r0,GPR0(r1)
 	std	r10,GPR1(r1)
+	beq	2f			/* if from kernel mode */
 	ACCOUNT_CPU_USER_ENTRY(r10, r11)
-	/*
-	 * This "crclr so" clears CR0.SO, which is the error indication on
-	 * return from this system call.  There must be no cmp instruction
-	 * between it and the "mfcr r9" below, otherwise if XER.SO is set,
-	 * CR0.SO will get set, causing all system calls to appear to fail.
-	 */
-	crclr	so
-	std	r2,GPR2(r1)
+2:	std	r2,GPR2(r1)
 	std	r3,GPR3(r1)
+	mfcr	r2
 	std	r4,GPR4(r1)
 	std	r5,GPR5(r1)
 	std	r6,GPR6(r1)
@@ -81,85 +78,82 @@ system_call_common:
 	std	r11,GPR10(r1)
 	std	r11,GPR11(r1)
 	std	r11,GPR12(r1)
+	std	r11,_XER(r1)
+	std	r11,_CTR(r1)
 	std	r9,GPR13(r1)
-	mfcr	r9
 	mflr	r10
+	/*
+	 * This clears CR0.SO (bit 28), which is the error indication on
+	 * return from this system call.
+	 */
+	rldimi	r2,r11,28,(63-28)
 	li	r11,0xc01
-	std	r9,_CCR(r1)
 	std	r10,_LINK(r1)
 	std	r11,_TRAP(r1)
-	mfxer	r9
-	mfctr	r10
-	std	r9,_XER(r1)
-	std	r10,_CTR(r1)
 	std	r3,ORIG_GPR3(r1)
+	std	r2,_CCR(r1)
 	ld	r2,PACATOC(r13)
 	addi	r9,r1,STACK_FRAME_OVERHEAD
 	ld	r11,exception_marker@toc(r2)
 	std	r11,-16(r9)		/* "regshere" marker */
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
 BEGIN_FW_FTR_SECTION
 	beq	33f
 	/* if from user, see if there are any DTL entries to process */
 	ld	r10,PACALPPACAPTR(r13)	/* get ptr to VPA */
 	ld	r11,PACA_DTL_RIDX(r13)	/* get log read index */
-	ld	r10,LPPACA_DTLIDX(r10)	/* get log write index */
+	addi	r10,r10,LPPACA_DTLIDX
+	LDX_BE	r10,0,r10		/* get log write index */
 	cmpd	cr1,r11,r10
 	beq+	cr1,33f
-	bl	.accumulate_stolen_time
+	bl	accumulate_stolen_time
 	REST_GPR(0,r1)
 	REST_4GPRS(3,r1)
 	REST_2GPRS(7,r1)
 	addi	r9,r1,STACK_FRAME_OVERHEAD
 33:
 END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */
 
-#ifdef CONFIG_TRACE_IRQFLAGS
-	bl	.trace_hardirqs_on
-	REST_GPR(0,r1)
-	REST_4GPRS(3,r1)
-	REST_2GPRS(7,r1)
-	addi	r9,r1,STACK_FRAME_OVERHEAD
-	ld	r12,_MSR(r1)
-#endif /* CONFIG_TRACE_IRQFLAGS */
-	li	r10,1
-	stb	r10,PACASOFTIRQEN(r13)
-	stb	r10,PACAHARDIRQEN(r13)
-	std	r10,SOFTE(r1)
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	/* Hack for handling interrupts when soft-enabling on iSeries */
-	cmpdi	cr1,r0,0x5555		/* syscall 0x5555 */
-	andi.	r10,r12,MSR_PR		/* from kernel */
-	crand	4*cr0+eq,4*cr1+eq,4*cr0+eq
-	bne	2f
-	b	hardware_interrupt_entry
-2:
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
+	/*
+	 * A syscall should always be called with interrupts enabled
+	 * so we just unconditionally hard-enable here. When some kind
+	 * of irq tracing is used, we additionally check that condition
+	 * is correct
+	 */
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_BUG)
+	lbz	r10,PACASOFTIRQEN(r13)
+	xori	r10,r10,1
+1:	tdnei	r10,0
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
+#endif
 
-	/* Hard enable interrupts */
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	1
 #else
-	mfmsr	r11
+	ld	r11,PACAKMSR(r13)
 	ori	r11,r11,MSR_EE
 	mtmsrd	r11,1
 #endif /* CONFIG_PPC_BOOK3E */
 
+	/* We do need to set SOFTE in the stack frame or the return
+	 * from interrupt will be painful
+	 */
+	li	r10,1
+	std	r10,SOFTE(r1)
+
 #ifdef SHOW_SYSCALLS
-	bl	.do_show_syscall
+	bl	do_show_syscall
 	REST_GPR(0,r1)
 	REST_4GPRS(3,r1)
 	REST_2GPRS(7,r1)
 	addi	r9,r1,STACK_FRAME_OVERHEAD
 #endif
-	clrrdi	r11,r1,THREAD_SHIFT
+	CURRENT_THREAD_INFO(r11, r1)
 	ld	r10,TI_FLAGS(r11)
 	andi.	r11,r10,_TIF_SYSCALL_T_OR_A
-	bne-	syscall_dotrace
-syscall_dotrace_cont:
+	bne	syscall_dotrace
+.Lsyscall_dotrace_cont:
 	cmpldi	0,r0,NR_syscalls
 	bge-	syscall_enosys
 
@@ -168,7 +162,7 @@ system_call:			/* label this so stack traces look sane */
  * Need to vector to 32 Bit or default sys_call_table here,
  * based on caller's run-mode / personality.
  */
-	ld	r11,.SYS_CALL_TABLE@toc(2)
+	ld	r11,SYS_CALL_TABLE@toc(2)
 	andi.	r10,r10,_TIF_32BIT
 	beq	15f
 	addi	r11,r11,8	/* use 32-bit syscall entries */
@@ -180,17 +174,17 @@ system_call:			/* label this so stack traces look sane */
 	clrldi	r8,r8,32
 15:
 	slwi	r0,r0,4
-	ldx	r10,r11,r0	/* Fetch system call handler [ptr] */
-	mtctr   r10
+	ldx	r12,r11,r0	/* Fetch system call handler [ptr] */
+	mtctr   r12
 	bctrl			/* Call handler */
 
 syscall_exit:
 	std	r3,RESULT(r1)
 #ifdef SHOW_SYSCALLS
-	bl	.do_show_syscall_exit
+	bl	do_show_syscall_exit
 	ld	r3,RESULT(r1)
 #endif
-	clrrdi	r12,r1,THREAD_SHIFT
+	CURRENT_THREAD_INFO(r12, r1)
 
 	ld	r8,_MSR(r1)
 #ifdef CONFIG_PPC_BOOK3S
@@ -198,17 +192,24 @@ syscall_exit:
 	andi.	r10,r8,MSR_RI
 	beq-	unrecov_restore
 #endif
-
-	/* Disable interrupts so current_thread_info()->flags can't change,
+	/*
+	 * Disable interrupts so current_thread_info()->flags can't change,
 	 * and so that we don't get interrupted after loading SRR0/1.
 	 */
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	0
 #else
-	mfmsr	r10
-	rldicl	r10,r10,48,1
-	rotldi	r10,r10,16
-	mtmsrd	r10,1
+	ld	r10,PACAKMSR(r13)
+	/*
+	 * For performance reasons we clear RI the same time that we
+	 * clear EE. We only need to clear RI just before we restore r13
+	 * below, but batching it with EE saves us one expensive mtmsrd call.
+	 * We have to be careful to restore RI if we branch anywhere from
+	 * here (eg syscall_exit_work).
+	 */
+	li	r9,MSR_RI
+	andc	r11,r10,r9
+	mtmsrd	r11,1
 #endif /* CONFIG_PPC_BOOK3E */
 
 	ld	r9,TI_FLAGS(r12)
@@ -218,27 +219,17 @@ syscall_exit:
 	cmpld	r3,r11
 	ld	r5,_CCR(r1)
 	bge-	syscall_error
-syscall_error_cont:
+.Lsyscall_error_cont:
 	ld	r7,_NIP(r1)
 BEGIN_FTR_SECTION
 	stdcx.	r0,0,r1			/* to clear the reservation */
 END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 	andi.	r6,r8,MSR_PR
 	ld	r4,_LINK(r1)
-	/*
-	 * Clear RI before restoring r13.  If we are returning to
-	 * userspace and we take an exception after restoring r13,
-	 * we end up corrupting the userspace r13 value.
-	 */
-#ifdef CONFIG_PPC_BOOK3S
-	/* No MSR:RI on BookE */
-	li	r12,MSR_RI
-	andc	r11,r10,r12
-	mtmsrd	r11,1			/* clear MSR.RI */
-#endif /* CONFIG_PPC_BOOK3S */
 
 	beq-	1f
 	ACCOUNT_CPU_USER_EXIT(r11, r12)
+	HMT_MEDIUM_LOW_HAS_PPR
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
 1:	ld	r2,GPR2(r1)
 	ld	r1,GPR1(r1)
@@ -253,13 +244,13 @@ syscall_error:
 	oris	r5,r5,0x1000	/* Set SO bit in CR */
 	neg	r3,r3
 	std	r5,_CCR(r1)
-	b	syscall_error_cont
+	b	.Lsyscall_error_cont
 	
 /* Traced system call support */
 syscall_dotrace:
-	bl	.save_nvgprs
+	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_syscall_trace_enter
+	bl	do_syscall_trace_enter
 	/*
 	 * Restore argument registers possibly just changed.
 	 * We use the return value of do_syscall_trace_enter
@@ -273,15 +264,18 @@ syscall_dotrace:
 	ld	r7,GPR7(r1)
 	ld	r8,GPR8(r1)
 	addi	r9,r1,STACK_FRAME_OVERHEAD
-	clrrdi	r10,r1,THREAD_SHIFT
+	CURRENT_THREAD_INFO(r10, r1)
 	ld	r10,TI_FLAGS(r10)
-	b	syscall_dotrace_cont
+	b	.Lsyscall_dotrace_cont
 
 syscall_enosys:
 	li	r3,-ENOSYS
 	b	syscall_exit
 	
 syscall_exit_work:
+#ifdef CONFIG_PPC_BOOK3S
+	mtmsrd	r10,1		/* Restore RI */
+#endif
 	/* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
 	 If TIF_NOERROR is set, just save r3 as it is. */
 
@@ -312,22 +306,23 @@ syscall_exit_work:
 	subi	r12,r12,TI_FLAGS
 
 4:	/* Anything else left to do? */
+	SET_DEFAULT_THREAD_PPR(r3, r10)		/* Set thread.ppr = 3 */
 	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP)
-	beq	.ret_from_except_lite
+	beq	ret_from_except_lite
 
 	/* Re-enable interrupts */
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	1
 #else
-	mfmsr	r10
+	ld	r10,PACAKMSR(r13)
 	ori	r10,r10,MSR_EE
 	mtmsrd	r10,1
 #endif /* CONFIG_PPC_BOOK3E */
 
-	bl	.save_nvgprs
+	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_syscall_trace_leave
-	b	.ret_from_except
+	bl	do_syscall_trace_leave
+	b	ret_from_except
 
 /* Save non-volatile GPRs, if not already saved. */
 _GLOBAL(save_nvgprs)
@@ -350,36 +345,48 @@ _GLOBAL(save_nvgprs)
  */
 
 _GLOBAL(ppc_fork)
-	bl	.save_nvgprs
-	bl	.sys_fork
+	bl	save_nvgprs
+	bl	sys_fork
 	b	syscall_exit
 
 _GLOBAL(ppc_vfork)
-	bl	.save_nvgprs
-	bl	.sys_vfork
+	bl	save_nvgprs
+	bl	sys_vfork
 	b	syscall_exit
 
 _GLOBAL(ppc_clone)
-	bl	.save_nvgprs
-	bl	.sys_clone
+	bl	save_nvgprs
+	bl	sys_clone
 	b	syscall_exit
 
 _GLOBAL(ppc32_swapcontext)
-	bl	.save_nvgprs
-	bl	.compat_sys_swapcontext
+	bl	save_nvgprs
+	bl	compat_sys_swapcontext
 	b	syscall_exit
 
 _GLOBAL(ppc64_swapcontext)
-	bl	.save_nvgprs
-	bl	.sys_swapcontext
+	bl	save_nvgprs
+	bl	sys_swapcontext
 	b	syscall_exit
 
 _GLOBAL(ret_from_fork)
-	bl	.schedule_tail
+	bl	schedule_tail
 	REST_NVGPRS(r1)
 	li	r3,0
 	b	syscall_exit
 
+_GLOBAL(ret_from_kernel_thread)
+	bl	schedule_tail
+	REST_NVGPRS(r1)
+	mtlr	r14
+	mr	r3,r15
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+	mr	r12,r14
+#endif
+	blrl
+	li	r3,0
+	b	syscall_exit
+
 /*
  * This routine switches between two different tasks.  The process
  * state of one is saved on its kernel stack.  Then the state
@@ -421,12 +428,6 @@ BEGIN_FTR_SECTION
 	std	r24,THREAD_VRSAVE(r3)
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif /* CONFIG_ALTIVEC */
-#ifdef CONFIG_PPC64
-BEGIN_FTR_SECTION
-	mfspr	r25,SPRN_DSCR
-	std	r25,THREAD_DSCR(r3)
-END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
-#endif
 	and.	r0,r0,r22
 	beq+	1f
 	andc	r22,r22,r0
@@ -437,6 +438,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
 	std	r23,_CCR(r1)
 	std	r1,KSP(r3)	/* Set old stack pointer */
 
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+	/* Event based branch registers */
+	mfspr	r0, SPRN_BESCR
+	std	r0, THREAD_BESCR(r3)
+	mfspr	r0, SPRN_EBBHR
+	std	r0, THREAD_EBBHR(r3)
+	mfspr	r0, SPRN_EBBRR
+	std	r0, THREAD_EBBRR(r3)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+#endif
+
 #ifdef CONFIG_SMP
 	/* We need a sync somewhere here to make sure that if the
 	 * previous task gets rescheduled on another CPU, it sees all
@@ -456,6 +469,13 @@ BEGIN_FTR_SECTION
 	ldarx	r6,0,r1
 END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
 
+#ifdef CONFIG_PPC_BOOK3S
+/* Cancel all explict user streams as they will have no use after context
+ * switch and will stop the HW from creating streams itself
+ */
+	DCBT_STOP_ALL_STREAM_IDS(r6)
+#endif
+
 	addi	r6,r4,-THREAD	/* Convert THREAD to 'current' */
 	std	r6,PACACURRENT(r13)	/* Set new 'current' */
 
@@ -493,9 +513,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	 */
 	ld	r9,PACA_SLBSHADOWPTR(r13)
 	li	r12,0
-	std	r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */
-	std	r7,SLBSHADOW_STACKVSID(r9)  /* Save VSID */
-	std	r0,SLBSHADOW_STACKESID(r9)  /* Save ESID */
+	std	r12,SLBSHADOW_STACKESID(r9)	/* Clear ESID */
+	li	r12,SLBSHADOW_STACKVSID
+	STDX_BE	r7,r12,r9			/* Save VSID */
+	li	r12,SLBSHADOW_STACKESID
+	STDX_BE	r0,r12,r9			/* Save ESID */
 
 	/* No need to check for MMU_FTR_NO_SLBIE_B here, since when
 	 * we have 1TB segments, the only CPUs known to have the errata
@@ -510,7 +532,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 2:
 #endif /* !CONFIG_PPC_BOOK3S */
 
-	clrrdi	r7,r8,THREAD_SHIFT	/* base of new stack */
+	CURRENT_THREAD_INFO(r7, r8)  /* base of new stack */
 	/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
 	   because we don't need to leave the 288-byte ABI gap at the
 	   top of the kernel stack. */
@@ -519,8 +541,20 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	mr	r1,r8		/* start using new stack pointer */
 	std	r7,PACAKSAVE(r13)
 
-	ld	r6,_CCR(r1)
-	mtcrf	0xFF,r6
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+	/* Event based branch registers */
+	ld	r0, THREAD_BESCR(r4)
+	mtspr	SPRN_BESCR, r0
+	ld	r0, THREAD_EBBHR(r4)
+	mtspr	SPRN_EBBHR, r0
+	ld	r0, THREAD_EBBRR(r4)
+	mtspr	SPRN_EBBRR, r0
+
+	ld	r0,THREAD_TAR(r4)
+	mtspr	SPRN_TAR,r0
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+#endif
 
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
@@ -530,14 +564,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_PPC64
 BEGIN_FTR_SECTION
+	lwz	r6,THREAD_DSCR_INHERIT(r4)
 	ld	r0,THREAD_DSCR(r4)
+	cmpwi	r6,0
+	bne	1f
+	ld	r0,PACA_DSCR(r13)
+1:
+BEGIN_FTR_SECTION_NESTED(70)
+	mfspr	r8, SPRN_FSCR
+	rldimi	r8, r6, FSCR_DSCR_LG, (63 - FSCR_DSCR_LG)
+	mtspr	SPRN_FSCR, r8
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_207S, CPU_FTR_ARCH_207S, 70)
 	cmpd	r0,r25
-	beq	1f
+	beq	2f
 	mtspr	SPRN_DSCR,r0
-1:	
+2:
 END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
 #endif
 
+	ld	r6,_CCR(r1)
+	mtcrf	0xFF,r6
+
 	/* r3-r13 are destroyed -- Cort */
 	REST_8GPRS(14, r1)
 	REST_10GPRS(22, r1)
@@ -553,7 +600,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
 _GLOBAL(ret_from_except)
 	ld	r11,_TRAP(r1)
 	andi.	r0,r11,1
-	bne	.ret_from_except_lite
+	bne	ret_from_except_lite
 	REST_NVGPRS(r1)
 
 _GLOBAL(ret_from_except_lite)
@@ -565,51 +612,197 @@ _GLOBAL(ret_from_except_lite)
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	0
 #else
-	mfmsr	r10		/* Get current interrupt state */
-	rldicl	r9,r10,48,1	/* clear MSR_EE */
-	rotldi	r9,r9,16
-	mtmsrd	r9,1		/* Update machine state */
+	ld	r10,PACAKMSR(r13) /* Get kernel MSR without EE */
+	mtmsrd	r10,1		  /* Update machine state */
 #endif /* CONFIG_PPC_BOOK3E */
 
-#ifdef CONFIG_PREEMPT
-	clrrdi	r9,r1,THREAD_SHIFT	/* current_thread_info() */
-	li	r0,_TIF_NEED_RESCHED	/* bits to check */
+	CURRENT_THREAD_INFO(r9, r1)
 	ld	r3,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3E
+	ld	r10,PACACURRENT(r13)
+#endif /* CONFIG_PPC_BOOK3E */
 	ld	r4,TI_FLAGS(r9)
-	/* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */
-	rlwimi	r0,r3,32+TIF_SIGPENDING-MSR_PR_LG,_TIF_SIGPENDING
-	and.	r0,r4,r0	/* check NEED_RESCHED and maybe SIGPENDING */
-	bne	do_work
-
-#else /* !CONFIG_PREEMPT */
-	ld	r3,_MSR(r1)	/* Returning to user mode? */
 	andi.	r3,r3,MSR_PR
-	beq	restore		/* if not, just restore regs and return */
+	beq	resume_kernel
+#ifdef CONFIG_PPC_BOOK3E
+	lwz	r3,(THREAD+THREAD_DBCR0)(r10)
+#endif /* CONFIG_PPC_BOOK3E */
 
 	/* Check current_thread_info()->flags */
-	clrrdi	r9,r1,THREAD_SHIFT
-	ld	r4,TI_FLAGS(r9)
 	andi.	r0,r4,_TIF_USER_WORK_MASK
-	bne	do_work
+#ifdef CONFIG_PPC_BOOK3E
+	bne	1f
+	/*
+	 * Check to see if the dbcr0 register is set up to debug.
+	 * Use the internal debug mode bit to do this.
+	 */
+	andis.	r0,r3,DBCR0_IDM@h
+	beq	restore
+	mfmsr	r0
+	rlwinm	r0,r0,0,~MSR_DE	/* Clear MSR.DE */
+	mtmsr	r0
+	mtspr	SPRN_DBCR0,r3
+	li	r10, -1
+	mtspr	SPRN_DBSR,r10
+	b	restore
+#else
+	beq	restore
 #endif
+1:	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq	2f
+	bl	restore_interrupts
+	SCHEDULE_USER
+	b	ret_from_except_lite
+2:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	andi.	r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
+	bne	3f		/* only restore TM if nothing else to do */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_tm_state
+	b	restore
+3:
+#endif
+	bl	save_nvgprs
+	bl	restore_interrupts
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_notify_resume
+	b	ret_from_except
+
+resume_kernel:
+	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
+	andis.	r8,r4,_TIF_EMULATE_STACK_STORE@h
+	beq+	1f
+
+	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */
+
+	lwz	r3,GPR1(r1)
+	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
+	mr	r4,r1			/* src:  current exception frame */
+	mr	r1,r3			/* Reroute the trampoline frame to r1 */
+
+	/* Copy from the original to the trampoline. */
+	li	r5,INT_FRAME_SIZE/8	/* size: INT_FRAME_SIZE */
+	li	r6,0			/* start offset: 0 */
+	mtctr	r5
+2:	ldx	r0,r6,r4
+	stdx	r0,r6,r3
+	addi	r6,r6,8
+	bdnz	2b
+
+	/* Do real store operation to complete stwu */
+	lwz	r5,GPR1(r1)
+	std	r8,0(r5)
+
+	/* Clear _TIF_EMULATE_STACK_STORE flag */
+	lis	r11,_TIF_EMULATE_STACK_STORE@h
+	addi	r5,r9,TI_FLAGS
+0:	ldarx	r4,0,r5
+	andc	r4,r4,r11
+	stdcx.	r4,0,r5
+	bne-	0b
+1:
+
+#ifdef CONFIG_PREEMPT
+	/* Check if we need to preempt */
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq+	restore
+	/* Check that preempt_count() == 0 and interrupts are enabled */
+	lwz	r8,TI_PREEMPT(r9)
+	cmpwi	cr1,r8,0
+	ld	r0,SOFTE(r1)
+	cmpdi	r0,0
+	crandc	eq,cr1*4+eq,eq
+	bne	restore
+
+	/*
+	 * Here we are preempting the current task. We want to make
+	 * sure we are soft-disabled first and reconcile irq state.
+	 */
+	RECONCILE_IRQ_STATE(r3,r4)
+1:	bl	preempt_schedule_irq
+
+	/* Re-test flags and eventually loop */
+	CURRENT_THREAD_INFO(r9, r1)
+	ld	r4,TI_FLAGS(r9)
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	bne	1b
+
+	/*
+	 * arch_local_irq_restore() from preempt_schedule_irq above may
+	 * enable hard interrupt but we really should disable interrupts
+	 * when we return from the interrupt, and so that we don't get
+	 * interrupted after loading SRR0/1.
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	0
+#else
+	ld	r10,PACAKMSR(r13) /* Get kernel MSR without EE */
+	mtmsrd	r10,1		  /* Update machine state */
+#endif /* CONFIG_PPC_BOOK3E */
+#endif /* CONFIG_PREEMPT */
 
+	.globl	fast_exc_return_irq
+fast_exc_return_irq:
 restore:
-BEGIN_FW_FTR_SECTION
+	/*
+	 * This is the main kernel exit path. First we check if we
+	 * are about to re-enable interrupts
+	 */
 	ld	r5,SOFTE(r1)
-FW_FTR_SECTION_ELSE
-	b	.Liseries_check_pending_irqs
-ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
-2:
-	TRACE_AND_RESTORE_IRQ(r5);
+	lbz	r6,PACASOFTIRQEN(r13)
+	cmpwi	cr0,r5,0
+	beq	restore_irq_off
 
-	/* extract EE bit and use it to restore paca->hard_enabled */
-	ld	r3,_MSR(r1)
-	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */
-	stb	r4,PACAHARDIRQEN(r13)
+	/* We are enabling, were we already enabled ? Yes, just return */
+	cmpwi	cr0,r6,1
+	beq	cr0,do_restore
 
+	/*
+	 * We are about to soft-enable interrupts (we are hard disabled
+	 * at this point). We check if there's anything that needs to
+	 * be replayed first.
+	 */
+	lbz	r0,PACAIRQHAPPENED(r13)
+	cmpwi	cr0,r0,0
+	bne-	restore_check_irq_replay
+
+	/*
+	 * Get here when nothing happened while soft-disabled, just
+	 * soft-enable and move-on. We will hard-enable as a side
+	 * effect of rfi
+	 */
+restore_no_replay:
+	TRACE_ENABLE_INTS
+	li	r0,1
+	stb	r0,PACASOFTIRQEN(r13);
+
+	/*
+	 * Final return path. BookE is handled in a different file
+	 */
+do_restore:
 #ifdef CONFIG_PPC_BOOK3E
-	b	.exception_return_book3e
+	b	exception_return_book3e
 #else
+	/*
+	 * Clear the reservation. If we know the CPU tracks the address of
+	 * the reservation then we can potentially save some cycles and use
+	 * a larx. On POWER6 and POWER7 this is significantly faster.
+	 */
+BEGIN_FTR_SECTION
+	stdcx.	r0,0,r1		/* to clear the reservation */
+FTR_SECTION_ELSE
+	ldarx	r4,0,r1
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+
+	/*
+	 * Some code path such as load_up_fpu or altivec return directly
+	 * here. They run entirely hard disabled and do not alter the
+	 * interrupt state. They also don't use lwarx/stwcx. and thus
+	 * are known not to leave dangling reservations.
+	 */
+	.globl	fast_exception_return
+fast_exception_return:
+	ld	r3,_MSR(r1)
 	ld	r4,_CTR(r1)
 	ld	r0,_LINK(r1)
 	mtctr	r4
@@ -622,32 +815,35 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 	andi.	r0,r3,MSR_RI
 	beq-	unrecov_restore
 
-	/*
-	 * Clear the reservation. If we know the CPU tracks the address of
-	 * the reservation then we can potentially save some cycles and use
-	 * a larx. On POWER6 and POWER7 this is significantly faster.
-	 */
+	/* Load PPR from thread struct before we clear MSR:RI */
 BEGIN_FTR_SECTION
-	stdcx.	r0,0,r1		/* to clear the reservation */
-FTR_SECTION_ELSE
-	ldarx	r4,0,r1
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+	ld	r2,PACACURRENT(r13)
+	ld	r2,TASKTHREADPPR(r2)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	/*
 	 * Clear RI before restoring r13.  If we are returning to
 	 * userspace and we take an exception after restoring r13,
 	 * we end up corrupting the userspace r13 value.
 	 */
-	mfmsr	r4
-	andc	r4,r4,r0	/* r0 contains MSR_RI here */
+	ld	r4,PACAKMSR(r13) /* Get kernel MSR without EE */
+	andc	r4,r4,r0	 /* r0 contains MSR_RI here */
 	mtmsrd	r4,1
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* TM debug */
+	std	r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */
+#endif
 	/*
 	 * r13 is our per cpu area, only restore it if we are returning to
-	 * userspace
+	 * userspace the value stored in the stack frame may belong to
+	 * another CPU.
 	 */
 	andi.	r0,r3,MSR_PR
 	beq	1f
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PPR,r2	/* Restore PPR */
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ACCOUNT_CPU_USER_EXIT(r2, r4)
 	REST_GPR(13, r1)
 1:
@@ -669,99 +865,86 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 
 #endif /* CONFIG_PPC_BOOK3E */
 
-.Liseries_check_pending_irqs:
-#ifdef CONFIG_PPC_ISERIES
-	ld	r5,SOFTE(r1)
-	cmpdi	0,r5,0
-	beq	2b
-	/* Check for pending interrupts (iSeries) */
-	ld	r3,PACALPPACAPTR(r13)
-	ld	r3,LPPACAANYINT(r3)
-	cmpdi	r3,0
-	beq+	2b			/* skip do_IRQ if no interrupts */
-
-	li	r3,0
-	stb	r3,PACASOFTIRQEN(r13)	/* ensure we are soft-disabled */
-#ifdef CONFIG_TRACE_IRQFLAGS
-	bl	.trace_hardirqs_off
-	mfmsr	r10
-#endif
-	ori	r10,r10,MSR_EE
-	mtmsrd	r10			/* hard-enable again */
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_IRQ
-	b	.ret_from_except_lite		/* loop back and handle more */
-#endif
-
-do_work:
-#ifdef CONFIG_PREEMPT
-	andi.	r0,r3,MSR_PR	/* Returning to user mode? */
-	bne	user_work
-	/* Check that preempt_count() == 0 and interrupts are enabled */
-	lwz	r8,TI_PREEMPT(r9)
-	cmpwi	cr1,r8,0
-	ld	r0,SOFTE(r1)
-	cmpdi	r0,0
-	crandc	eq,cr1*4+eq,eq
-	bne	restore
-
-	/* Here we are preempting the current task.
+	/*
+	 * We are returning to a context with interrupts soft disabled.
 	 *
-	 * Ensure interrupts are soft-disabled. We also properly mark
-	 * the PACA to reflect the fact that they are hard-disabled
-	 * and trace the change
+	 * However, we may also about to hard enable, so we need to
+	 * make sure that in this case, we also clear PACA_IRQ_HARD_DIS
+	 * or that bit can get out of sync and bad things will happen
 	 */
-	li	r0,0
-	stb	r0,PACASOFTIRQEN(r13)
-	stb	r0,PACAHARDIRQEN(r13)
+restore_irq_off:
+	ld	r3,_MSR(r1)
+	lbz	r7,PACAIRQHAPPENED(r13)
+	andi.	r0,r3,MSR_EE
+	beq	1f
+	rlwinm	r7,r7,0,~PACA_IRQ_HARD_DIS
+	stb	r7,PACAIRQHAPPENED(r13)
+1:	li	r0,0
+	stb	r0,PACASOFTIRQEN(r13);
 	TRACE_DISABLE_INTS
+	b	do_restore
 
-	/* Call the scheduler with soft IRQs off */
-1:	bl	.preempt_schedule_irq
-
-	/* Hard-disable interrupts again (and update PACA) */
-#ifdef CONFIG_PPC_BOOK3E
-	wrteei	0
-#else
-	mfmsr	r10
-	rldicl	r10,r10,48,1
-	rotldi	r10,r10,16
-	mtmsrd	r10,1
-#endif /* CONFIG_PPC_BOOK3E */
-	li	r0,0
-	stb	r0,PACAHARDIRQEN(r13)
-
-	/* Re-test flags and eventually loop */
-	clrrdi	r9,r1,THREAD_SHIFT
-	ld	r4,TI_FLAGS(r9)
-	andi.	r0,r4,_TIF_NEED_RESCHED
-	bne	1b
-	b	restore
-
-user_work:
-#endif /* CONFIG_PREEMPT */
+	/*
+	 * Something did happen, check if a re-emit is needed
+	 * (this also clears paca->irq_happened)
+	 */
+restore_check_irq_replay:
+	/* XXX: We could implement a fast path here where we check
+	 * for irq_happened being just 0x01, in which case we can
+	 * clear it and return. That means that we would potentially
+	 * miss a decrementer having wrapped all the way around.
+	 *
+	 * Still, this might be useful for things like hash_page
+	 */
+	bl	__check_irq_replay
+	cmpwi	cr0,r3,0
+ 	beq	restore_no_replay
+ 
+	/*
+	 * We need to re-emit an interrupt. We do so by re-using our
+	 * existing exception frame. We first change the trap value,
+	 * but we need to ensure we preserve the low nibble of it
+	 */
+	ld	r4,_TRAP(r1)
+	clrldi	r4,r4,60
+	or	r4,r4,r3
+	std	r4,_TRAP(r1)
 
-	/* Enable interrupts */
+	/*
+	 * Then find the right handler and call it. Interrupts are
+	 * still soft-disabled and we keep them that way.
+	*/
+	cmpwi	cr0,r3,0x500
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+ 	bl	do_IRQ
+	b	ret_from_except
+1:	cmpwi	cr0,r3,0x900
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	bl	timer_interrupt
+	b	ret_from_except
+#ifdef CONFIG_PPC_DOORBELL
+1:
 #ifdef CONFIG_PPC_BOOK3E
-	wrteei	1
+	cmpwi	cr0,r3,0x280
 #else
-	ori	r10,r10,MSR_EE
-	mtmsrd	r10,1
+	BEGIN_FTR_SECTION
+		cmpwi	cr0,r3,0xe80
+	FTR_SECTION_ELSE
+		cmpwi	cr0,r3,0xa00
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 #endif /* CONFIG_PPC_BOOK3E */
-
-	andi.	r0,r4,_TIF_NEED_RESCHED
-	beq	1f
-	bl	.schedule
-	b	.ret_from_except_lite
-
-1:	bl	.save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_signal
-	b	.ret_from_except
-
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	bl	doorbell_exception
+	b	ret_from_except
+#endif /* CONFIG_PPC_DOORBELL */
+1:	b	ret_from_except /* What else to do here ? */
+ 
 unrecov_restore:
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.unrecoverable_exception
+	bl	unrecoverable_exception
 	b	unrecov_restore
 
 #ifdef CONFIG_PPC_RTAS
@@ -827,7 +1010,7 @@ _GLOBAL(enter_rtas)
         std	r6,PACASAVEDMSR(r13)
 
 	/* Setup our real return addr */	
-	LOAD_REG_ADDR(r4,.rtas_return_loc)
+	LOAD_REG_ADDR(r4,rtas_return_loc)
 	clrldi	r4,r4,2			/* convert to realmode address */
        	mtlr	r4
 
@@ -837,7 +1020,7 @@ _GLOBAL(enter_rtas)
 	
         li      r9,1
         rldicr  r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
-	ori	r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI
+	ori	r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE
 	andc	r6,r0,r9
 	sync				/* disable interrupts so SRR0/1 */
 	mtmsrd	r0			/* don't get trashed */
@@ -851,14 +1034,16 @@ _GLOBAL(enter_rtas)
 	rfid
 	b	.	/* prevent speculative execution */
 
-_STATIC(rtas_return_loc)
+rtas_return_loc:
+	FIXUP_ENDIAN
+
 	/* relocation is off at this point */
 	GET_PACA(r4)
 	clrldi	r4,r4,2			/* convert to realmode address */
 
 	bcl	20,31,$+4
 0:	mflr	r3
-	ld	r3,(1f-0b)(r3)		/* get &.rtas_restore_regs */
+	ld	r3,(1f-0b)(r3)		/* get &rtas_restore_regs */
 
 	mfmsr   r6
 	li	r0,MSR_RI
@@ -875,9 +1060,9 @@ _STATIC(rtas_return_loc)
 	b	.	/* prevent speculative execution */
 
 	.align	3
-1:	.llong	.rtas_restore_regs
+1:	.llong	rtas_restore_regs
 
-_STATIC(rtas_restore_regs)
+rtas_restore_regs:
 	/* relocation is on at this point */
 	REST_GPR(2, r1)			/* Restore the TOC */
 	REST_GPR(13, r1)		/* Restore paca */
@@ -923,28 +1108,30 @@ _GLOBAL(enter_prom)
 	std	r10,_CCR(r1)
 	std	r11,_MSR(r1)
 
-	/* Get the PROM entrypoint */
-	mtlr	r4
+	/* Put PROM address in SRR0 */
+	mtsrr0	r4
 
-	/* Switch MSR to 32 bits mode
+	/* Setup our trampoline return addr in LR */
+	bcl	20,31,$+4
+0:	mflr	r4
+	addi	r4,r4,(1f - 0b)
+       	mtlr	r4
+
+	/* Prepare a 32-bit mode big endian MSR
 	 */
 #ifdef CONFIG_PPC_BOOK3E
 	rlwinm	r11,r11,0,1,31
-	mtmsr	r11
+	mtsrr1	r11
+	rfi
 #else /* CONFIG_PPC_BOOK3E */
-        mfmsr   r11
-        li      r12,1
-        rldicr  r12,r12,MSR_SF_LG,(63-MSR_SF_LG)
-        andc    r11,r11,r12
-        li      r12,1
-        rldicr  r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG)
-        andc    r11,r11,r12
-        mtmsrd  r11
+	LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)
+	andc	r11,r11,r12
+	mtsrr1	r11
+	rfid
 #endif /* CONFIG_PPC_BOOK3E */
-        isync
 
-	/* Enter PROM here... */
-	blrl
+1:	/* Return from OF */
+	FIXUP_ENDIAN
 
 	/* Just make sure that r1 top 32 bits didn't get
 	 * corrupt by OF
@@ -975,7 +1162,7 @@ _GLOBAL(mcount)
 _GLOBAL(_mcount)
 	blr
 
-_GLOBAL(ftrace_caller)
+_GLOBAL_TOC(ftrace_caller)
 	/* Taken from output of objdump from lib64/glibc */
 	mflr	r3
 	ld	r11, 0(r1)
@@ -999,10 +1186,7 @@ _GLOBAL(ftrace_graph_stub)
 _GLOBAL(ftrace_stub)
 	blr
 #else
-_GLOBAL(mcount)
-	blr
-
-_GLOBAL(_mcount)
+_GLOBAL_TOC(_mcount)
 	/* Taken from output of objdump from lib64/glibc */
 	mflr	r3
 	ld	r11, 0(r1)
@@ -1040,7 +1224,7 @@ _GLOBAL(ftrace_graph_caller)
 	ld	r11, 112(r1)
 	addi	r3, r11, 16
 
-	bl	.prepare_ftrace_return
+	bl	prepare_ftrace_return
 	nop
 
 	ld	r0, 128(r1)
@@ -1056,7 +1240,7 @@ _GLOBAL(return_to_handler)
 	mr	r31, r1
 	stdu	r1, -112(r1)
 
-	bl	.ftrace_return_to_handler
+	bl	ftrace_return_to_handler
 	nop
 
 	/* return value has real return address */
@@ -1086,7 +1270,7 @@ _GLOBAL(mod_return_to_handler)
 	 */
 	ld	r2, PACATOC(r13)
 
-	bl	.ftrace_return_to_handler
+	bl	ftrace_return_to_handler
 	nop
 
 	/* return value has real return address */
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
new file mode 100644
index 00000000000..9f1ebf7338f
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+
+#ifndef CONFIG_PPC64
+/* epapr_ev_idle() was derived from e500_idle() */
+_GLOBAL(epapr_ev_idle)
+	CURRENT_THREAD_INFO(r3, r1)
+	PPC_LL	r4, TI_LOCAL_FLAGS(r3)	/* set napping bit */
+	ori	r4, r4,_TLF_NAPPING	/* so when we take an exception */
+	PPC_STL	r4, TI_LOCAL_FLAGS(r3)	/* it will return to our caller */
+
+	wrteei	1
+
+idle_loop:
+	LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+
+	/*
+	 * Guard against spurious wakeups from a hypervisor --
+	 * only interrupt will cause us to return to LR due to
+	 * _TLF_NAPPING.
+	 */
+	b	idle_loop
+#endif
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+.global epapr_hypercall_start
+epapr_hypercall_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+	blr
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
new file mode 100644
index 00000000000..59e4ba74975
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,85 @@
+/*
+ * ePAPR para-virtualization support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/machdep.h>
+
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+extern void epapr_ev_idle(void);
+extern u32 epapr_ev_idle_start[];
+#endif
+
+bool epapr_paravirt_enabled;
+static bool __maybe_unused epapr_has_idle;
+
+static int __init early_init_dt_scan_epapr(unsigned long node,
+					   const char *uname,
+					   int depth, void *data)
+{
+	const u32 *insts;
+	int len;
+	int i;
+
+	insts = of_get_flat_dt_prop(node, "hcall-instructions", &len);
+	if (!insts)
+		return 0;
+
+	if (len % 4 || len > (4 * 4))
+		return -1;
+
+	for (i = 0; i < (len / 4); i++) {
+		u32 inst = be32_to_cpu(insts[i]);
+		patch_instruction(epapr_hypercall_start + i, inst);
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+		patch_instruction(epapr_ev_idle_start + i, inst);
+#endif
+	}
+
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+	if (of_get_flat_dt_prop(node, "has-idle", NULL))
+		epapr_has_idle = true;
+#endif
+
+	epapr_paravirt_enabled = true;
+
+	return 1;
+}
+
+int __init epapr_paravirt_early_init(void)
+{
+	of_scan_flat_dt(early_init_dt_scan_epapr, NULL);
+
+	return 0;
+}
+
+static int __init epapr_idle_init(void)
+{
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+	if (epapr_has_idle)
+		ppc_md.power_save = epapr_ev_idle;
+#endif
+
+	return 0;
+}
+
+postcore_initcall(epapr_idle_init);
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 429983c06f9..bb9cac6c805 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -24,6 +24,9 @@
 #include <asm/ptrace.h>
 #include <asm/ppc-opcode.h>
 #include <asm/mmu.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
 
 /* XXX This will ultimately add space for a special exception save
  *     structure used to save things like SRR0/SRR1, SPRGs, MAS, etc...
@@ -31,19 +34,263 @@
  *     special interrupts from within a non-standard level will probably
  *     blow you up
  */
-#define	SPECIAL_EXC_FRAME_SIZE	INT_FRAME_SIZE
+#define SPECIAL_EXC_SRR0	0
+#define SPECIAL_EXC_SRR1	1
+#define SPECIAL_EXC_SPRG_GEN	2
+#define SPECIAL_EXC_SPRG_TLB	3
+#define SPECIAL_EXC_MAS0	4
+#define SPECIAL_EXC_MAS1	5
+#define SPECIAL_EXC_MAS2	6
+#define SPECIAL_EXC_MAS3	7
+#define SPECIAL_EXC_MAS6	8
+#define SPECIAL_EXC_MAS7	9
+#define SPECIAL_EXC_MAS5	10	/* E.HV only */
+#define SPECIAL_EXC_MAS8	11	/* E.HV only */
+#define SPECIAL_EXC_IRQHAPPENED	12
+#define SPECIAL_EXC_DEAR	13
+#define SPECIAL_EXC_ESR		14
+#define SPECIAL_EXC_SOFTE	15
+#define SPECIAL_EXC_CSRR0	16
+#define SPECIAL_EXC_CSRR1	17
+/* must be even to keep 16-byte stack alignment */
+#define SPECIAL_EXC_END		18
+
+#define SPECIAL_EXC_FRAME_SIZE	(INT_FRAME_SIZE + SPECIAL_EXC_END * 8)
+#define SPECIAL_EXC_FRAME_OFFS  (INT_FRAME_SIZE - 288)
+
+#define SPECIAL_EXC_STORE(reg, name) \
+	std	reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1)
+
+#define SPECIAL_EXC_LOAD(reg, name) \
+	ld	reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1)
+
+special_reg_save:
+	lbz	r9,PACAIRQHAPPENED(r13)
+	RECONCILE_IRQ_STATE(r3,r4)
+
+	/*
+	 * We only need (or have stack space) to save this stuff if
+	 * we interrupted the kernel.
+	 */
+	ld	r3,_MSR(r1)
+	andi.	r3,r3,MSR_PR
+	bnelr
+
+	/* Copy info into temporary exception thread info */
+	ld	r11,PACAKSAVE(r13)
+	CURRENT_THREAD_INFO(r11, r11)
+	CURRENT_THREAD_INFO(r12, r1)
+	ld	r10,TI_FLAGS(r11)
+	std	r10,TI_FLAGS(r12)
+	ld	r10,TI_PREEMPT(r11)
+	std	r10,TI_PREEMPT(r12)
+	ld	r10,TI_TASK(r11)
+	std	r10,TI_TASK(r12)
+
+	/*
+	 * Advance to the next TLB exception frame for handler
+	 * types that don't do it automatically.
+	 */
+	LOAD_REG_ADDR(r11,extlb_level_exc)
+	lwz	r12,0(r11)
+	mfspr	r10,SPRN_SPRG_TLB_EXFRAME
+	add	r10,r10,r12
+	mtspr	SPRN_SPRG_TLB_EXFRAME,r10
+
+	/*
+	 * Save registers needed to allow nesting of certain exceptions
+	 * (such as TLB misses) inside special exception levels
+	 */
+	mfspr	r10,SPRN_SRR0
+	SPECIAL_EXC_STORE(r10,SRR0)
+	mfspr	r10,SPRN_SRR1
+	SPECIAL_EXC_STORE(r10,SRR1)
+	mfspr	r10,SPRN_SPRG_GEN_SCRATCH
+	SPECIAL_EXC_STORE(r10,SPRG_GEN)
+	mfspr	r10,SPRN_SPRG_TLB_SCRATCH
+	SPECIAL_EXC_STORE(r10,SPRG_TLB)
+	mfspr	r10,SPRN_MAS0
+	SPECIAL_EXC_STORE(r10,MAS0)
+	mfspr	r10,SPRN_MAS1
+	SPECIAL_EXC_STORE(r10,MAS1)
+	mfspr	r10,SPRN_MAS2
+	SPECIAL_EXC_STORE(r10,MAS2)
+	mfspr	r10,SPRN_MAS3
+	SPECIAL_EXC_STORE(r10,MAS3)
+	mfspr	r10,SPRN_MAS6
+	SPECIAL_EXC_STORE(r10,MAS6)
+	mfspr	r10,SPRN_MAS7
+	SPECIAL_EXC_STORE(r10,MAS7)
+BEGIN_FTR_SECTION
+	mfspr	r10,SPRN_MAS5
+	SPECIAL_EXC_STORE(r10,MAS5)
+	mfspr	r10,SPRN_MAS8
+	SPECIAL_EXC_STORE(r10,MAS8)
+
+	/* MAS5/8 could have inappropriate values if we interrupted KVM code */
+	li	r10,0
+	mtspr	SPRN_MAS5,r10
+	mtspr	SPRN_MAS8,r10
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+	SPECIAL_EXC_STORE(r9,IRQHAPPENED)
+
+	mfspr	r10,SPRN_DEAR
+	SPECIAL_EXC_STORE(r10,DEAR)
+	mfspr	r10,SPRN_ESR
+	SPECIAL_EXC_STORE(r10,ESR)
+
+	lbz	r10,PACASOFTIRQEN(r13)
+	SPECIAL_EXC_STORE(r10,SOFTE)
+	ld	r10,_NIP(r1)
+	SPECIAL_EXC_STORE(r10,CSRR0)
+	ld	r10,_MSR(r1)
+	SPECIAL_EXC_STORE(r10,CSRR1)
+
+	blr
+
+ret_from_level_except:
+	ld	r3,_MSR(r1)
+	andi.	r3,r3,MSR_PR
+	beq	1f
+	b	ret_from_except
+1:
+
+	LOAD_REG_ADDR(r11,extlb_level_exc)
+	lwz	r12,0(r11)
+	mfspr	r10,SPRN_SPRG_TLB_EXFRAME
+	sub	r10,r10,r12
+	mtspr	SPRN_SPRG_TLB_EXFRAME,r10
+
+	/*
+	 * It's possible that the special level exception interrupted a
+	 * TLB miss handler, and inserted the same entry that the
+	 * interrupted handler was about to insert.  On CPUs without TLB
+	 * write conditional, this can result in a duplicate TLB entry.
+	 * Wipe all non-bolted entries to be safe.
+	 *
+	 * Note that this doesn't protect against any TLB misses
+	 * we may take accessing the stack from here to the end of
+	 * the special level exception.  It's not clear how we can
+	 * reasonably protect against that, but only CPUs with
+	 * neither TLB write conditional nor bolted kernel memory
+	 * are affected.  Do any such CPUs even exist?
+	 */
+	PPC_TLBILX_ALL(0,R0)
+
+	REST_NVGPRS(r1)
+
+	SPECIAL_EXC_LOAD(r10,SRR0)
+	mtspr	SPRN_SRR0,r10
+	SPECIAL_EXC_LOAD(r10,SRR1)
+	mtspr	SPRN_SRR1,r10
+	SPECIAL_EXC_LOAD(r10,SPRG_GEN)
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r10
+	SPECIAL_EXC_LOAD(r10,SPRG_TLB)
+	mtspr	SPRN_SPRG_TLB_SCRATCH,r10
+	SPECIAL_EXC_LOAD(r10,MAS0)
+	mtspr	SPRN_MAS0,r10
+	SPECIAL_EXC_LOAD(r10,MAS1)
+	mtspr	SPRN_MAS1,r10
+	SPECIAL_EXC_LOAD(r10,MAS2)
+	mtspr	SPRN_MAS2,r10
+	SPECIAL_EXC_LOAD(r10,MAS3)
+	mtspr	SPRN_MAS3,r10
+	SPECIAL_EXC_LOAD(r10,MAS6)
+	mtspr	SPRN_MAS6,r10
+	SPECIAL_EXC_LOAD(r10,MAS7)
+	mtspr	SPRN_MAS7,r10
+BEGIN_FTR_SECTION
+	SPECIAL_EXC_LOAD(r10,MAS5)
+	mtspr	SPRN_MAS5,r10
+	SPECIAL_EXC_LOAD(r10,MAS8)
+	mtspr	SPRN_MAS8,r10
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+
+	lbz	r6,PACASOFTIRQEN(r13)
+	ld	r5,SOFTE(r1)
+
+	/* Interrupts had better not already be enabled... */
+	twnei	r6,0
+
+	cmpwi	cr0,r5,0
+	beq	1f
+
+	TRACE_ENABLE_INTS
+	stb	r5,PACASOFTIRQEN(r13)
+1:
+	/*
+	 * Restore PACAIRQHAPPENED rather than setting it based on
+	 * the return MSR[EE], since we could have interrupted
+	 * __check_irq_replay() or other inconsistent transitory
+	 * states that must remain that way.
+	 */
+	SPECIAL_EXC_LOAD(r10,IRQHAPPENED)
+	stb	r10,PACAIRQHAPPENED(r13)
+
+	SPECIAL_EXC_LOAD(r10,DEAR)
+	mtspr	SPRN_DEAR,r10
+	SPECIAL_EXC_LOAD(r10,ESR)
+	mtspr	SPRN_ESR,r10
+
+	stdcx.	r0,0,r1		/* to clear the reservation */
+
+	REST_4GPRS(2, r1)
+	REST_4GPRS(6, r1)
+
+	ld	r10,_CTR(r1)
+	ld	r11,_XER(r1)
+	mtctr	r10
+	mtxer	r11
+
+	blr
+
+.macro ret_from_level srr0 srr1 paca_ex scratch
+	bl	ret_from_level_except
+
+	ld	r10,_LINK(r1)
+	ld	r11,_CCR(r1)
+	ld	r0,GPR13(r1)
+	mtlr	r10
+	mtcr	r11
+
+	ld	r10,GPR10(r1)
+	ld	r11,GPR11(r1)
+	ld	r12,GPR12(r1)
+	mtspr	\scratch,r0
+
+	std	r10,\paca_ex+EX_R10(r13);
+	std	r11,\paca_ex+EX_R11(r13);
+	ld	r10,_NIP(r1)
+	ld	r11,_MSR(r1)
+	ld	r0,GPR0(r1)
+	ld	r1,GPR1(r1)
+	mtspr	\srr0,r10
+	mtspr	\srr1,r11
+	ld	r10,\paca_ex+EX_R10(r13)
+	ld	r11,\paca_ex+EX_R11(r13)
+	mfspr	r13,\scratch
+.endm
+
+ret_from_crit_except:
+	ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH
+	rfci
+
+ret_from_mc_except:
+	ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH
+	rfmci
 
 /* Exception prolog code for all exceptions */
-#define EXCEPTION_PROLOG(n, type, addition)				    \
+#define EXCEPTION_PROLOG(n, intnum, type, addition)	    		    \
 	mtspr	SPRN_SPRG_##type##_SCRATCH,r13;	/* get spare registers */   \
 	mfspr	r13,SPRN_SPRG_PACA;	/* get PACA */			    \
 	std	r10,PACA_EX##type+EX_R10(r13);				    \
 	std	r11,PACA_EX##type+EX_R11(r13);				    \
 	mfcr	r10;			/* save CR */			    \
+	mfspr	r11,SPRN_##type##_SRR1;/* what are we coming from */	    \
+	DO_KVM	intnum,SPRN_##type##_SRR1;    /* KVM hook */		    \
+	stw	r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \
 	addition;			/* additional code for that exc. */ \
 	std	r1,PACA_EX##type+EX_R1(r13); /* save old r1 in the PACA */  \
-	stw	r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \
-	mfspr	r11,SPRN_##type##_SRR1;/* what are we coming from */	    \
 	type##_SET_KSTACK;		/* get special stack if necessary */\
 	andi.	r10,r11,MSR_PR;		/* save stack pointer */	    \
 	beq	1f;			/* branch around if supervisor */   \
@@ -58,78 +305,79 @@
 #define SPRN_GEN_SRR0	SPRN_SRR0
 #define SPRN_GEN_SRR1	SPRN_SRR1
 
+#define	GDBELL_SET_KSTACK	GEN_SET_KSTACK
+#define SPRN_GDBELL_SRR0	SPRN_GSRR0
+#define SPRN_GDBELL_SRR1	SPRN_GSRR1
+
 #define CRIT_SET_KSTACK						            \
 	ld	r1,PACA_CRIT_STACK(r13);				    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE
 #define SPRN_CRIT_SRR0	SPRN_CSRR0
 #define SPRN_CRIT_SRR1	SPRN_CSRR1
 
 #define DBG_SET_KSTACK						            \
 	ld	r1,PACA_DBG_STACK(r13);					    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE
 #define SPRN_DBG_SRR0	SPRN_DSRR0
 #define SPRN_DBG_SRR1	SPRN_DSRR1
 
 #define MC_SET_KSTACK						            \
 	ld	r1,PACA_MC_STACK(r13);					    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE
 #define SPRN_MC_SRR0	SPRN_MCSRR0
 #define SPRN_MC_SRR1	SPRN_MCSRR1
 
-#define NORMAL_EXCEPTION_PROLOG(n, addition)				    \
-	EXCEPTION_PROLOG(n, GEN, addition##_GEN)
+#define NORMAL_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, GEN, addition##_GEN(n))
 
-#define CRIT_EXCEPTION_PROLOG(n, addition)				    \
-	EXCEPTION_PROLOG(n, CRIT, addition##_CRIT)
+#define CRIT_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, CRIT, addition##_CRIT(n))
 
-#define DBG_EXCEPTION_PROLOG(n, addition)				    \
-	EXCEPTION_PROLOG(n, DBG, addition##_DBG)
+#define DBG_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, DBG, addition##_DBG(n))
 
-#define MC_EXCEPTION_PROLOG(n, addition)				    \
-	EXCEPTION_PROLOG(n, MC, addition##_MC)
+#define MC_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, MC, addition##_MC(n))
 
+#define GDBELL_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, GDBELL, addition##_GDBELL(n))
 
 /* Variants of the "addition" argument for the prolog
  */
-#define PROLOG_ADDITION_NONE_GEN
-#define PROLOG_ADDITION_NONE_CRIT
-#define PROLOG_ADDITION_NONE_DBG
-#define PROLOG_ADDITION_NONE_MC
-
-#define PROLOG_ADDITION_MASKABLE_GEN					    \
-	lbz	r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */	    \
-	cmpwi	cr0,r11,0;		/* yes -> go out of line */	    \
-	beq	masked_interrupt_book3e;
-
-#define PROLOG_ADDITION_2REGS_GEN					    \
+#define PROLOG_ADDITION_NONE_GEN(n)
+#define PROLOG_ADDITION_NONE_GDBELL(n)
+#define PROLOG_ADDITION_NONE_CRIT(n)
+#define PROLOG_ADDITION_NONE_DBG(n)
+#define PROLOG_ADDITION_NONE_MC(n)
+
+#define PROLOG_ADDITION_MASKABLE_GEN(n)					    \
+	lbz	r10,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */	    \
+	cmpwi	cr0,r10,0;		/* yes -> go out of line */	    \
+	beq	masked_interrupt_book3e_##n
+
+#define PROLOG_ADDITION_2REGS_GEN(n)					    \
 	std	r14,PACA_EXGEN+EX_R14(r13);				    \
 	std	r15,PACA_EXGEN+EX_R15(r13)
 
-#define PROLOG_ADDITION_1REG_GEN					    \
+#define PROLOG_ADDITION_1REG_GEN(n)					    \
 	std	r14,PACA_EXGEN+EX_R14(r13);
 
-#define PROLOG_ADDITION_2REGS_CRIT					    \
+#define PROLOG_ADDITION_2REGS_CRIT(n)					    \
 	std	r14,PACA_EXCRIT+EX_R14(r13);				    \
 	std	r15,PACA_EXCRIT+EX_R15(r13)
 
-#define PROLOG_ADDITION_2REGS_DBG					    \
+#define PROLOG_ADDITION_2REGS_DBG(n)					    \
 	std	r14,PACA_EXDBG+EX_R14(r13);				    \
 	std	r15,PACA_EXDBG+EX_R15(r13)
 
-#define PROLOG_ADDITION_2REGS_MC					    \
+#define PROLOG_ADDITION_2REGS_MC(n)					    \
 	std	r14,PACA_EXMC+EX_R14(r13);				    \
 	std	r15,PACA_EXMC+EX_R15(r13)
 
-#define PROLOG_ADDITION_DOORBELL_GEN					    \
-	lbz	r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */	    \
-	cmpwi	cr0,r11,0;		/* yes -> go out of line */	    \
-	beq	masked_doorbell_book3e
 
-
-/* Core exception code for all exceptions except TLB misses.
- * XXX: Needs to make SPRN_SPRG_GEN depend on exception type
- */
-#define EXCEPTION_COMMON(n, excf, ints)					    \
+/* Core exception code for all exceptions except TLB misses. */
+#define EXCEPTION_COMMON_LVL(n, scratch, excf)				    \
+exc_##n##_common:							    \
 	std	r0,GPR0(r1);		/* save r0 in stackframe */	    \
 	std	r2,GPR2(r1);		/* save r2 in stackframe */	    \
 	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe */    \
@@ -137,10 +385,11 @@
 	std	r9,GPR9(r1);		/* save r9 in stackframe */	    \
 	std	r10,_NIP(r1);		/* save SRR0 to stackframe */	    \
 	std	r11,_MSR(r1);		/* save SRR1 to stackframe */	    \
+	beq	2f;			/* if from kernel mode */	    \
 	ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */	    \
-	ld	r3,excf+EX_R10(r13);	/* get back r10 */		    \
+2:	ld	r3,excf+EX_R10(r13);	/* get back r10 */		    \
 	ld	r4,excf+EX_R11(r13);	/* get back r11 */		    \
-	mfspr	r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 */		    \
+	mfspr	r5,scratch;		/* get back r13 */		    \
 	std	r12,GPR12(r1);		/* save r12 in stackframe */	    \
 	ld	r2,PACATOC(r13);	/* get kernel TOC into r2 */	    \
 	mflr	r6;			/* save LR in stackframe */	    \
@@ -164,23 +413,29 @@
 	std	r11,SOFTE(r1);		/* and save it to stackframe */     \
 	std	r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */	    \
 	std	r3,_TRAP(r1);		/* set trap number		*/  \
-	std	r0,RESULT(r1);		/* clear regs->result */	    \
-	ints;
-
-/* Variants for the "ints" argument */
-#define INTS_KEEP
-#define INTS_DISABLE_SOFT						    \
-	stb	r0,PACASOFTIRQEN(r13);	/* mark interrupts soft-disabled */ \
-	TRACE_DISABLE_INTS;
-#define INTS_DISABLE_HARD						    \
-	stb	r0,PACAHARDIRQEN(r13); /* and hard disabled */
-#define INTS_DISABLE_ALL						    \
-	INTS_DISABLE_SOFT						    \
-	INTS_DISABLE_HARD
-
-/* This is called by exceptions that used INTS_KEEP (that is did not clear
- * neither soft nor hard IRQ indicators in the PACA. This will restore MSR:EE
- * to it's previous value
+	std	r0,RESULT(r1);		/* clear regs->result */
+
+#define EXCEPTION_COMMON(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN)
+#define EXCEPTION_COMMON_CRIT(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_CRIT_SCRATCH, PACA_EXCRIT)
+#define EXCEPTION_COMMON_MC(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_MC_SCRATCH, PACA_EXMC)
+#define EXCEPTION_COMMON_DBG(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG)
+
+/*
+ * This is meant for exceptions that don't immediately hard-enable.  We
+ * set a bit in paca->irq_happened to ensure that a subsequent call to
+ * arch_local_irq_restore() will properly hard-enable and avoid the
+ * fast-path, and then reconcile irq state.
+ */
+#define INTS_DISABLE	RECONCILE_IRQ_STATE(r3,r4)
+
+/*
+ * This is called by exceptions that don't use INTS_DISABLE (that did not
+ * touch irq indicators in the PACA).  This will restore MSR:EE to it's
+ * previous value
  *
  * XXX In the long run, we may want to open-code it in order to separate the
  *     load from the wrtee, thus limiting the latency caused by the dependency
@@ -224,7 +479,7 @@ exc_##n##_bad_stack:							    \
  * interrupts happen before the wait instruction.
  */
 #define CHECK_NAPPING()							\
-	clrrdi	r11,r1,THREAD_SHIFT;					\
+	CURRENT_THREAD_INFO(r11, r1);					\
 	ld	r10,TI_LOCAL_FLAGS(r11);				\
 	andi.	r9,r10,_TLF_NAPPING;					\
 	beq+	1f;							\
@@ -235,15 +490,16 @@ exc_##n##_bad_stack:							    \
 1:
 
 
-#define MASKABLE_EXCEPTION(trapnum, label, hdlr, ack)			\
+#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack)		\
 	START_EXCEPTION(label);						\
-	NORMAL_EXCEPTION_PROLOG(trapnum, PROLOG_ADDITION_MASKABLE)	\
-	EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE_ALL)		\
+	NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\
+	EXCEPTION_COMMON(trapnum)					\
+	INTS_DISABLE;							\
 	ack(r8);							\
 	CHECK_NAPPING();						\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	bl	hdlr;							\
-	b	.ret_from_except_lite;
+	b	ret_from_except_lite;
 
 /* This value is used to mark exception frames on the stack. */
 	.section	".toc","aw"
@@ -259,8 +515,8 @@ exception_marker:
 	.balign	0x1000
 	.globl interrupt_base_book3e
 interrupt_base_book3e:					/* fake trap */
-	EXCEPTION_STUB(0x000, machine_check)		/* 0x0200 */
-	EXCEPTION_STUB(0x020, critical_input)		/* 0x0580 */
+	EXCEPTION_STUB(0x000, machine_check)
+	EXCEPTION_STUB(0x020, critical_input)		/* 0x0100 */
 	EXCEPTION_STUB(0x040, debug_crit)		/* 0x0d00 */
 	EXCEPTION_STUB(0x060, data_storage)		/* 0x0300 */
 	EXCEPTION_STUB(0x080, instruction_storage)	/* 0x0400 */
@@ -275,6 +531,8 @@ interrupt_base_book3e:					/* fake trap */
 	EXCEPTION_STUB(0x1a0, watchdog)			/* 0x09f0 */
 	EXCEPTION_STUB(0x1c0, data_tlb_miss)
 	EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+	EXCEPTION_STUB(0x200, altivec_unavailable)
+	EXCEPTION_STUB(0x220, altivec_assist)
 	EXCEPTION_STUB(0x260, perfmon)
 	EXCEPTION_STUB(0x280, doorbell)
 	EXCEPTION_STUB(0x2a0, doorbell_crit)
@@ -282,103 +540,163 @@ interrupt_base_book3e:					/* fake trap */
 	EXCEPTION_STUB(0x2e0, guest_doorbell_crit)
 	EXCEPTION_STUB(0x300, hypercall)
 	EXCEPTION_STUB(0x320, ehpriv)
+	EXCEPTION_STUB(0x340, lrat_error)
 
 	.globl interrupt_end_book3e
 interrupt_end_book3e:
 
 /* Critical Input Interrupt */
 	START_EXCEPTION(critical_input);
-	CRIT_EXCEPTION_PROLOG(0x100, PROLOG_ADDITION_NONE)
-//	EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE_ALL)
-//	bl	special_reg_save_crit
-//	CHECK_NAPPING();
-//	addi	r3,r1,STACK_FRAME_OVERHEAD
-//	bl	.critical_exception
-//	b	ret_from_crit_except
-	b	.
+	CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x100)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_crit_except
 
 /* Machine Check Interrupt */
 	START_EXCEPTION(machine_check);
-	CRIT_EXCEPTION_PROLOG(0x200, PROLOG_ADDITION_NONE)
-//	EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE_ALL)
-//	bl	special_reg_save_mc
-//	addi	r3,r1,STACK_FRAME_OVERHEAD
-//	CHECK_NAPPING();
-//	bl	.machine_check_exception
-//	b	ret_from_mc_except
-	b	.
+	MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK,
+			    PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_MC(0x000)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	b	ret_from_mc_except
 
 /* Data Storage Interrupt */
 	START_EXCEPTION(data_storage)
-	NORMAL_EXCEPTION_PROLOG(0x300, PROLOG_ADDITION_2REGS)
+	NORMAL_EXCEPTION_PROLOG(0x300, BOOKE_INTERRUPT_DATA_STORAGE,
+				PROLOG_ADDITION_2REGS)
 	mfspr	r14,SPRN_DEAR
 	mfspr	r15,SPRN_ESR
-	EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_KEEP)
+	EXCEPTION_COMMON(0x300)
+	INTS_DISABLE
 	b	storage_fault_common
 
 /* Instruction Storage Interrupt */
 	START_EXCEPTION(instruction_storage);
-	NORMAL_EXCEPTION_PROLOG(0x400, PROLOG_ADDITION_2REGS)
+	NORMAL_EXCEPTION_PROLOG(0x400, BOOKE_INTERRUPT_INST_STORAGE,
+				PROLOG_ADDITION_2REGS)
 	li	r15,0
 	mr	r14,r10
-	EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_KEEP)
+	EXCEPTION_COMMON(0x400)
+	INTS_DISABLE
 	b	storage_fault_common
 
 /* External Input Interrupt */
-	MASKABLE_EXCEPTION(0x500, external_input, .do_IRQ, ACK_NONE)
+	MASKABLE_EXCEPTION(0x500, BOOKE_INTERRUPT_EXTERNAL,
+			   external_input, do_IRQ, ACK_NONE)
 
 /* Alignment */
 	START_EXCEPTION(alignment);
-	NORMAL_EXCEPTION_PROLOG(0x600, PROLOG_ADDITION_2REGS)
+	NORMAL_EXCEPTION_PROLOG(0x600, BOOKE_INTERRUPT_ALIGNMENT,
+				PROLOG_ADDITION_2REGS)
 	mfspr	r14,SPRN_DEAR
 	mfspr	r15,SPRN_ESR
-	EXCEPTION_COMMON(0x600, PACA_EXGEN, INTS_KEEP)
+	EXCEPTION_COMMON(0x600)
 	b	alignment_more	/* no room, go out of line */
 
 /* Program Interrupt */
 	START_EXCEPTION(program);
-	NORMAL_EXCEPTION_PROLOG(0x700, PROLOG_ADDITION_1REG)
+	NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM,
+				PROLOG_ADDITION_1REG)
 	mfspr	r14,SPRN_ESR
-	EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE_SOFT)
+	EXCEPTION_COMMON(0x700)
+	INTS_DISABLE
 	std	r14,_DSISR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r14,PACA_EXGEN+EX_R14(r13)
-	bl	.save_nvgprs
-	INTS_RESTORE_HARD
-	bl	.program_check_exception
-	b	.ret_from_except
+	bl	save_nvgprs
+	bl	program_check_exception
+	b	ret_from_except
 
 /* Floating Point Unavailable Interrupt */
 	START_EXCEPTION(fp_unavailable);
-	NORMAL_EXCEPTION_PROLOG(0x800, PROLOG_ADDITION_NONE)
+	NORMAL_EXCEPTION_PROLOG(0x800, BOOKE_INTERRUPT_FP_UNAVAIL,
+				PROLOG_ADDITION_NONE)
 	/* we can probably do a shorter exception entry for that one... */
-	EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP)
-	bne	1f			/* if from user, just load it up */
-	bl	.save_nvgprs
+	EXCEPTION_COMMON(0x800)
+	ld	r12,_MSR(r1)
+	andi.	r0,r12,MSR_PR;
+	beq-	1f
+	bl	load_up_fpu
+	b	fast_exception_return
+1:	INTS_DISABLE
+	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	INTS_RESTORE_HARD
-	bl	.kernel_fp_unavailable_exception
-	BUG_OPCODE
-1:	ld	r12,_MSR(r1)
-	bl	.load_up_fpu
+	bl	kernel_fp_unavailable_exception
+	b	ret_from_except
+
+/* Altivec Unavailable Interrupt */
+	START_EXCEPTION(altivec_unavailable);
+	NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL,
+				PROLOG_ADDITION_NONE)
+	/* we can probably do a shorter exception entry for that one... */
+	EXCEPTION_COMMON(0x200)
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	ld	r12,_MSR(r1)
+	andi.	r0,r12,MSR_PR;
+	beq-	1f
+	bl	load_up_altivec
 	b	fast_exception_return
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	INTS_DISABLE
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	altivec_unavailable_exception
+	b	ret_from_except
+
+/* AltiVec Assist */
+	START_EXCEPTION(altivec_assist);
+	NORMAL_EXCEPTION_PROLOG(0x220,
+				BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST,
+				PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x220)
+	INTS_DISABLE
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	bl	altivec_assist_exception
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#else
+	bl	unknown_exception
+#endif
+	b	ret_from_except
+
 
 /* Decrementer Interrupt */
-	MASKABLE_EXCEPTION(0x900, decrementer, .timer_interrupt, ACK_DEC)
+	MASKABLE_EXCEPTION(0x900, BOOKE_INTERRUPT_DECREMENTER,
+			   decrementer, timer_interrupt, ACK_DEC)
 
 /* Fixed Interval Timer Interrupt */
-	MASKABLE_EXCEPTION(0x980, fixed_interval, .unknown_exception, ACK_FIT)
+	MASKABLE_EXCEPTION(0x980, BOOKE_INTERRUPT_FIT,
+			   fixed_interval, unknown_exception, ACK_FIT)
 
 /* Watchdog Timer Interrupt */
 	START_EXCEPTION(watchdog);
-	CRIT_EXCEPTION_PROLOG(0x9f0, PROLOG_ADDITION_NONE)
-//	EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE_ALL)
-//	bl	special_reg_save_crit
-//	CHECK_NAPPING();
-//	addi	r3,r1,STACK_FRAME_OVERHEAD
-//	bl	.unknown_exception
-//	b	ret_from_crit_except
-	b	.
+	CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x9f0)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_BOOKE_WDT
+	bl	WatchdogException
+#else
+	bl	unknown_exception
+#endif
+	b	ret_from_crit_except
 
 /* System Call Interrupt */
 	START_EXCEPTION(system_call)
@@ -390,17 +708,19 @@ interrupt_end_book3e:
 
 /* Auxiliary Processor Unavailable Interrupt */
 	START_EXCEPTION(ap_unavailable);
-	NORMAL_EXCEPTION_PROLOG(0xf20, PROLOG_ADDITION_NONE)
-	EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_KEEP)
+	NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL,
+				PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0xf20)
+	INTS_DISABLE
+	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.save_nvgprs
-	INTS_RESTORE_HARD
-	bl	.unknown_exception
-	b	.ret_from_except
+	bl	unknown_exception
+	b	ret_from_except
 
 /* Debug exception as a critical interrupt*/
 	START_EXCEPTION(debug_crit);
-	CRIT_EXCEPTION_PROLOG(0xd00, PROLOG_ADDITION_2REGS)
+	CRIT_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG,
+			      PROLOG_ADDITION_2REGS)
 
 	/*
 	 * If there is a single step or branch-taken exception in an
@@ -412,7 +732,7 @@ interrupt_end_book3e:
 	 */
 
 	mfspr	r14,SPRN_DBSR		/* check single-step/branch taken */
-	andis.	r15,r14,DBSR_IC@h
+	andis.	r15,r14,(DBSR_IC|DBSR_BT)@h
 	beq+	1f
 
 	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
@@ -423,7 +743,7 @@ interrupt_end_book3e:
 	bge+	cr1,1f
 
 	/* here it looks like we got an inappropriate debug exception. */
-	lis	r14,DBSR_IC@h		/* clear the IC event */
+	lis	r14,(DBSR_IC|DBSR_BT)@h		/* clear the event */
 	rlwinm	r11,r11,0,~MSR_DE	/* clear DE in the CSRR1 value */
 	mtspr	SPRN_DBSR,r14
 	mtspr	SPRN_CSRR1,r11
@@ -447,25 +767,24 @@ interrupt_end_book3e:
 	/* Now we mash up things to make it look like we are coming on a
 	 * normal exception
 	 */
-	mfspr	r15,SPRN_SPRG_CRIT_SCRATCH
-	mtspr	SPRN_SPRG_GEN_SCRATCH,r15
 	mfspr	r14,SPRN_DBSR
-	EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE_ALL)
+	EXCEPTION_COMMON_CRIT(0xd00)
 	std	r14,_DSISR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	mr	r4,r14
 	ld	r14,PACA_EXCRIT+EX_R14(r13)
 	ld	r15,PACA_EXCRIT+EX_R15(r13)
-	bl	.save_nvgprs
-	bl	.DebugException
-	b	.ret_from_except
+	bl	save_nvgprs
+	bl	DebugException
+	b	ret_from_except
 
 kernel_dbg_exc:
 	b	.	/* NYI */
 
 /* Debug exception as a debug interrupt*/
 	START_EXCEPTION(debug_debug);
-	DBG_EXCEPTION_PROLOG(0xd00, PROLOG_ADDITION_2REGS)
+	DBG_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG,
+						 PROLOG_ADDITION_2REGS)
 
 	/*
 	 * If there is a single step or branch-taken exception in an
@@ -477,7 +796,7 @@ kernel_dbg_exc:
 	 */
 
 	mfspr	r14,SPRN_DBSR		/* check single-step/branch taken */
-	andis.	r15,r14,DBSR_IC@h
+	andis.	r15,r14,(DBSR_IC|DBSR_BT)@h
 	beq+	1f
 
 	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
@@ -488,7 +807,7 @@ kernel_dbg_exc:
 	bge+	cr1,1f
 
 	/* here it looks like we got an inappropriate debug exception. */
-	lis	r14,DBSR_IC@h		/* clear the IC event */
+	lis	r14,(DBSR_IC|DBSR_BT)@h		/* clear the event */
 	rlwinm	r11,r11,0,~MSR_DE	/* clear DE in the DSRR1 value */
 	mtspr	SPRN_DBSR,r14
 	mtspr	SPRN_DSRR1,r11
@@ -512,71 +831,175 @@ kernel_dbg_exc:
 	/* Now we mash up things to make it look like we are coming on a
 	 * normal exception
 	 */
-	mfspr	r15,SPRN_SPRG_DBG_SCRATCH
-	mtspr	SPRN_SPRG_GEN_SCRATCH,r15
 	mfspr	r14,SPRN_DBSR
-	EXCEPTION_COMMON(0xd00, PACA_EXDBG, INTS_DISABLE_ALL)
+	EXCEPTION_COMMON_DBG(0xd08)
+	INTS_DISABLE
 	std	r14,_DSISR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	mr	r4,r14
 	ld	r14,PACA_EXDBG+EX_R14(r13)
 	ld	r15,PACA_EXDBG+EX_R15(r13)
-	bl	.save_nvgprs
-	bl	.DebugException
-	b	.ret_from_except
-
-	MASKABLE_EXCEPTION(0x260, perfmon, .performance_monitor_exception, ACK_NONE)
-
-/* Doorbell interrupt */
-	START_EXCEPTION(doorbell)
-	NORMAL_EXCEPTION_PROLOG(0x2070, PROLOG_ADDITION_DOORBELL)
-	EXCEPTION_COMMON(0x2070, PACA_EXGEN, INTS_DISABLE_ALL)
+	bl	save_nvgprs
+	bl	DebugException
+	b	ret_from_except
+
+	START_EXCEPTION(perfmon);
+	NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR,
+				PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x260)
+	INTS_DISABLE
 	CHECK_NAPPING()
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.doorbell_exception
-	b	.ret_from_except_lite
+	bl	performance_monitor_exception
+	b	ret_from_except_lite
+
+/* Doorbell interrupt */
+	MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL,
+			   doorbell, doorbell_exception, ACK_NONE)
 
 /* Doorbell critical Interrupt */
 	START_EXCEPTION(doorbell_crit);
-	CRIT_EXCEPTION_PROLOG(0x2080, PROLOG_ADDITION_NONE)
-//	EXCEPTION_COMMON(0x2080, PACA_EXCRIT, INTS_DISABLE_ALL)
-//	bl	special_reg_save_crit
-//	CHECK_NAPPING();
-//	addi	r3,r1,STACK_FRAME_OVERHEAD
-//	bl	.doorbell_critical_exception
-//	b	ret_from_crit_except
-	b	.
-
-	MASKABLE_EXCEPTION(0x2c0, guest_doorbell, .unknown_exception, ACK_NONE)
-	MASKABLE_EXCEPTION(0x2e0, guest_doorbell_crit, .unknown_exception, ACK_NONE)
-	MASKABLE_EXCEPTION(0x310, hypercall, .unknown_exception, ACK_NONE)
-	MASKABLE_EXCEPTION(0x320, ehpriv, .unknown_exception, ACK_NONE)
+	CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x2a0)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_crit_except
 
+/*
+ *	Guest doorbell interrupt
+ *	This general exception use GSRRx save/restore registers
+ */
+	START_EXCEPTION(guest_doorbell);
+	GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x2c0)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* Guest Doorbell critical Interrupt */
+	START_EXCEPTION(guest_doorbell_crit);
+	CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x2e0)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_crit_except
+
+/* Hypervisor call */
+	START_EXCEPTION(hypercall);
+	NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x310)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* Embedded Hypervisor priviledged  */
+	START_EXCEPTION(ehpriv);
+	NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x320)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* LRAT Error interrupt */
+	START_EXCEPTION(lrat_error);
+	NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x340)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.save_nvgprs
+	INTS_RESTORE_HARD
+	bl	.unknown_exception
+	b	.ret_from_except
 
 /*
- * An interrupt came in while soft-disabled; clear EE in SRR1,
- * clear paca->hard_enabled and return.
+ * An interrupt came in while soft-disabled; We mark paca->irq_happened
+ * accordingly and if the interrupt is level sensitive, we hard disable
  */
-masked_doorbell_book3e:
-	mtcr	r10
-	/* Resend the doorbell to fire again when ints enabled */
-	mfspr	r10,SPRN_PIR
-	PPC_MSGSND(r10)
-	b	masked_interrupt_book3e_common
 
-masked_interrupt_book3e:
-	mtcr	r10
-masked_interrupt_book3e_common:
-	stb	r11,PACAHARDIRQEN(r13)
-	mfspr	r10,SPRN_SRR1
-	rldicl	r11,r10,48,1		/* clear MSR_EE */
-	rotldi	r10,r11,16
-	mtspr	SPRN_SRR1,r10
-	ld	r10,PACA_EXGEN+EX_R10(r13);	/* restore registers */
-	ld	r11,PACA_EXGEN+EX_R11(r13);
-	mfspr	r13,SPRN_SPRG_GEN_SCRATCH;
+.macro masked_interrupt_book3e paca_irq full_mask
+	lbz	r10,PACAIRQHAPPENED(r13)
+	ori	r10,r10,\paca_irq
+	stb	r10,PACAIRQHAPPENED(r13)
+
+	.if \full_mask == 1
+	rldicl	r10,r11,48,1		/* clear MSR_EE */
+	rotldi	r11,r10,16
+	mtspr	SPRN_SRR1,r11
+	.endif
+
+	lwz	r11,PACA_EXGEN+EX_CR(r13)
+	mtcr	r11
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	mfspr	r13,SPRN_SPRG_GEN_SCRATCH
 	rfi
 	b	.
+.endm
+
+masked_interrupt_book3e_0x500:
+	// XXX When adding support for EPR, use PACA_IRQ_EE_EDGE
+	masked_interrupt_book3e PACA_IRQ_EE 1
+
+masked_interrupt_book3e_0x900:
+	ACK_DEC(r10);
+	masked_interrupt_book3e PACA_IRQ_DEC 0
+
+masked_interrupt_book3e_0x980:
+	ACK_FIT(r10);
+	masked_interrupt_book3e PACA_IRQ_DEC 0
+
+masked_interrupt_book3e_0x280:
+masked_interrupt_book3e_0x2c0:
+	masked_interrupt_book3e PACA_IRQ_DBELL 0
+
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280
+ * to indicate the kind of interrupt. MSR:EE is already off.
+ * We generate a stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ */
+_GLOBAL(__replay_interrupt)
+	/* We are going to jump to the exception common code which
+	 * will retrieve various register values from the PACA which
+	 * we don't give a damn about.
+	 */
+	mflr	r10
+	mfmsr	r11
+	mfcr	r4
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r13;
+	std	r1,PACA_EXGEN+EX_R1(r13);
+	stw	r4,PACA_EXGEN+EX_CR(r13);
+	ori	r11,r11,MSR_EE
+	subi	r1,r1,INT_FRAME_SIZE;
+	cmpwi	cr0,r3,0x500
+	beq	exc_0x500_common
+	cmpwi	cr0,r3,0x900
+	beq	exc_0x900_common
+	cmpwi	cr0,r3,0x280
+	beq	exc_0x280_common
+	blr
+
 
 /*
  * This is called from 0x300 and 0x400 handlers after the prologs with
@@ -591,17 +1014,16 @@ storage_fault_common:
 	mr	r5,r15
 	ld	r14,PACA_EXGEN+EX_R14(r13)
 	ld	r15,PACA_EXGEN+EX_R15(r13)
-	INTS_RESTORE_HARD
-	bl	.do_page_fault
+	bl	do_page_fault
 	cmpdi	r3,0
 	bne-	1f
-	b	.ret_from_except_lite
-1:	bl	.save_nvgprs
+	b	ret_from_except_lite
+1:	bl	save_nvgprs
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r4,_DAR(r1)
-	bl	.bad_page_fault
-	b	.ret_from_except
+	bl	bad_page_fault
+	b	ret_from_except
 
 /*
  * Alignment exception doesn't fit entirely in the 0x100 bytes so it
@@ -613,10 +1035,10 @@ alignment_more:
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r14,PACA_EXGEN+EX_R14(r13)
 	ld	r15,PACA_EXGEN+EX_R15(r13)
-	bl	.save_nvgprs
+	bl	save_nvgprs
 	INTS_RESTORE_HARD
-	bl	.alignment_exception
-	b	.ret_from_except
+	bl	alignment_exception
+	b	ret_from_except
 
 /*
  * We branch here from entry_64.S for the last stage of the exception
@@ -679,12 +1101,16 @@ fast_exception_return:
 BAD_STACK_TRAMPOLINE(0x000)
 BAD_STACK_TRAMPOLINE(0x100)
 BAD_STACK_TRAMPOLINE(0x200)
+BAD_STACK_TRAMPOLINE(0x220)
 BAD_STACK_TRAMPOLINE(0x260)
+BAD_STACK_TRAMPOLINE(0x280)
+BAD_STACK_TRAMPOLINE(0x2a0)
 BAD_STACK_TRAMPOLINE(0x2c0)
 BAD_STACK_TRAMPOLINE(0x2e0)
 BAD_STACK_TRAMPOLINE(0x300)
 BAD_STACK_TRAMPOLINE(0x310)
 BAD_STACK_TRAMPOLINE(0x320)
+BAD_STACK_TRAMPOLINE(0x340)
 BAD_STACK_TRAMPOLINE(0x400)
 BAD_STACK_TRAMPOLINE(0x500)
 BAD_STACK_TRAMPOLINE(0x600)
@@ -697,11 +1123,10 @@ BAD_STACK_TRAMPOLINE(0xa00)
 BAD_STACK_TRAMPOLINE(0xb00)
 BAD_STACK_TRAMPOLINE(0xc00)
 BAD_STACK_TRAMPOLINE(0xd00)
+BAD_STACK_TRAMPOLINE(0xd08)
 BAD_STACK_TRAMPOLINE(0xe00)
 BAD_STACK_TRAMPOLINE(0xf00)
 BAD_STACK_TRAMPOLINE(0xf20)
-BAD_STACK_TRAMPOLINE(0x2070)
-BAD_STACK_TRAMPOLINE(0x2080)
 
 	.globl	bad_stack_book3e
 bad_stack_book3e:
@@ -747,7 +1172,7 @@ bad_stack_book3e:
 	std	r12,0(r11)
 	ld	r2,PACATOC(r13)
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.kernel_bad_stack
+	bl	kernel_bad_stack
 	b	1b
 
 /*
@@ -829,7 +1254,7 @@ skpinv:	addi	r6,r6,1				/* Increment */
 	bne	1b				/* If not, repeat */
 
 	/* Invalidate all TLBs */
-	PPC_TLBILX_ALL(0,0)
+	PPC_TLBILX_ALL(0,R0)
 	sync
 	isync
 
@@ -882,12 +1307,9 @@ skpinv:	addi	r6,r6,1				/* Increment */
 	mtspr	SPRN_MAS0,r3
 	tlbre
 	mfspr	r6,SPRN_MAS1
-	rlwinm	r6,r6,0,2,0	/* clear IPROT */
+	rlwinm	r6,r6,0,2,31	/* clear IPROT and VALID */
 	mtspr	SPRN_MAS1,r6
 	tlbwe
-
-	/* Invalidate TLB1 */
-	PPC_TLBILX_ALL(0,0)
 	sync
 	isync
 
@@ -941,12 +1363,9 @@ skpinv:	addi	r6,r6,1				/* Increment */
 	mtspr	SPRN_MAS0,r4
 	tlbre
 	mfspr	r5,SPRN_MAS1
-	rlwinm	r5,r5,0,2,0	/* clear IPROT */
+	rlwinm	r5,r5,0,2,31	/* clear IPROT and VALID */
 	mtspr	SPRN_MAS1,r5
 	tlbwe
-
-	/* Invalidate TLB1 */
-	PPC_TLBILX_ALL(0,0)
 	sync
 	isync
 
@@ -1048,23 +1467,7 @@ a2_tlbinit_after_linear_map:
 	.globl  a2_tlbinit_after_iprot_flush
 a2_tlbinit_after_iprot_flush:
 
-#ifdef CONFIG_PPC_EARLY_DEBUG_WSP
-	/* Now establish early debug mappings if applicable */
-	/* Restore the MAS0 we used for linear mapping load */
-	mtspr	SPRN_MAS0,r11
-
-	lis	r3,(MAS1_VALID | MAS1_IPROT)@h
-	ori	r3,r3,(BOOK3E_PAGESZ_4K << MAS1_TSIZE_SHIFT)
-	mtspr	SPRN_MAS1,r3
-	LOAD_REG_IMMEDIATE(r3, WSP_UART_VIRT | MAS2_I | MAS2_G)
-	mtspr	SPRN_MAS2,r3
-	LOAD_REG_IMMEDIATE(r3, WSP_UART_PHYS | MAS3_SR | MAS3_SW)
-	mtspr	SPRN_MAS7_MAS3,r3
-	/* re-use the MAS8 value from the linear mapping */
-	tlbwe
-#endif /* CONFIG_PPC_EARLY_DEBUG_WSP */
-
-	PPC_TLBILX(0,0,0)
+	PPC_TLBILX(0,0,R0)
 	sync
 	isync
 
@@ -1102,13 +1505,13 @@ _GLOBAL(start_initialization_book3e)
 	 * and always use AS 0, so we just set it up to match our link
 	 * address and never use 0 based addresses.
 	 */
-	bl	.initial_tlb_book3e
+	bl	initial_tlb_book3e
 
 	/* Init global core bits */
-	bl	.init_core_book3e
+	bl	init_core_book3e
 
 	/* Init per-thread bits */
-	bl	.init_thread_book3e
+	bl	init_thread_book3e
 
 	/* Return to common init code */
 	tovirt(r28,r28)
@@ -1129,7 +1532,7 @@ _GLOBAL(start_initialization_book3e)
  */
 _GLOBAL(book3e_secondary_core_init_tlb_set)
 	li	r4,1
-	b	.generic_secondary_smp_init
+	b	generic_secondary_smp_init
 
 _GLOBAL(book3e_secondary_core_init)
 	mflr	r28
@@ -1139,18 +1542,18 @@ _GLOBAL(book3e_secondary_core_init)
 	bne	2f
 
 	/* Setup TLB for this core */
-	bl	.initial_tlb_book3e
+	bl	initial_tlb_book3e
 
 	/* We can return from the above running at a different
 	 * address, so recalculate r2 (TOC)
 	 */
-	bl	.relative_toc
+	bl	relative_toc
 
 	/* Init global core bits */
-2:	bl	.init_core_book3e
+2:	bl	init_core_book3e
 
 	/* Init per-thread bits */
-3:	bl	.init_thread_book3e
+3:	bl	init_thread_book3e
 
 	/* Return to common init code at proper virtual address.
 	 *
@@ -1177,14 +1580,14 @@ _GLOBAL(book3e_secondary_thread_init)
 	mflr	r28
 	b	3b
 
-_STATIC(init_core_book3e)
+init_core_book3e:
 	/* Establish the interrupt vector base */
 	LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e)
 	mtspr	SPRN_IVPR,r3
 	sync
 	blr
 
-_STATIC(init_thread_book3e)
+init_thread_book3e:
 	lis	r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h
 	mtspr	SPRN_EPCR,r3
 
@@ -1221,6 +1624,11 @@ _GLOBAL(__setup_base_ivors)
 
 	blr
 
+_GLOBAL(setup_altivec_ivors)
+	SET_IVOR(32, 0x200) /* AltiVec Unavailable */
+	SET_IVOR(33, 0x220) /* AltiVec Assist */
+	blr
+
 _GLOBAL(setup_perfmon_ivor)
 	SET_IVOR(35, 0x260) /* Performance Monitor */
 	blr
@@ -1228,25 +1636,15 @@ _GLOBAL(setup_perfmon_ivor)
 _GLOBAL(setup_doorbell_ivors)
 	SET_IVOR(36, 0x280) /* Processor Doorbell */
 	SET_IVOR(37, 0x2a0) /* Processor Doorbell Crit */
-
-	/* Check MMUCFG[LPIDSIZE] to determine if we have category E.HV */
-	mfspr	r10,SPRN_MMUCFG
-	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
-	beqlr
-
-	SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */
-	SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */
 	blr
 
 _GLOBAL(setup_ehv_ivors)
-	/*
-	 * We may be running as a guest and lack E.HV even on a chip
-	 * that normally has it.
-	 */
-	mfspr	r10,SPRN_MMUCFG
-	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
-	beqlr
-
 	SET_IVOR(40, 0x300) /* Embedded Hypervisor System Call */
 	SET_IVOR(41, 0x320) /* Embedded Hypervisor Privilege */
+	SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */
+	SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */
+	blr
+
+_GLOBAL(setup_lrat_ivor)
+	SET_IVOR(42, 0x340) /* LRAT Error */
 	blr
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d4be7bb3dbd..a7d36b19221 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -12,18 +12,75 @@
  *
  */
 
+#include <asm/hw_irq.h>
 #include <asm/exception-64s.h>
 #include <asm/ptrace.h>
 
 /*
  * We layout physical memory as follows:
  * 0x0000 - 0x00ff : Secondary processor spin code
- * 0x0100 - 0x2fff : pSeries Interrupt prologs
- * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs
- * 0x6000 - 0x6fff : Initial (CPU0) segment table
+ * 0x0100 - 0x17ff : pSeries Interrupt prologs
+ * 0x1800 - 0x4000 : interrupt support common interrupt prologs
+ * 0x4000 - 0x5fff : pSeries interrupts with IR=1,DR=1
+ * 0x6000 - 0x6fff : more interrupt support including for IR=1,DR=1
  * 0x7000 - 0x7fff : FWNMI data area
- * 0x8000 -        : Early init and support code
+ * 0x8000 - 0x8fff : Initial (CPU0) segment table
+ * 0x9000 -        : Early init and support code
  */
+	/* Syscall routine is used twice, in reloc-off and reloc-on paths */
+#define SYSCALL_PSERIES_1 					\
+BEGIN_FTR_SECTION						\
+	cmpdi	r0,0x1ebe ; 					\
+	beq-	1f ;						\
+END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
+	mr	r9,r13 ;					\
+	GET_PACA(r13) ;						\
+	mfspr	r11,SPRN_SRR0 ;					\
+0:
+
+#define SYSCALL_PSERIES_2_RFID 					\
+	mfspr	r12,SPRN_SRR1 ;					\
+	ld	r10,PACAKBASE(r13) ; 				\
+	LOAD_HANDLER(r10, system_call_entry) ; 			\
+	mtspr	SPRN_SRR0,r10 ; 				\
+	ld	r10,PACAKMSR(r13) ;				\
+	mtspr	SPRN_SRR1,r10 ; 				\
+	rfid ; 							\
+	b	. ;	/* prevent speculative execution */
+
+#define SYSCALL_PSERIES_3					\
+	/* Fast LE/BE switch system call */			\
+1:	mfspr	r12,SPRN_SRR1 ;					\
+	xori	r12,r12,MSR_LE ;				\
+	mtspr	SPRN_SRR1,r12 ;					\
+	rfid ;		/* return to userspace */		\
+	b	. ;	/* prevent speculative execution */
+
+#if defined(CONFIG_RELOCATABLE)
+	/*
+	 * We can't branch directly; in the direct case we use LR
+	 * and system_call_entry restores LR.  (We thus need to move
+	 * LR to r10 in the RFID case too.)
+	 */
+#define SYSCALL_PSERIES_2_DIRECT				\
+	mflr	r10 ;						\
+	ld	r12,PACAKBASE(r13) ; 				\
+	LOAD_HANDLER(r12, system_call_entry_direct) ;		\
+	mtctr	r12 ;						\
+	mfspr	r12,SPRN_SRR1 ;					\
+	/* Re-use of r13... No spare regs to do this */	\
+	li	r13,MSR_RI ;					\
+	mtmsrd 	r13,1 ;						\
+	GET_PACA(r13) ;	/* get r13 back */			\
+	bctr ;
+#else
+	/* We can branch directly */
+#define SYSCALL_PSERIES_2_DIRECT				\
+	mfspr	r12,SPRN_SRR1 ;					\
+	li	r10,MSR_RI ;					\
+	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
+	b	system_call_entry_direct ;
+#endif
 
 /*
  * This is the start of the interrupt handlers for pSeries
@@ -39,7 +96,7 @@ __start_interrupts:
 
 	.globl system_reset_pSeries;
 system_reset_pSeries:
-	HMT_MEDIUM;
+	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -56,24 +113,32 @@ BEGIN_FTR_SECTION
 	cmpwi	cr1,r13,2
 	/* Total loss of HV state is fatal, we could try to use the
 	 * PIR to locate a PACA, then use an emergency stack etc...
-	 * but for now, let's just stay stuck here
+	 * OPAL v3 based powernv platforms have new idle states
+	 * which fall in this catagory.
 	 */
-	bgt	cr1,.
+	bgt	cr1,8f
 	GET_PACA(r13)
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
-	lbz	r0,PACAPROCSTART(r13)
-	cmpwi	r0,0x80
-	bne	1f
-	li	r0,1
-	stb	r0,PACAPROCSTART(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
 	b	kvm_start_guest
 1:
 #endif
 
 	beq	cr1,2f
-	b	.power7_wakeup_noloss
-2:	b	.power7_wakeup_loss
+	b	power7_wakeup_noloss
+2:	b	power7_wakeup_loss
+
+	/* Fast Sleep wakeup on PowerNV */
+8:	GET_PACA(r13)
+	b 	power7_wakeup_tb_loss
+
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
@@ -86,28 +151,57 @@ machine_check_pSeries_1:
 	 * some code path might still want to branch into the original
 	 * vector
 	 */
-	b	machine_check_pSeries
+	HMT_MEDIUM_PPR_DISCARD
+	SET_SCRATCH0(r13)		/* save r13 */
+#ifdef CONFIG_PPC_P7_NAP
+BEGIN_FTR_SECTION
+	/* Running native on arch 2.06 or later, check if we are
+	 * waking up from nap. We only handle no state loss and
+	 * supervisor state loss. We do -not- handle hypervisor
+	 * state loss at this time.
+	 */
+	mfspr	r13,SPRN_SRR1
+	rlwinm.	r13,r13,47-31,30,31
+	OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
+	beq	9f
+
+	mfspr	r13,SPRN_SRR1
+	rlwinm.	r13,r13,47-31,30,31
+	/* waking up from powersave (nap) state */
+	cmpwi	cr1,r13,2
+	/* Total loss of HV state is fatal. let's just stay stuck here */
+	OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
+	bgt	cr1,.
+9:
+	OPT_SET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#endif /* CONFIG_PPC_P7_NAP */
+	EXCEPTION_PROLOG_0(PACA_EXMC)
+BEGIN_FTR_SECTION
+	b	machine_check_pSeries_early
+FTR_SECTION_ELSE
+	b	machine_check_pSeries_0
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 
 	. = 0x300
 	.globl data_access_pSeries
 data_access_pSeries:
-	HMT_MEDIUM
+	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
-#ifndef CONFIG_POWER4_ONLY
 BEGIN_FTR_SECTION
 	b	data_access_check_stab
 data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
-#endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST_PR, 0x300)
+				 KVMTEST, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
-	HMT_MEDIUM
+	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -117,16 +211,16 @@ data_access_slb_pSeries:
 #endif /* __DISABLED__ */
 	mfspr	r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
-	b	.slb_miss_realmode
+	b	slb_miss_realmode
 #else
 	/*
-	 * We can't just use a direct branch to .slb_miss_realmode
+	 * We can't just use a direct branch to slb_miss_realmode
 	 * because the distance from here to there depends on where
 	 * the kernel ends up being put.
 	 */
 	mfctr	r11
 	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10, .slb_miss_realmode)
+	LOAD_HANDLER(r10, slb_miss_realmode)
 	mtctr	r10
 	bctr
 #endif
@@ -136,8 +230,9 @@ data_access_slb_pSeries:
 	. = 0x480
 	.globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
-	HMT_MEDIUM
+	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
@@ -148,11 +243,11 @@ instruction_access_slb_pSeries:
 #endif /* __DISABLED__ */
 	mfspr	r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
-	b	.slb_miss_realmode
+	b	slb_miss_realmode
 #else
 	mfctr	r11
 	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10, .slb_miss_realmode)
+	LOAD_HANDLER(r10, slb_miss_realmode)
 	mtctr	r10
 	bctr
 #endif
@@ -165,6 +260,7 @@ instruction_access_slb_pSeries:
 	.globl hardware_interrupt_hv;
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
+	HMT_MEDIUM_PPR_DISCARD
 	BEGIN_FTR_SECTION
 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
 					    EXC_HV, SOFTEN_TEST_HV)
@@ -184,10 +280,14 @@ hardware_interrupt_hv:
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
 
-	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
-	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
+	. = 0x900
+	.globl decrementer_pSeries
+decrementer_pSeries:
+	_MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR)
+
+	STD_EXCEPTION_HV(0x980, 0x982, hdecrementer)
 
-	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
+	MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
 
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
@@ -206,31 +306,11 @@ system_call_pSeries:
 	KVMTEST(0xc00)
 	GET_SCRATCH0(r13)
 #endif
-BEGIN_FTR_SECTION
-	cmpdi	r0,0x1ebe
-	beq-	1f
-END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
-	mr	r9,r13
-	GET_PACA(r13)
-	mfspr	r11,SPRN_SRR0
-	mfspr	r12,SPRN_SRR1
-	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10, system_call_entry)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	mtspr	SPRN_SRR1,r10
-	rfid
-	b	.	/* prevent speculative execution */
-
+	SYSCALL_PSERIES_1
+	SYSCALL_PSERIES_2_RFID
+	SYSCALL_PSERIES_3
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
 
-/* Fast LE/BE switch system call */
-1:	mfspr	r12,SPRN_SRR1
-	xori	r12,r12,MSR_LE
-	mtspr	SPRN_SRR1,r12
-	rfid		/* return to userspace */
-	b	.
-
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
 
@@ -238,33 +318,70 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	 * out of line to handle them
 	 */
 	. = 0xe00
+hv_data_storage_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	h_data_storage_hv
+
 	. = 0xe20
+hv_instr_storage_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	h_instr_storage_hv
+
 	. = 0xe40
+emulation_assist_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	emulation_assist_hv
-	. = 0xe50
-	b	hmi_exception_hv
+
 	. = 0xe60
+hv_exception_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	hmi_exception_hv
 
+	. = 0xe80
+hv_doorbell_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	h_doorbell_hv
+
 	/* We need to deal with the Altivec unavailable exception
 	 * here which is at 0xf20, thus in the middle of the
 	 * prolog code of the PerformanceMonitor one. A little
 	 * trickery is thus necessary
 	 */
-performance_monitor_pSeries_1:
 	. = 0xf00
+performance_monitor_pseries_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	performance_monitor_pSeries
 
-altivec_unavailable_pSeries_1:
 	. = 0xf20
+altivec_unavailable_pseries_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	altivec_unavailable_pSeries
 
-vsx_unavailable_pSeries_1:
 	. = 0xf40
+vsx_unavailable_pseries_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	vsx_unavailable_pSeries
 
+	. = 0xf60
+facility_unavailable_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	facility_unavailable_pSeries
+
+	. = 0xf80
+hv_facility_unavailable_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	facility_unavailable_hv
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
@@ -273,6 +390,26 @@ vsx_unavailable_pSeries_1:
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
 	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
+	. = 0x1500
+	.global denorm_exception_hv
+denorm_exception_hv:
+	HMT_MEDIUM_PPR_DISCARD
+	mtspr	SPRN_SPRG_HSCRATCH0,r13
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+	mfspr	r10,SPRN_HSRR1
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
+	addi	r11,r11,-4		/* HSRR0 is next instruction */
+	bne+	denorm_assist
+#endif
+
+	KVMTEST(0x1500)
+	EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1500)
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
@@ -284,23 +421,100 @@ vsx_unavailable_pSeries_1:
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
+#else
+	. = 0x1800
 #endif /* CONFIG_CBE_RAS */
 
-	. = 0x3000
 
 /*** Out of line interrupts support ***/
 
+	.align	7
 	/* moved from 0x200 */
+machine_check_pSeries_early:
+BEGIN_FTR_SECTION
+	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
+	/*
+	 * Register contents:
+	 * R13		= PACA
+	 * R9		= CR
+	 * Original R9 to R13 is saved on PACA_EXMC
+	 *
+	 * Switch to mc_emergency stack and handle re-entrancy (we limit
+	 * the nested MCE upto level 4 to avoid stack overflow).
+	 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
+	 *
+	 * We use paca->in_mce to check whether this is the first entry or
+	 * nested machine check. We increment paca->in_mce to track nested
+	 * machine checks.
+	 *
+	 * If this is the first entry then set stack pointer to
+	 * paca->mc_emergency_sp, otherwise r1 is already pointing to
+	 * stack frame on mc_emergency stack.
+	 *
+	 * NOTE: We are here with MSR_ME=0 (off), which means we risk a
+	 * checkstop if we get another machine check exception before we do
+	 * rfid with MSR_ME=1.
+	 */
+	mr	r11,r1			/* Save r1 */
+	lhz	r10,PACA_IN_MCE(r13)
+	cmpwi	r10,0			/* Are we in nested machine check */
+	bne	0f			/* Yes, we are. */
+	/* First machine check entry */
+	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
+0:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
+	addi	r10,r10,1		/* increment paca->in_mce */
+	sth	r10,PACA_IN_MCE(r13)
+	/* Limit nested MCE to level 4 to avoid stack overflow */
+	cmpwi	r10,4
+	bgt	2f			/* Check if we hit limit of 4 */
+	std	r11,GPR1(r1)		/* Save r1 on the stack. */
+	std	r11,0(r1)		/* make stack chain pointer */
+	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
+	std	r11,_NIP(r1)
+	mfspr	r11,SPRN_SRR1		/* Save SRR1 */
+	std	r11,_MSR(r1)
+	mfspr	r11,SPRN_DAR		/* Save DAR */
+	std	r11,_DAR(r1)
+	mfspr	r11,SPRN_DSISR		/* Save DSISR */
+	std	r11,_DSISR(r1)
+	std	r9,_CCR(r1)		/* Save CR in stackframe */
+	/* Save r9 through r13 from EXMC save area to stack frame. */
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+	mfmsr	r11			/* get MSR value */
+	ori	r11,r11,MSR_ME		/* turn on ME bit */
+	ori	r11,r11,MSR_RI		/* turn on RI bit */
+	ld	r12,PACAKBASE(r13)	/* get high part of &label */
+	LOAD_HANDLER(r12, machine_check_handle_early)
+1:	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r11
+	rfid
+	b	.	/* prevent speculative execution */
+2:
+	/* Stack overflow. Stay on emergency stack and panic.
+	 * Keep the ME bit off while panic-ing, so that if we hit
+	 * another machine check we checkstop.
+	 */
+	addi	r1,r1,INT_FRAME_SIZE	/* go back to previous stack frame */
+	ld	r11,PACAKMSR(r13)
+	ld	r12,PACAKBASE(r13)
+	LOAD_HANDLER(r12, unrecover_mce)
+	li	r10,MSR_ME
+	andc	r11,r11,r10		/* Turn off MSR_ME */
+	b	1b
+	b	.	/* prevent speculative execution */
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+
 machine_check_pSeries:
 	.globl machine_check_fwnmi
 machine_check_fwnmi:
-	HMT_MEDIUM
+	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common,
-				 EXC_STD, KVMTEST, 0x200)
+	EXCEPTION_PROLOG_0(PACA_EXMC)
+machine_check_pSeries_0:
+	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST, 0x200)
+	EXCEPTION_PROLOG_PSERIES_1(machine_check_common, EXC_STD)
 	KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
 
-#ifndef CONFIG_POWER4_ONLY
 	/* moved from 0x300 */
 data_access_check_stab:
 	GET_PACA(r13)
@@ -310,7 +524,7 @@ data_access_check_stab:
 	mfspr	r9,SPRN_DSISR
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	lbz	r9,HSTATE_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
@@ -326,64 +540,172 @@ do_stab_bolted_pSeries:
 	std	r12,PACA_EXSLB+EX_R12(r13)
 	GET_SCRATCH0(r10)
 	std	r10,PACA_EXSLB+EX_R13(r13)
-	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
-#endif /* CONFIG_POWER4_ONLY */
+	EXCEPTION_PROLOG_PSERIES_1(do_stab_bolted, EXC_STD)
 
-	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
 	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
 
+#ifdef CONFIG_PPC_DENORMALISATION
+denorm_assist:
+BEGIN_FTR_SECTION
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER6 do that here for all FP regs.
+ */
+	mfmsr	r10
+	ori	r10,r10,(MSR_FP|MSR_FE0|MSR_FE1)
+	xori	r10,r10,(MSR_FE0|MSR_FE1)
+	mtmsrd	r10
+	sync
+
+#define FMR2(n)  fmr (n), (n) ; fmr n+1, n+1
+#define FMR4(n)  FMR2(n) ; FMR2(n+2)
+#define FMR8(n)  FMR4(n) ; FMR4(n+4)
+#define FMR16(n) FMR8(n) ; FMR8(n+8)
+#define FMR32(n) FMR16(n) ; FMR16(n+16)
+	FMR32(0)
+
+FTR_SECTION_ELSE
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER7 do that here for the first 32 VSX registers only.
+ */
+	mfmsr	r10
+	oris	r10,r10,MSR_VSX@h
+	mtmsrd	r10
+	sync
+
+#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1)
+#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2)
+#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4)
+#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8)
+#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16)
+	XVCPSGNDP32(0)
+
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206)
+
+BEGIN_FTR_SECTION
+	b	denorm_done
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER8 we need to do that for all 64 VSX registers
+ */
+	XVCPSGNDP32(32)
+denorm_done:
+	mtspr	SPRN_HSRR0,r11
+	mtcrf	0x80,r9
+	ld	r9,PACA_EXGEN+EX_R9(r13)
+	RESTORE_PPR_PACA(PACA_EXGEN, r10)
+BEGIN_FTR_SECTION
+	ld	r10,PACA_EXGEN+EX_CFAR(r13)
+	mtspr	SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	ld	r12,PACA_EXGEN+EX_R12(r13)
+	ld	r13,PACA_EXGEN+EX_R13(r13)
+	HRFID
+	b	.
+#endif
+
 	.align	7
 	/* moved from 0xe00 */
-	STD_EXCEPTION_HV(., 0xe02, h_data_storage)
+	STD_EXCEPTION_HV_OOL(0xe02, h_data_storage)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
-	STD_EXCEPTION_HV(., 0xe22, h_instr_storage)
+	STD_EXCEPTION_HV_OOL(0xe22, h_instr_storage)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
-	STD_EXCEPTION_HV(., 0xe42, emulation_assist)
+	STD_EXCEPTION_HV_OOL(0xe42, emulation_assist)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
-	STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */
+	STD_EXCEPTION_HV_OOL(0xe62, hmi_exception) /* need to flush cache ? */
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
+	MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
 
 	/* moved from 0xf00 */
-	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
+	STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
-	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
+	STD_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
-	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
+	STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
+	STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
+	STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
 
 /*
- * An interrupt came in while soft-disabled; clear EE in SRR1,
- * clear paca->hard_enabled and return.
+ * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
+ * - If it was a decrementer interrupt, we bump the dec to max and and return.
+ * - If it was a doorbell we return immediately since doorbells are edge
+ *   triggered and won't automatically refire.
+ * - else we hard disable and return.
+ * This is called with r10 containing the value to OR to the paca field.
  */
-masked_interrupt:
-	stb	r10,PACAHARDIRQEN(r13)
-	mtcrf	0x80,r9
-	ld	r9,PACA_EXGEN+EX_R9(r13)
-	mfspr	r10,SPRN_SRR1
-	rldicl	r10,r10,48,1		/* clear MSR_EE */
-	rotldi	r10,r10,16
-	mtspr	SPRN_SRR1,r10
-	ld	r10,PACA_EXGEN+EX_R10(r13)
-	GET_SCRATCH0(r13)
-	rfid
+#define MASKED_INTERRUPT(_H)				\
+masked_##_H##interrupt:					\
+	std	r11,PACA_EXGEN+EX_R11(r13);		\
+	lbz	r11,PACAIRQHAPPENED(r13);		\
+	or	r11,r11,r10;				\
+	stb	r11,PACAIRQHAPPENED(r13);		\
+	cmpwi	r10,PACA_IRQ_DEC;			\
+	bne	1f;					\
+	lis	r10,0x7fff;				\
+	ori	r10,r10,0xffff;				\
+	mtspr	SPRN_DEC,r10;				\
+	b	2f;					\
+1:	cmpwi	r10,PACA_IRQ_DBELL;			\
+	beq	2f;					\
+	mfspr	r10,SPRN_##_H##SRR1;			\
+	rldicl	r10,r10,48,1; /* clear MSR_EE */	\
+	rotldi	r10,r10,16;				\
+	mtspr	SPRN_##_H##SRR1,r10;			\
+2:	mtcrf	0x80,r9;				\
+	ld	r9,PACA_EXGEN+EX_R9(r13);		\
+	ld	r10,PACA_EXGEN+EX_R10(r13);		\
+	ld	r11,PACA_EXGEN+EX_R11(r13);		\
+	GET_SCRATCH0(r13);				\
+	##_H##rfid;					\
 	b	.
+	
+	MASKED_INTERRUPT()
+	MASKED_INTERRUPT(H)
 
-masked_Hinterrupt:
-	stb	r10,PACAHARDIRQEN(r13)
-	mtcrf	0x80,r9
-	ld	r9,PACA_EXGEN+EX_R9(r13)
-	mfspr	r10,SPRN_HSRR1
-	rldicl	r10,r10,48,1		/* clear MSR_EE */
-	rotldi	r10,r10,16
-	mtspr	SPRN_HSRR1,r10
-	ld	r10,PACA_EXGEN+EX_R10(r13)
-	GET_SCRATCH0(r13)
-	hrfid
-	b	.
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate
+ * which kind of interrupt. MSR:EE is already off. We generate a
+ * stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ */
+_GLOBAL(__replay_interrupt)
+	/* We are going to jump to the exception common code which
+	 * will retrieve various register values from the PACA which
+	 * we don't give a damn about, so we don't bother storing them.
+	 */
+	mfmsr	r12
+	mflr	r11
+	mfcr	r9
+	ori	r12,r12,MSR_EE
+	cmpwi	r3,0x900
+	beq	decrementer_common
+	cmpwi	r3,0x500
+	beq	hardware_interrupt_common
+BEGIN_FTR_SECTION
+	cmpwi	r3,0xe80
+	beq	h_doorbell_common
+FTR_SECTION_ELSE
+	cmpwi	r3,0xa00
+	beq	doorbell_super_common
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+	blr
 
 #ifdef CONFIG_PPC_PSERIES
 /*
@@ -392,7 +714,7 @@ masked_Hinterrupt:
 	.globl system_reset_fwnmi
       .align 7
 system_reset_fwnmi:
-	HMT_MEDIUM
+	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)		/* save r13 */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
 				 NOTEST, 0x100)
@@ -427,61 +749,241 @@ slb_miss_user_pseries:
 	b	.				/* prevent spec. execution */
 #endif /* __DISABLED__ */
 
-	.align	7
-	.globl	__end_interrupts
-__end_interrupts:
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+kvmppc_skip_interrupt:
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_SRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_SRR0, r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
+
+kvmppc_skip_Hinterrupt:
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_HSRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_HSRR0, r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+#endif
 
 /*
  * Code from here down to __end_handlers is invoked from the
  * exception prologs above.  Because the prologs assemble the
  * addresses of these handlers using the LOAD_HANDLER macro,
- * which uses an addi instruction, these handlers must be in
- * the first 32k of the kernel image.
+ * which uses an ori instruction, these handlers must be in
+ * the first 64k of the kernel image.
  */
 
 /*** Common interrupt handlers ***/
 
-	STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception)
+	STD_EXCEPTION_COMMON(0x100, system_reset, system_reset_exception)
 
-	/*
-	 * Machine check is different because we use a different
-	 * save area: PACA_EXMC instead of PACA_EXGEN.
-	 */
-	.align	7
-	.globl machine_check_common
-machine_check_common:
-	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
-	FINISH_NAP
-	DISABLE_INTS
-	bl	.save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.machine_check_exception
-	b	.ret_from_except
-
-	STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt)
-	STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception)
-	STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception)
-	STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception)
-	STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception)
-        STD_EXCEPTION_COMMON(0xe40, emulation_assist, .program_check_exception)
-        STD_EXCEPTION_COMMON(0xe60, hmi_exception, .unknown_exception)
-	STD_EXCEPTION_COMMON_IDLE(0xf00, performance_monitor, .performance_monitor_exception)
-	STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception)
+	STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ)
+	STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, timer_interrupt)
+	STD_EXCEPTION_COMMON(0x980, hdecrementer, hdec_interrupt)
+#ifdef CONFIG_PPC_DOORBELL
+	STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, doorbell_exception)
+#else
+	STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, unknown_exception)
+#endif
+	STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception)
+	STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception)
+	STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception)
+	STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt)
+	STD_EXCEPTION_COMMON(0xe60, hmi_exception, unknown_exception)
+#ifdef CONFIG_PPC_DOORBELL
+	STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, doorbell_exception)
+#else
+	STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception)
+#endif
+	STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception)
+	STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception)
+	STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception)
 #ifdef CONFIG_ALTIVEC
-	STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception)
+	STD_EXCEPTION_COMMON(0x1700, altivec_assist, altivec_assist_exception)
 #else
-	STD_EXCEPTION_COMMON(0x1700, altivec_assist, .unknown_exception)
+	STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception)
 #endif
 #ifdef CONFIG_CBE_RAS
-	STD_EXCEPTION_COMMON(0x1200, cbe_system_error, .cbe_system_error_exception)
-	STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, .cbe_maintenance_exception)
-	STD_EXCEPTION_COMMON(0x1800, cbe_thermal, .cbe_thermal_exception)
+	STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
+	STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
+	STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
 #endif /* CONFIG_CBE_RAS */
 
+	/*
+	 * Relocation-on interrupts: A subset of the interrupts can be delivered
+	 * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
+	 * it.  Addresses are the same as the original interrupt addresses, but
+	 * offset by 0xc000000000004000.
+	 * It's impossible to receive interrupts below 0x300 via this mechanism.
+	 * KVM: None of these traps are from the guest ; anything that escalated
+	 * to HV=1 from HV=0 is delivered via real mode handlers.
+	 */
+
+	/*
+	 * This uses the standard macro, since the original 0x300 vector
+	 * only has extra guff for STAB-based processors -- which never
+	 * come here.
+	 */
+	STD_RELON_EXCEPTION_PSERIES(0x4300, 0x300, data_access)
+	. = 0x4380
+	.globl data_access_slb_relon_pSeries
+data_access_slb_relon_pSeries:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_DAR
+	mfspr	r12,SPRN_SRR1
+#ifndef CONFIG_RELOCATABLE
+	b	slb_miss_realmode
+#else
+	/*
+	 * We can't just use a direct branch to slb_miss_realmode
+	 * because the distance from here to there depends on where
+	 * the kernel ends up being put.
+	 */
+	mfctr	r11
+	ld	r10,PACAKBASE(r13)
+	LOAD_HANDLER(r10, slb_miss_realmode)
+	mtctr	r10
+	bctr
+#endif
+
+	STD_RELON_EXCEPTION_PSERIES(0x4400, 0x400, instruction_access)
+	. = 0x4480
+	.globl instruction_access_slb_relon_pSeries
+instruction_access_slb_relon_pSeries:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
+	mfspr	r12,SPRN_SRR1
+#ifndef CONFIG_RELOCATABLE
+	b	slb_miss_realmode
+#else
+	mfctr	r11
+	ld	r10,PACAKBASE(r13)
+	LOAD_HANDLER(r10, slb_miss_realmode)
+	mtctr	r10
+	bctr
+#endif
+
+	. = 0x4500
+	.globl hardware_interrupt_relon_pSeries;
+	.globl hardware_interrupt_relon_hv;
+hardware_interrupt_relon_pSeries:
+hardware_interrupt_relon_hv:
+	BEGIN_FTR_SECTION
+		_MASKABLE_RELON_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV, SOFTEN_TEST_HV)
+	FTR_SECTION_ELSE
+		_MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD, SOFTEN_TEST_PR)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+	STD_RELON_EXCEPTION_PSERIES(0x4600, 0x600, alignment)
+	STD_RELON_EXCEPTION_PSERIES(0x4700, 0x700, program_check)
+	STD_RELON_EXCEPTION_PSERIES(0x4800, 0x800, fp_unavailable)
+	MASKABLE_RELON_EXCEPTION_PSERIES(0x4900, 0x900, decrementer)
+	STD_RELON_EXCEPTION_HV(0x4980, 0x982, hdecrementer)
+	MASKABLE_RELON_EXCEPTION_PSERIES(0x4a00, 0xa00, doorbell_super)
+	STD_RELON_EXCEPTION_PSERIES(0x4b00, 0xb00, trap_0b)
+
+	. = 0x4c00
+	.globl system_call_relon_pSeries
+system_call_relon_pSeries:
+	HMT_MEDIUM
+	SYSCALL_PSERIES_1
+	SYSCALL_PSERIES_2_DIRECT
+	SYSCALL_PSERIES_3
+
+	STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
+
+	. = 0x4e00
+	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
+
+	. = 0x4e20
+	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
+
+	. = 0x4e40
+emulation_assist_relon_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	emulation_assist_relon_hv
+
+	. = 0x4e60
+	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
+
+	. = 0x4e80
+h_doorbell_relon_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	h_doorbell_relon_hv
+
+	. = 0x4f00
+performance_monitor_relon_pseries_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	performance_monitor_relon_pSeries
+
+	. = 0x4f20
+altivec_unavailable_relon_pseries_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	altivec_unavailable_relon_pSeries
+
+	. = 0x4f40
+vsx_unavailable_relon_pseries_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	vsx_unavailable_relon_pSeries
+
+	. = 0x4f60
+facility_unavailable_relon_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	facility_unavailable_relon_pSeries
+
+	. = 0x4f80
+hv_facility_unavailable_relon_trampoline:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	hv_facility_unavailable_relon_hv
+
+	STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
+#ifdef CONFIG_PPC_DENORMALISATION
+	. = 0x5500
+	b	denorm_exception_hv
+#endif
+	STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
+
+	/* Other future vectors */
+	.align	7
+	.globl	__end_interrupts
+__end_interrupts:
+
 	.align	7
+system_call_entry_direct:
+#if defined(CONFIG_RELOCATABLE)
+	/* The first level prologue may have used LR to get here, saving
+	 * orig in r10.  To save hacking/ifdeffing common code, restore here.
+	 */
+	mtlr	r10
+#endif
 system_call_entry:
 	b	system_call_common
 
+ppc64_runlatch_on_trampoline:
+	b	__ppc64_runlatch_on
+
 /*
  * Here we have detected that the kernel stack pointer is bad.
  * R9 contains the saved CR, r13 points to the paca,
@@ -539,7 +1041,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	std	r12,RESULT(r1)
 	std	r11,STACK_FRAME_OVERHEAD-16(r1)
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.kernel_bad_stack
+	bl	kernel_bad_stack
 	b	1b
 
 /*
@@ -555,34 +1057,39 @@ data_access_common:
 	mfspr	r10,SPRN_DSISR
 	stw	r10,PACA_EXGEN+EX_DSISR(r13)
 	EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
+	DISABLE_INTS
+	ld	r12,_MSR(r1)
 	ld	r3,PACA_EXGEN+EX_DAR(r13)
 	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
 	li	r5,0x300
-	b	.do_hash_page	 	/* Try to handle as hpte fault */
+	b	do_hash_page		/* Try to handle as hpte fault */
 
 	.align  7
-        .globl  h_data_storage_common
+	.globl  h_data_storage_common
 h_data_storage_common:
-        mfspr   r10,SPRN_HDAR
-        std     r10,PACA_EXGEN+EX_DAR(r13)
-        mfspr   r10,SPRN_HDSISR
-        stw     r10,PACA_EXGEN+EX_DSISR(r13)
-        EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
-        bl      .save_nvgprs
-        addi    r3,r1,STACK_FRAME_OVERHEAD
-        bl      .unknown_exception
-        b       .ret_from_except
+	mfspr   r10,SPRN_HDAR
+	std     r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr   r10,SPRN_HDSISR
+	stw     r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
+	bl      save_nvgprs
+	DISABLE_INTS
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      unknown_exception
+	b       ret_from_except
 
 	.align	7
 	.globl instruction_access_common
 instruction_access_common:
 	EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
+	DISABLE_INTS
+	ld	r12,_MSR(r1)
 	ld	r3,_NIP(r1)
 	andis.	r4,r12,0x5820
 	li	r5,0x400
-	b	.do_hash_page		/* Try to handle as hpte fault */
+	b	do_hash_page		/* Try to handle as hpte fault */
 
-        STD_EXCEPTION_COMMON(0xe20, h_instr_storage, .unknown_exception)
+	STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
 
 /*
  * Here is the common SLB miss user that is used when going to virtual
@@ -597,7 +1104,7 @@ slb_miss_user_common:
 	stw	r9,PACA_EXGEN+EX_CCR(r13)
 	std	r10,PACA_EXGEN+EX_LR(r13)
 	std	r11,PACA_EXGEN+EX_SRR0(r13)
-	bl	.slb_allocate_user
+	bl	slb_allocate_user
 
 	ld	r10,PACA_EXGEN+EX_LR(r13)
 	ld	r3,PACA_EXGEN+EX_R3(r13)
@@ -640,116 +1147,37 @@ slb_miss_fault:
 unrecov_user_slb:
 	EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
 	DISABLE_INTS
-	bl	.save_nvgprs
+	bl	save_nvgprs
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.unrecoverable_exception
+	bl	unrecoverable_exception
 	b	1b
 
 #endif /* __DISABLED__ */
 
 
-/*
- * r13 points to the PACA, r9 contains the saved CR,
- * r12 contain the saved SRR1, SRR0 is still ready for return
- * r3 has the faulting address
- * r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
- * We assume we aren't going to take any exceptions during this procedure.
- */
-_GLOBAL(slb_miss_realmode)
-	mflr	r10
-#ifdef CONFIG_RELOCATABLE
-	mtctr	r11
-#endif
-
-	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
-	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
-
-	bl	.slb_allocate_realmode
-
-	/* All done -- return from exception. */
-
-	ld	r10,PACA_EXSLB+EX_LR(r13)
-	ld	r3,PACA_EXSLB+EX_R3(r13)
-	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	ld	r11,PACALPPACAPTR(r13)
-	ld	r11,LPPACASRR0(r11)		/* get SRR0 value */
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
-
-	mtlr	r10
-
-	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
-	beq-	2f
-
-.machine	push
-.machine	"power4"
-	mtcrf	0x80,r9
-	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
-.machine	pop
-
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	mtspr	SPRN_SRR0,r11
-	mtspr	SPRN_SRR1,r12
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
-	ld	r9,PACA_EXSLB+EX_R9(r13)
-	ld	r10,PACA_EXSLB+EX_R10(r13)
-	ld	r11,PACA_EXSLB+EX_R11(r13)
-	ld	r12,PACA_EXSLB+EX_R12(r13)
-	ld	r13,PACA_EXSLB+EX_R13(r13)
-	rfid
-	b	.	/* prevent speculative execution */
-
-2:
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	b	unrecov_slb
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
-	mfspr	r11,SPRN_SRR0
-	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10,unrecov_slb)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	mtspr	SPRN_SRR1,r10
-	rfid
-	b	.
-
-unrecov_slb:
-	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
-	DISABLE_INTS
-	bl	.save_nvgprs
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.unrecoverable_exception
-	b	1b
-
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
 	.align	7
-	.globl hardware_interrupt_common
-	.globl hardware_interrupt_entry
-hardware_interrupt_common:
-	EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN)
+	.globl machine_check_common
+machine_check_common:
+
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
 	FINISH_NAP
-hardware_interrupt_entry:
 	DISABLE_INTS
-BEGIN_FTR_SECTION
-	bl	.ppc64_runlatch_on
-END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_IRQ
-	b	.ret_from_except_lite
-
-#ifdef CONFIG_PPC_970_NAP
-power4_fixup_nap:
-	andc	r9,r9,r10
-	std	r9,TI_LOCAL_FLAGS(r11)
-	ld	r10,_LINK(r1)		/* make idle task do the */
-	std	r10,_NIP(r1)		/* equivalent of a blr */
-	blr
-#endif
+	bl	machine_check_exception
+	b	ret_from_except
 
 	.align	7
 	.globl alignment_common
@@ -763,35 +1191,52 @@ alignment_common:
 	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
 	std	r3,_DAR(r1)
 	std	r4,_DSISR(r1)
-	bl	.save_nvgprs
+	bl	save_nvgprs
+	DISABLE_INTS
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
-	bl	.alignment_exception
-	b	.ret_from_except
+	bl	alignment_exception
+	b	ret_from_except
 
 	.align	7
 	.globl program_check_common
 program_check_common:
 	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
-	bl	.save_nvgprs
+	bl	save_nvgprs
+	DISABLE_INTS
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
-	bl	.program_check_exception
-	b	.ret_from_except
+	bl	program_check_exception
+	b	ret_from_except
 
 	.align	7
 	.globl fp_unavailable_common
 fp_unavailable_common:
 	EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
 	bne	1f			/* if from user, just load it up */
-	bl	.save_nvgprs
+	bl	save_nvgprs
+	DISABLE_INTS
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
-	bl	.kernel_fp_unavailable_exception
+	bl	kernel_fp_unavailable_exception
 	BUG_OPCODE
-1:	bl	.load_up_fpu
+1:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	/* Test if 2 TM state bits are zero.  If non-zero (ie. userspace was in
+	 * transaction), go do TM stuff
+	 */
+	rldicl.	r0, r12, (64-MSR_TS_LG), (64-2)
+	bne-	2f
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+	bl	load_up_fpu
 	b	fast_exception_return
-
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2:	/* User process was in a transaction */
+	bl	save_nvgprs
+	DISABLE_INTS
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	fp_unavailable_tm
+	b	ret_from_except
+#endif
 	.align	7
 	.globl altivec_unavailable_common
 altivec_unavailable_common:
@@ -799,16 +1244,33 @@ altivec_unavailable_common:
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
 	beq	1f
-	bl	.load_up_altivec
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+  BEGIN_FTR_SECTION_NESTED(69)
+	/* Test if 2 TM state bits are zero.  If non-zero (ie. userspace was in
+	 * transaction), go do TM stuff
+	 */
+	rldicl.	r0, r12, (64-MSR_TS_LG), (64-2)
+	bne-	2f
+  END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
+#endif
+	bl	load_up_altivec
 	b	fast_exception_return
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2:	/* User process was in a transaction */
+	bl	save_nvgprs
+	DISABLE_INTS
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	altivec_unavailable_tm
+	b	ret_from_except
+#endif
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
-	bl	.save_nvgprs
+	bl	save_nvgprs
+	DISABLE_INTS
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
-	bl	.altivec_unavailable_exception
-	b	.ret_from_except
+	bl	altivec_unavailable_exception
+	b	ret_from_except
 
 	.align	7
 	.globl vsx_unavailable_common
@@ -816,85 +1278,315 @@ vsx_unavailable_common:
 	EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN)
 #ifdef CONFIG_VSX
 BEGIN_FTR_SECTION
-	bne	.load_up_vsx
+	beq	1f
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+  BEGIN_FTR_SECTION_NESTED(69)
+	/* Test if 2 TM state bits are zero.  If non-zero (ie. userspace was in
+	 * transaction), go do TM stuff
+	 */
+	rldicl.	r0, r12, (64-MSR_TS_LG), (64-2)
+	bne-	2f
+  END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
+#endif
+	b	load_up_vsx
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2:	/* User process was in a transaction */
+	bl	save_nvgprs
+	DISABLE_INTS
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	vsx_unavailable_tm
+	b	ret_from_except
+#endif
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
-	bl	.save_nvgprs
+	bl	save_nvgprs
+	DISABLE_INTS
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	ENABLE_INTS
-	bl	.vsx_unavailable_exception
-	b	.ret_from_except
+	bl	vsx_unavailable_exception
+	b	ret_from_except
+
+	STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
+	STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
 
 	.align	7
 	.globl	__end_handlers
 __end_handlers:
 
+	/* Equivalents to the above handlers for relocation-on interrupt vectors */
+	STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
+	MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
+
+	STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
+	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
+	STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
+	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+	STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
+
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
 /*
- * Return from an exception with minimal checks.
- * The caller is assumed to have done EXCEPTION_PROLOG_COMMON.
- * If interrupts have been enabled, or anything has been
- * done that might have changed the scheduling status of
- * any task or sent any task a signal, you should use
- * ret_from_except or ret_from_except_lite instead of this.
+ * Data area reserved for FWNMI option.
+ * This address (0x7000) is fixed by the RPA.
  */
-fast_exc_return_irq:			/* restores irq state too */
-	ld	r3,SOFTE(r1)
-	TRACE_AND_RESTORE_IRQ(r3);
-	ld	r12,_MSR(r1)
-	rldicl	r4,r12,49,63		/* get MSR_EE to LSB */
-	stb	r4,PACAHARDIRQEN(r13)	/* restore paca->hard_enabled */
-	b	1f
+	.= 0x7000
+	.globl fwnmi_data_area
+fwnmi_data_area:
+
+	/* pseries and powernv need to keep the whole page from
+	 * 0x7000 to 0x8000 free for use by the firmware
+	 */
+	. = 0x8000
+#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
+
+/* Space for CPU0's segment table */
+	.balign 4096
+	.globl initial_stab
+initial_stab:
+	.space	4096
+
+#ifdef CONFIG_PPC_POWERNV
+_GLOBAL(opal_mc_secondary_handler)
+	HMT_MEDIUM_PPR_DISCARD
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	clrldi	r3,r3,2
+	tovirt(r3,r3)
+	std	r3,PACA_OPAL_MC_EVT(r13)
+	ld	r13,OPAL_MC_SRR0(r3)
+	mtspr	SPRN_SRR0,r13
+	ld	r13,OPAL_MC_SRR1(r3)
+	mtspr	SPRN_SRR1,r13
+	ld	r3,OPAL_MC_GPR3(r3)
+	GET_SCRATCH0(r13)
+	b	machine_check_pSeries
+#endif /* CONFIG_PPC_POWERNV */
 
-	.globl	fast_exception_return
-fast_exception_return:
+
+#define MACHINE_CHECK_HANDLER_WINDUP			\
+	/* Clear MSR_RI before setting SRR0 and SRR1. */\
+	li	r0,MSR_RI;				\
+	mfmsr	r9;		/* get MSR value */	\
+	andc	r9,r9,r0;				\
+	mtmsrd	r9,1;		/* Clear MSR_RI */	\
+	/* Move original SRR0 and SRR1 into the respective regs */	\
+	ld	r9,_MSR(r1);				\
+	mtspr	SPRN_SRR1,r9;				\
+	ld	r3,_NIP(r1);				\
+	mtspr	SPRN_SRR0,r3;				\
+	ld	r9,_CTR(r1);				\
+	mtctr	r9;					\
+	ld	r9,_XER(r1);				\
+	mtxer	r9;					\
+	ld	r9,_LINK(r1);				\
+	mtlr	r9;					\
+	REST_GPR(0, r1);				\
+	REST_8GPRS(2, r1);				\
+	REST_GPR(10, r1);				\
+	ld	r11,_CCR(r1);				\
+	mtcr	r11;					\
+	/* Decrement paca->in_mce. */			\
+	lhz	r12,PACA_IN_MCE(r13);			\
+	subi	r12,r12,1;				\
+	sth	r12,PACA_IN_MCE(r13);			\
+	REST_GPR(11, r1);				\
+	REST_2GPRS(12, r1);				\
+	/* restore original r1. */			\
+	ld	r1,GPR1(r1)
+
+	/*
+	 * Handle machine check early in real mode. We come here with
+	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
+	 */
+	.align	7
+	.globl machine_check_handle_early
+machine_check_handle_early:
+	std	r0,GPR0(r1)	/* Save r0 */
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_early
+	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
-1:	ld	r11,_NIP(r1)
-	andi.	r3,r12,MSR_RI		/* check if RI is set */
-	beq-	unrecov_fer
+#ifdef	CONFIG_PPC_P7_NAP
+	/*
+	 * Check if thread was in power saving mode. We come here when any
+	 * of the following is true:
+	 * a. thread wasn't in power saving mode
+	 * b. thread was in power saving mode with no state loss or
+	 *    supervisor state loss
+	 *
+	 * Go back to nap again if (b) is true.
+	 */
+	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
+	beq	4f			/* No, it wasn;t */
+	/* Thread was in power saving mode. Go back to nap again. */
+	cmpwi	r11,2
+	bne	3f
+	/* Supervisor state loss */
+	li	r0,1
+	stb	r0,PACA_NAPSTATELOST(r13)
+3:	bl	machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	GET_PACA(r13)
+	ld	r1,PACAR1(r13)
+	b	power7_enter_nap_mode
+4:
+#endif
+	/*
+	 * Check if we are coming from hypervisor userspace. If yes then we
+	 * continue in host kernel in V mode to deliver the MC event.
+	 */
+	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
+	beq	5f
+	andi.	r11,r12,MSR_PR		/* See if coming from user. */
+	bne	9f			/* continue in V mode if we are. */
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	andi.	r3,r12,MSR_PR
-	beq	2f
-	ACCOUNT_CPU_USER_EXIT(r3, r4)
+5:
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	/*
+	 * We are coming from kernel context. Check if we are coming from
+	 * guest. if yes, then we can continue. We will fall through
+	 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
+	 */
+	lbz	r11,HSTATE_IN_GUEST(r13)
+	cmpwi	r11,0			/* Check if coming from guest */
+	bne	9f			/* continue if we are. */
+#endif
+	/*
+	 * At this point we are not sure about what context we come from.
+	 * Queue up the MCE event and return from the interrupt.
+	 * But before that, check if this is an un-recoverable exception.
+	 * If yes, then stay on emergency stack and panic.
+	 */
+	andi.	r11,r12,MSR_RI
+	bne	2f
+1:	mfspr	r11,SPRN_SRR0
+	ld	r10,PACAKBASE(r13)
+	LOAD_HANDLER(r10,unrecover_mce)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	/*
+	 * We are going down. But there are chances that we might get hit by
+	 * another MCE during panic path and we may run into unstable state
+	 * with no way out. Hence, turn ME bit off while going down, so that
+	 * when another MCE is hit during panic path, system will checkstop
+	 * and hypervisor will get restarted cleanly by SP.
+	 */
+	li	r3,MSR_ME
+	andc	r10,r10,r3		/* Turn off MSR_ME */
+	mtspr	SPRN_SRR1,r10
+	rfid
+	b	.
 2:
+	/*
+	 * Check if we have successfully handled/recovered from error, if not
+	 * then stay on emergency stack and panic.
+	 */
+	ld	r3,RESULT(r1)	/* Load result */
+	cmpdi	r3,0		/* see if we handled MCE successfully */
+
+	beq	1b		/* if !handled then panic */
+	/*
+	 * Return from MC interrupt.
+	 * Queue up the MCE event so that we can log it later, while
+	 * returning from kernel or opal call.
+	 */
+	bl	machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	rfid
+9:
+	/* Deliver the machine check to host kernel in V mode. */
+	MACHINE_CHECK_HANDLER_WINDUP
+	b	machine_check_pSeries
+
+unrecover_mce:
+	/* Invoke machine_check_exception to print MCE event and panic. */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	/*
+	 * We will not reach here. Even if we did, there is no way out. Call
+	 * unrecoverable_exception and die.
+	 */
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unrecoverable_exception
+	b	1b
+/*
+ * r13 points to the PACA, r9 contains the saved CR,
+ * r12 contain the saved SRR1, SRR0 is still ready for return
+ * r3 has the faulting address
+ * r9 - r13 are saved in paca->exslb.
+ * r3 is saved in paca->slb_r3
+ * We assume we aren't going to take any exceptions during this procedure.
+ */
+slb_miss_realmode:
+	mflr	r10
+#ifdef CONFIG_RELOCATABLE
+	mtctr	r11
 #endif
 
-	ld	r3,_CCR(r1)
-	ld	r4,_LINK(r1)
-	ld	r5,_CTR(r1)
-	ld	r6,_XER(r1)
-	mtcr	r3
-	mtlr	r4
-	mtctr	r5
-	mtxer	r6
-	REST_GPR(0, r1)
-	REST_8GPRS(2, r1)
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
+
+	bl	slb_allocate_realmode
 
-	mfmsr	r10
-	rldicl	r10,r10,48,1		/* clear EE */
-	rldicr	r10,r10,16,61		/* clear RI (LE is 0 already) */
-	mtmsrd	r10,1
+	/* All done -- return from exception. */
 
-	mtspr	SPRN_SRR1,r12
-	mtspr	SPRN_SRR0,r11
-	REST_4GPRS(10, r1)
-	ld	r1,GPR1(r1)
+	ld	r10,PACA_EXSLB+EX_LR(r13)
+	ld	r3,PACA_EXSLB+EX_R3(r13)
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+
+	mtlr	r10
+
+	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
+	beq-	2f
+
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+	RESTORE_PPR_PACA(PACA_EXSLB, r9)
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
 	rfid
 	b	.	/* prevent speculative execution */
 
-unrecov_fer:
-	bl	.save_nvgprs
+2:	mfspr	r11,SPRN_SRR0
+	ld	r10,PACAKBASE(r13)
+	LOAD_HANDLER(r10,unrecov_slb)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	mtspr	SPRN_SRR1,r10
+	rfid
+	b	.
+
+unrecov_slb:
+	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
+	DISABLE_INTS
+	bl	save_nvgprs
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.unrecoverable_exception
+	bl	unrecoverable_exception
 	b	1b
 
 
+#ifdef CONFIG_PPC_970_NAP
+power4_fixup_nap:
+	andc	r9,r9,r10
+	std	r9,TI_LOCAL_FLAGS(r11)
+	ld	r10,_LINK(r1)		/* make idle task do the */
+	std	r10,_NIP(r1)		/* equivalent of a blr */
+	blr
+#endif
+
 /*
  * Hash table stuff
  */
 	.align	7
-_STATIC(do_hash_page)
+do_hash_page:
 	std	r3,_DAR(r1)
 	std	r4,_DSISR(r1)
 
@@ -908,32 +1600,10 @@ BEGIN_FTR_SECTION
 	bne-	do_ste_alloc		/* If so handle it */
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 
-	clrrdi	r11,r1,THREAD_SHIFT
+	CURRENT_THREAD_INFO(r11, r1)
 	lwz	r0,TI_PREEMPT(r11)	/* If we're in an "NMI" */
 	andis.	r0,r0,NMI_MASK@h	/* (i.e. an irq when soft-disabled) */
 	bne	77f			/* then don't call hash_page now */
-
-	/*
-	 * On iSeries, we soft-disable interrupts here, then
-	 * hard-enable interrupts so that the hash_page code can spin on
-	 * the hash_table_lock without problems on a shared processor.
-	 */
-	DISABLE_INTS
-
-	/*
-	 * Currently, trace_hardirqs_off() will be called by DISABLE_INTS
-	 * and will clobber volatile registers when irq tracing is enabled
-	 * so we need to reload them. It may be possible to be smarter here
-	 * and move the irq tracing elsewhere but let's keep it simple for
-	 * now
-	 */
-#ifdef CONFIG_TRACE_IRQFLAGS
-	ld	r3,_DAR(r1)
-	ld	r4,_DSISR(r1)
-	ld	r5,_TRAP(r1)
-	ld	r12,_MSR(r1)
-	clrrdi	r5,r5,4
-#endif /* CONFIG_TRACE_IRQFLAGS */
 	/*
 	 * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
 	 * accessing a userspace segment (even from the kernel). We assume
@@ -951,80 +1621,51 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 	 * r4 contains the required access permissions
 	 * r5 contains the trap number
 	 *
-	 * at return r3 = 0 for success
+	 * at return r3 = 0 for success, 1 for page fault, negative for error
 	 */
-	bl	.hash_page		/* build HPTE if possible */
+	bl	hash_page		/* build HPTE if possible */
 	cmpdi	r3,0			/* see if hash_page succeeded */
 
-BEGIN_FW_FTR_SECTION
-	/*
-	 * If we had interrupts soft-enabled at the point where the
-	 * DSI/ISI occurred, and an interrupt came in during hash_page,
-	 * handle it now.
-	 * We jump to ret_from_except_lite rather than fast_exception_return
-	 * because ret_from_except_lite will check for and handle pending
-	 * interrupts if necessary.
-	 */
-	beq	13f
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-
-BEGIN_FW_FTR_SECTION
-	/*
-	 * Here we have interrupts hard-disabled, so it is sufficient
-	 * to restore paca->{soft,hard}_enable and get out.
-	 */
+	/* Success */
 	beq	fast_exc_return_irq	/* Return from exception on success */
-END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
-
-	/* For a hash failure, we don't bother re-enabling interrupts */
-	ble-	12f
-
-	/*
-	 * hash_page couldn't handle it, set soft interrupt enable back
-	 * to what it was before the trap.  Note that .arch_local_irq_restore
-	 * handles any interrupts pending at this point.
-	 */
-	ld	r3,SOFTE(r1)
-	TRACE_AND_RESTORE_IRQ_PARTIAL(r3, 11f)
-	bl	.arch_local_irq_restore
-	b	11f
 
-/* We have a data breakpoint exception - handle it */
-handle_dabr_fault:
-	bl	.save_nvgprs
-	ld      r4,_DAR(r1)
-	ld      r5,_DSISR(r1)
-	addi    r3,r1,STACK_FRAME_OVERHEAD
-	bl      .do_dabr
-	b       .ret_from_except_lite
+	/* Error */
+	blt-	13f
 
 /* Here we have a page fault that hash_page can't handle. */
 handle_page_fault:
-	ENABLE_INTS
 11:	ld	r4,_DAR(r1)
 	ld	r5,_DSISR(r1)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.do_page_fault
+	bl	do_page_fault
 	cmpdi	r3,0
-	beq+	13f
-	bl	.save_nvgprs
+	beq+	12f
+	bl	save_nvgprs
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	lwz	r4,_DAR(r1)
-	bl	.bad_page_fault
-	b	.ret_from_except
+	bl	bad_page_fault
+	b	ret_from_except
+
+/* We have a data breakpoint exception - handle it */
+handle_dabr_fault:
+	bl	save_nvgprs
+	ld      r4,_DAR(r1)
+	ld      r5,_DSISR(r1)
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      do_break
+12:	b       ret_from_except_lite
 
-13:	b	.ret_from_except_lite
 
 /* We have a page fault that hash_page could handle but HV refused
  * the PTE insertion
  */
-12:	bl	.save_nvgprs
+13:	bl	save_nvgprs
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r4,_DAR(r1)
-	bl	.low_hash_fault
-	b	.ret_from_except
+	bl	low_hash_fault
+	b	ret_from_except
 
 /*
  * We come here as a result of a DSI at a point where we don't want
@@ -1033,16 +1674,16 @@ handle_page_fault:
  * were soft-disabled.  We want to invoke the exception handler for
  * the access, or panic if there isn't a handler.
  */
-77:	bl	.save_nvgprs
+77:	bl	save_nvgprs
 	mr	r4,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	li	r5,SIGSEGV
-	bl	.bad_page_fault
-	b	.ret_from_except
+	bl	bad_page_fault
+	b	ret_from_except
 
 	/* here we have a segment miss */
 do_ste_alloc:
-	bl	.ste_allocate		/* try to insert stab entry */
+	bl	ste_allocate		/* try to insert stab entry */
 	cmpdi	r3,0
 	bne-	handle_page_fault
 	b	fast_exception_return
@@ -1055,21 +1696,39 @@ do_ste_alloc:
  * We assume (DAR >> 60) == 0xc.
  */
 	.align	7
-_GLOBAL(do_stab_bolted)
+do_stab_bolted:
 	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
 	std	r11,PACA_EXSLB+EX_SRR0(r13)	/* save SRR0 in exc. frame */
+	mfspr	r11,SPRN_DAR			/* ea */
+
+	/*
+	 * check for bad kernel/user address
+	 * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+	 */
+	rldicr. r9,r11,4,(63 - 46 - 4)
+	li	r9,0	/* VSID = 0 for bad address */
+	bne-	0f
+
+	/*
+	 * Calculate VSID:
+	 * This is the kernel vsid, we take the top for context from
+	 * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * Here we know that (ea >> 60) == 0xc
+	 */
+	lis	r9,(MAX_USER_CONTEXT + 1)@ha
+	addi	r9,r9,(MAX_USER_CONTEXT + 1)@l
+
+	srdi	r10,r11,SID_SHIFT
+	rldimi  r10,r9,ESID_BITS,0 /* proto vsid */
+	ASM_VSID_SCRAMBLE(r10, r9, 256M)
+	rldic	r9,r10,12,16	/* r9 = vsid << 12 */
 
+0:
 	/* Hash to the primary group */
 	ld	r10,PACASTABVIRT(r13)
-	mfspr	r11,SPRN_DAR
-	srdi	r11,r11,28
+	srdi	r11,r11,SID_SHIFT
 	rldimi	r10,r11,7,52	/* r10 = first ste of the group */
 
-	/* Calculate VSID */
-	/* This is a kernel address, so protovsid = ESID */
-	ASM_VSID_SCRAMBLE(r11, r9, 256M)
-	rldic	r9,r11,12,16	/* r9 = vsid << 12 */
-
 	/* Search the primary group for a free entry */
 1:	ld	r11,0(r10)	/* Test valid bit of the current ste	*/
 	andi.	r11,r11,0x80
@@ -1132,73 +1791,3 @@ _GLOBAL(do_stab_bolted)
 	ld	r13,PACA_EXSLB+EX_R13(r13)
 	rfid
 	b	.	/* prevent speculative execution */
-
-#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
-/*
- * Data area reserved for FWNMI option.
- * This address (0x7000) is fixed by the RPA.
- */
-	.= 0x7000
-	.globl fwnmi_data_area
-fwnmi_data_area:
-#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
-
-	/* iSeries does not use the FWNMI stuff, so it is safe to put
-	 * this here, even if we later allow kernels that will boot on
-	 * both pSeries and iSeries */
-#ifdef CONFIG_PPC_ISERIES
-        . = LPARMAP_PHYS
-	.globl xLparMap
-xLparMap:
-	.quad	HvEsidsToMap		/* xNumberEsids */
-	.quad	HvRangesToMap		/* xNumberRanges */
-	.quad	STAB0_PAGE		/* xSegmentTableOffs */
-	.zero	40			/* xRsvd */
-	/* xEsids (HvEsidsToMap entries of 2 quads) */
-	.quad	PAGE_OFFSET_ESID	/* xKernelEsid */
-	.quad	PAGE_OFFSET_VSID	/* xKernelVsid */
-	.quad	VMALLOC_START_ESID	/* xKernelEsid */
-	.quad	VMALLOC_START_VSID	/* xKernelVsid */
-	/* xRanges (HvRangesToMap entries of 3 quads) */
-	.quad	HvPagesToMap		/* xPages */
-	.quad	0			/* xOffset */
-	.quad	PAGE_OFFSET_VSID << (SID_SHIFT - HW_PAGE_SHIFT)	/* xVPN */
-
-#endif /* CONFIG_PPC_ISERIES */
-
-#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
-	/* pseries and powernv need to keep the whole page from
-	 * 0x7000 to 0x8000 free for use by the firmware
-	 */
-        . = 0x8000
-#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
-
-/*
- * Space for CPU0's segment table.
- *
- * On iSeries, the hypervisor must fill in at least one entry before
- * we get control (with relocate on).  The address is given to the hv
- * as a page number (see xLparMap above), so this must be at a
- * fixed address (the linker can't compute (u64)&initial_stab >>
- * PAGE_SHIFT).
- */
-	. = STAB0_OFFSET	/* 0x8000 */
-	.globl initial_stab
-initial_stab:
-	.space	4096
-#ifdef CONFIG_PPC_POWERNV
-_GLOBAL(opal_mc_secondary_handler)
-	HMT_MEDIUM
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	clrldi	r3,r3,2
-	tovirt(r3,r3)
-	std	r3,PACA_OPAL_MC_EVT(r13)
-	ld	r13,OPAL_MC_SRR0(r3)
-	mtspr	SPRN_SRR0,r13
-	ld	r13,OPAL_MC_SRR1(r3)
-	mtspr	SPRN_SRR1,r13
-	ld	r3,OPAL_MC_GPR3(r3)
-	GET_SCRATCH0(r13)
-	b	machine_check_pSeries
-#endif /* CONFIG_PPC_POWERNV */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
new file mode 100644
index 00000000000..742694c1d85
--- /dev/null
+++ b/arch/powerpc/kernel/fadump.c
@@ -0,0 +1,1316 @@
+/*
+ * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
+ * dump with assistance from firmware. This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. The most of the code implementation has been adapted
+ * from phyp assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+#include <asm/debug.h>
+#include <asm/setup.h>
+
+static struct fw_dump fw_dump;
+static struct fadump_mem_struct fdm;
+static const struct fadump_mem_struct *fdm_active;
+
+static DEFINE_MUTEX(fadump_mutex);
+struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES];
+int crash_mem_ranges;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+			const char *uname, int depth, void *data)
+{
+	const __be32 *sections;
+	int i, num_sections;
+	int size;
+	const int *token;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. if yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return 1;
+
+	fw_dump.fadump_supported = 1;
+	fw_dump.ibm_configure_kernel_dump = *token;
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+	if (fdm_active)
+		fw_dump.dump_active = 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 * For each dump section type supported, a 32bit cell which defines
+	 * the ID of a supported section followed by two 32 bit cells which
+	 * gives teh size of the section in bytes.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return 1;
+
+	num_sections = size / (3 * sizeof(u32));
+
+	for (i = 0; i < num_sections; i++, sections += 3) {
+		u32 type = (u32)of_read_number(sections, 1);
+
+		switch (type) {
+		case FADUMP_CPU_STATE_DATA:
+			fw_dump.cpu_state_data_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		case FADUMP_HPTE_REGION:
+			fw_dump.hpte_region_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		}
+	}
+
+	return 1;
+}
+
+int is_fadump_active(void)
+{
+	return fw_dump.dump_active;
+}
+
+/* Print firmware assisted dump configurations for debugging purpose. */
+static void fadump_show_config(void)
+{
+	pr_debug("Support for firmware-assisted dump (fadump): %s\n",
+			(fw_dump.fadump_supported ? "present" : "no support"));
+
+	if (!fw_dump.fadump_supported)
+		return;
+
+	pr_debug("Fadump enabled    : %s\n",
+				(fw_dump.fadump_enabled ? "yes" : "no"));
+	pr_debug("Dump Active       : %s\n",
+				(fw_dump.dump_active ? "yes" : "no"));
+	pr_debug("Dump section sizes:\n");
+	pr_debug("    CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
+	pr_debug("    HPTE region size   : %lx\n", fw_dump.hpte_region_size);
+	pr_debug("Boot memory size  : %lx\n", fw_dump.boot_memory_size);
+}
+
+static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
+				unsigned long addr)
+{
+	if (!fdm)
+		return 0;
+
+	memset(fdm, 0, sizeof(struct fadump_mem_struct));
+	addr = addr & PAGE_MASK;
+
+	fdm->header.dump_format_version = 0x00000001;
+	fdm->header.dump_num_sections = 3;
+	fdm->header.dump_status_flag = 0;
+	fdm->header.offset_first_dump_section =
+		(u32)offsetof(struct fadump_mem_struct, cpu_state_data);
+
+	/*
+	 * Fields for disk dump option.
+	 * We are not using disk dump option, hence set these fields to 0.
+	 */
+	fdm->header.dd_block_size = 0;
+	fdm->header.dd_block_offset = 0;
+	fdm->header.dd_num_blocks = 0;
+	fdm->header.dd_offset_disk_path = 0;
+
+	/* set 0 to disable an automatic dump-reboot. */
+	fdm->header.max_time_auto = 0;
+
+	/* Kernel dump sections */
+	/* cpu state data section. */
+	fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG;
+	fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA;
+	fdm->cpu_state_data.source_address = 0;
+	fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size;
+	fdm->cpu_state_data.destination_address = addr;
+	addr += fw_dump.cpu_state_data_size;
+
+	/* hpte region section */
+	fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG;
+	fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION;
+	fdm->hpte_region.source_address = 0;
+	fdm->hpte_region.source_len = fw_dump.hpte_region_size;
+	fdm->hpte_region.destination_address = addr;
+	addr += fw_dump.hpte_region_size;
+
+	/* RMA region section */
+	fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG;
+	fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION;
+	fdm->rmr_region.source_address = RMA_START;
+	fdm->rmr_region.source_len = fw_dump.boot_memory_size;
+	fdm->rmr_region.destination_address = addr;
+	addr += fw_dump.boot_memory_size;
+
+	return addr;
+}
+
+/**
+ * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
+ *
+ * Function to find the largest memory size we need to reserve during early
+ * boot process. This will be the size of the memory that is required for a
+ * kernel to boot successfully.
+ *
+ * This function has been taken from phyp-assisted dump feature implementation.
+ *
+ * returns larger of 256MB or 5% rounded down to multiples of 256MB.
+ *
+ * TODO: Come up with better approach to find out more accurate memory size
+ * that is required for a kernel to boot successfully.
+ *
+ */
+static inline unsigned long fadump_calculate_reserve_size(void)
+{
+	unsigned long size;
+
+	/*
+	 * Check if the size is specified through fadump_reserve_mem= cmdline
+	 * option. If yes, then use that.
+	 */
+	if (fw_dump.reserve_bootvar)
+		return fw_dump.reserve_bootvar;
+
+	/* divide by 20 to get 5% of value */
+	size = memblock_end_of_DRAM() / 20;
+
+	/* round it down in multiples of 256 */
+	size = size & ~0x0FFFFFFFUL;
+
+	/* Truncate to memory_limit. We don't want to over reserve the memory.*/
+	if (memory_limit && size > memory_limit)
+		size = memory_limit;
+
+	return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+}
+
+/*
+ * Calculate the total memory size required to be reserved for
+ * firmware-assisted dump registration.
+ */
+static unsigned long get_fadump_area_size(void)
+{
+	unsigned long size = 0;
+
+	size += fw_dump.cpu_state_data_size;
+	size += fw_dump.hpte_region_size;
+	size += fw_dump.boot_memory_size;
+	size += sizeof(struct fadump_crash_info_header);
+	size += sizeof(struct elfhdr); /* ELF core header.*/
+	size += sizeof(struct elf_phdr); /* place holder for cpu notes */
+	/* Program headers for crash memory regions. */
+	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
+
+	size = PAGE_ALIGN(size);
+	return size;
+}
+
+int __init fadump_reserve_mem(void)
+{
+	unsigned long base, size, memory_boundary;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_INFO "Firmware-assisted dump is not supported on"
+				" this hardware\n");
+		fw_dump.fadump_enabled = 0;
+		return 0;
+	}
+	/*
+	 * Initialize boot memory size
+	 * If dump is active then we have already calculated the size during
+	 * first kernel.
+	 */
+	if (fdm_active)
+		fw_dump.boot_memory_size = fdm_active->rmr_region.source_len;
+	else
+		fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+
+	/*
+	 * Calculate the memory boundary.
+	 * If memory_limit is less than actual memory boundary then reserve
+	 * the memory for fadump beyond the memory_limit and adjust the
+	 * memory_limit accordingly, so that the running kernel can run with
+	 * specified memory_limit.
+	 */
+	if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
+		size = get_fadump_area_size();
+		if ((memory_limit + size) < memblock_end_of_DRAM())
+			memory_limit += size;
+		else
+			memory_limit = memblock_end_of_DRAM();
+		printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
+				" dump, now %#016llx\n", memory_limit);
+	}
+	if (memory_limit)
+		memory_boundary = memory_limit;
+	else
+		memory_boundary = memblock_end_of_DRAM();
+
+	if (fw_dump.dump_active) {
+		printk(KERN_INFO "Firmware-assisted dump is active.\n");
+		/*
+		 * If last boot has crashed then reserve all the memory
+		 * above boot_memory_size so that we don't touch it until
+		 * dump is written to disk by userspace tool. This memory
+		 * will be released for general use once the dump is saved.
+		 */
+		base = fw_dump.boot_memory_size;
+		size = memory_boundary - base;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for saving crash dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+
+		fw_dump.fadumphdr_addr =
+				fdm_active->rmr_region.destination_address +
+				fdm_active->rmr_region.source_len;
+		pr_debug("fadumphdr_addr = %p\n",
+				(void *) fw_dump.fadumphdr_addr);
+	} else {
+		/* Reserve the memory at the top of memory. */
+		size = get_fadump_area_size();
+		base = memory_boundary - size;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for firmware-assisted dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	}
+	fw_dump.reserve_dump_area_start = base;
+	fw_dump.reserve_dump_area_size = size;
+	return 1;
+}
+
+/* Look for fadump= cmdline option. */
+static int __init early_fadump_param(char *p)
+{
+	if (!p)
+		return 1;
+
+	if (strncmp(p, "on", 2) == 0)
+		fw_dump.fadump_enabled = 1;
+	else if (strncmp(p, "off", 3) == 0)
+		fw_dump.fadump_enabled = 0;
+
+	return 0;
+}
+early_param("fadump", early_fadump_param);
+
+/* Look for fadump_reserve_mem= cmdline option */
+static int __init early_fadump_reserve_mem(char *p)
+{
+	if (p)
+		fw_dump.reserve_bootvar = memparse(p, &p);
+	return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
+
+static void register_fw_dump(struct fadump_mem_struct *fdm)
+{
+	int rc;
+	unsigned int wait_time;
+
+	pr_debug("Registering for firmware-assisted kernel dump...\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_REGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+
+	} while (wait_time);
+
+	switch (rc) {
+	case -1:
+		printk(KERN_ERR "Failed to register firmware-assisted kernel"
+			" dump. Hardware Error(%d).\n", rc);
+		break;
+	case -3:
+		printk(KERN_ERR "Failed to register firmware-assisted kernel"
+			" dump. Parameter Error(%d).\n", rc);
+		break;
+	case -9:
+		printk(KERN_ERR "firmware-assisted kernel dump is already "
+			" registered.");
+		fw_dump.dump_registered = 1;
+		break;
+	case 0:
+		printk(KERN_INFO "firmware-assisted kernel dump registration"
+			" is successful\n");
+		fw_dump.dump_registered = 1;
+		break;
+	}
+}
+
+void crash_fadump(struct pt_regs *regs, const char *str)
+{
+	struct fadump_crash_info_header *fdh = NULL;
+
+	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+		return;
+
+	fdh = __va(fw_dump.fadumphdr_addr);
+	crashing_cpu = smp_processor_id();
+	fdh->crashing_cpu = crashing_cpu;
+	crash_save_vmcoreinfo();
+
+	if (regs)
+		fdh->regs = *regs;
+	else
+		ppc_save_regs(&fdh->regs);
+
+	fdh->cpu_online_mask = *cpu_online_mask;
+
+	/* Call ibm,os-term rtas call to trigger firmware assisted dump */
+	rtas_os_term((char *)str);
+}
+
+#define GPR_MASK	0xffffff0000000000
+static inline int fadump_gpr_index(u64 id)
+{
+	int i = -1;
+	char str[3];
+
+	if ((id & GPR_MASK) == REG_ID("GPR")) {
+		/* get the digits at the end */
+		id &= ~GPR_MASK;
+		id >>= 24;
+		str[2] = '\0';
+		str[1] = id & 0xff;
+		str[0] = (id >> 8) & 0xff;
+		sscanf(str, "%d", &i);
+		if (i > 31)
+			i = -1;
+	}
+	return i;
+}
+
+static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
+								u64 reg_val)
+{
+	int i;
+
+	i = fadump_gpr_index(reg_id);
+	if (i >= 0)
+		regs->gpr[i] = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("NIA"))
+		regs->nip = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("MSR"))
+		regs->msr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CTR"))
+		regs->ctr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("LR"))
+		regs->link = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("XER"))
+		regs->xer = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CR"))
+		regs->ccr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DAR"))
+		regs->dar = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DSISR"))
+		regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct fadump_reg_entry*
+fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	while (reg_entry->reg_id != REG_ID("CPUEND")) {
+		fadump_set_regval(regs, reg_entry->reg_id,
+					reg_entry->reg_value);
+		reg_entry++;
+	}
+	reg_entry++;
+	return reg_entry;
+}
+
+static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type,
+						void *data, size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void fadump_final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+{
+	struct elf_prstatus prstatus;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	/*
+	 * FIXME: How do i get PID? Do I really need it?
+	 * prstatus.pr_pid = ????
+	 */
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+				&prstatus, sizeof(prstatus));
+	return buf;
+}
+
+static void fadump_update_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/* First note is a place holder for cpu notes info. */
+	phdr = (struct elf_phdr *)bufp;
+
+	if (phdr->p_type == PT_NOTE) {
+		phdr->p_paddr = fw_dump.cpu_notes_buf;
+		phdr->p_offset	= phdr->p_paddr;
+		phdr->p_filesz	= fw_dump.cpu_notes_buf_size;
+		phdr->p_memsz = fw_dump.cpu_notes_buf_size;
+	}
+	return;
+}
+
+static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+{
+	void *vaddr;
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	if (!vaddr)
+		return NULL;
+
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		SetPageReserved(page + i);
+	return vaddr;
+}
+
+static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+{
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+{
+	struct fadump_reg_save_area_header *reg_header;
+	struct fadump_reg_entry *reg_entry;
+	struct fadump_crash_info_header *fdh = NULL;
+	void *vaddr;
+	unsigned long addr;
+	u32 num_cpus, *note_buf;
+	struct pt_regs regs;
+	int i, rc = 0, cpu = 0;
+
+	if (!fdm->cpu_state_data.bytes_dumped)
+		return -EINVAL;
+
+	addr = fdm->cpu_state_data.destination_address;
+	vaddr = __va(addr);
+
+	reg_header = vaddr;
+	if (reg_header->magic_number != REGSAVE_AREA_MAGIC) {
+		printk(KERN_ERR "Unable to read register save area.\n");
+		return -ENOENT;
+	}
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", reg_header->magic_number);
+	pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset);
+
+	vaddr += reg_header->num_cpu_offset;
+	num_cpus = *((u32 *)(vaddr));
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct fadump_reg_entry *)vaddr;
+
+	/* Allocate buffer to hold cpu crash notes. */
+	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+	note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
+	if (!note_buf) {
+		printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
+			"cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+	fw_dump.cpu_notes_buf = __pa(note_buf);
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+			(num_cpus * sizeof(note_buf_t)), note_buf);
+
+	if (fw_dump.fadumphdr_addr)
+		fdh = __va(fw_dump.fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (reg_entry->reg_id != REG_ID("CPUSTRT")) {
+			printk(KERN_ERR "Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contains logical cpu id */
+		cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK;
+		if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) {
+			SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = fadump_read_registers(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	fadump_final_note(note_buf);
+
+	if (fdh) {
+		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+							fdh->elfcorehdr_addr);
+		fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
+	}
+	return 0;
+
+error_out:
+	fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+					fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf = 0;
+	fw_dump.cpu_notes_buf_size = 0;
+	return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+{
+	struct fadump_crash_info_header *fdh;
+	int rc = 0;
+
+	if (!fdm_active || !fw_dump.fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. */
+	if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+			(fdm_active->cpu_state_data.error_flags != 0) ||
+			(fdm_active->rmr_region.error_flags != 0)) {
+		printk(KERN_ERR "Dump taken by platform is not valid\n");
+		return -EINVAL;
+	}
+	if ((fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) ||
+			!fdm_active->cpu_state_data.bytes_dumped) {
+		printk(KERN_ERR "Dump taken by platform is incomplete\n");
+		return -EINVAL;
+	}
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fw_dump.fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		printk(KERN_ERR "Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	rc = fadump_build_cpu_notes(fdm_active);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and elfcore header is now ready
+	 * to be exported. set elfcorehdr_addr so that vmcore module will
+	 * export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return 0;
+}
+
+static inline void fadump_add_crash_memory(unsigned long long base,
+					unsigned long long end)
+{
+	if (base == end)
+		return;
+
+	pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+		crash_mem_ranges, base, end - 1, (end - base));
+	crash_memory_ranges[crash_mem_ranges].base = base;
+	crash_memory_ranges[crash_mem_ranges].size = end - base;
+	crash_mem_ranges++;
+}
+
+static void fadump_exclude_reserved_area(unsigned long long start,
+					unsigned long long end)
+{
+	unsigned long long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	if ((ra_start < end) && (ra_end > start)) {
+		if ((start < ra_start) && (end > ra_end)) {
+			fadump_add_crash_memory(start, ra_start);
+			fadump_add_crash_memory(ra_end, end);
+		} else if (start < ra_start) {
+			fadump_add_crash_memory(start, ra_start);
+		} else if (ra_end < end) {
+			fadump_add_crash_memory(ra_end, end);
+		}
+	} else
+		fadump_add_crash_memory(start, end);
+}
+
+static int fadump_init_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+	elf->e_flags = ELF_CORE_EFLAGS;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	return 0;
+}
+
+/*
+ * Traverse through memblock structure and setup crash memory ranges. These
+ * ranges will be used create PT_LOAD program headers in elfcore header.
+ */
+static void fadump_setup_crash_memory_ranges(void)
+{
+	struct memblock_region *reg;
+	unsigned long long start, end;
+
+	pr_debug("Setup crash memory ranges.\n");
+	crash_mem_ranges = 0;
+	/*
+	 * add the first memory chunk (RMA_START through boot_memory_size) as
+	 * a separate memory chunk. The reason is, at the time crash firmware
+	 * will move the content of this memory chunk to different location
+	 * specified during fadump registration. We need to create a separate
+	 * program header for this chunk with the correct offset.
+	 */
+	fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size);
+
+	for_each_memblock(memory, reg) {
+		start = (unsigned long long)reg->base;
+		end = start + (unsigned long long)reg->size;
+		if (start == RMA_START && end >= fw_dump.boot_memory_size)
+			start = fw_dump.boot_memory_size;
+
+		/* add this range excluding the reserved dump area. */
+		fadump_exclude_reserved_area(start, end);
+	}
+}
+
+/*
+ * If the given physical address falls within the boot memory region then
+ * return the relocated address that points to the dump region reserved
+ * for saving initial boot memory contents.
+ */
+static inline unsigned long fadump_relocate(unsigned long paddr)
+{
+	if (paddr > RMA_START && paddr < fw_dump.boot_memory_size)
+		return fdm.rmr_region.destination_address + paddr;
+	else
+		return paddr;
+}
+
+static int fadump_create_elfcore_headers(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+	int i;
+
+	fadump_init_elfcore_header(bufp);
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/*
+	 * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
+	 * will be populated during second kernel boot after crash. Hence
+	 * this PT_NOTE will always be the first elf note.
+	 *
+	 * NOTE: Any new ELF note addition should be placed after this note.
+	 */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type = PT_NOTE;
+	phdr->p_flags = 0;
+	phdr->p_vaddr = 0;
+	phdr->p_align = 0;
+
+	phdr->p_offset = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = 0;
+	phdr->p_memsz = 0;
+
+	(elf->e_phnum)++;
+
+	/* setup ELF PT_NOTE for vmcoreinfo */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type	= PT_NOTE;
+	phdr->p_flags	= 0;
+	phdr->p_vaddr	= 0;
+	phdr->p_align	= 0;
+
+	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
+	phdr->p_offset	= phdr->p_paddr;
+	phdr->p_memsz	= vmcoreinfo_max_size;
+	phdr->p_filesz	= vmcoreinfo_max_size;
+
+	/* Increment number of program headers. */
+	(elf->e_phnum)++;
+
+	/* setup PT_LOAD sections. */
+
+	for (i = 0; i < crash_mem_ranges; i++) {
+		unsigned long long mbase, msize;
+		mbase = crash_memory_ranges[i].base;
+		msize = crash_memory_ranges[i].size;
+
+		if (!msize)
+			continue;
+
+		phdr = (struct elf_phdr *)bufp;
+		bufp += sizeof(struct elf_phdr);
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= mbase;
+
+		if (mbase == RMA_START) {
+			/*
+			 * The entire RMA region will be moved by firmware
+			 * to the specified destination_address. Hence set
+			 * the correct offset.
+			 */
+			phdr->p_offset = fdm.rmr_region.destination_address;
+		}
+
+		phdr->p_paddr = mbase;
+		phdr->p_vaddr = (unsigned long)__va(mbase);
+		phdr->p_filesz = msize;
+		phdr->p_memsz = msize;
+		phdr->p_align = 0;
+
+		/* Increment number of program headers. */
+		(elf->e_phnum)++;
+	}
+	return 0;
+}
+
+static unsigned long init_fadump_header(unsigned long addr)
+{
+	struct fadump_crash_info_header *fdh;
+
+	if (!addr)
+		return 0;
+
+	fw_dump.fadumphdr_addr = addr;
+	fdh = __va(addr);
+	addr += sizeof(struct fadump_crash_info_header);
+
+	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
+	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
+	fdh->elfcorehdr_addr = addr;
+	/* We will set the crashing cpu id in crash_fadump() during crash. */
+	fdh->crashing_cpu = CPU_UNKNOWN;
+
+	return addr;
+}
+
+static void register_fadump(void)
+{
+	unsigned long addr;
+	void *vaddr;
+
+	/*
+	 * If no memory is reserved then we can not register for firmware-
+	 * assisted dump.
+	 */
+	if (!fw_dump.reserve_dump_area_size)
+		return;
+
+	fadump_setup_crash_memory_ranges();
+
+	addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len;
+	/* Initialize fadump crash info header. */
+	addr = init_fadump_header(addr);
+	vaddr = __va(addr);
+
+	pr_debug("Creating ELF core headers at %#016lx\n", addr);
+	fadump_create_elfcore_headers(vaddr);
+
+	/* register the future kernel dump with firmware. */
+	register_fw_dump(&fdm);
+}
+
+static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Un-register firmware-assisted dump\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_UNREGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to un-register firmware-assisted dump."
+			" unexpected error(%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_registered = 0;
+	return 0;
+}
+
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Invalidating firmware-assisted dump registration\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_INVALIDATE, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to invalidate firmware-assisted dump "
+			"rgistration. unexpected error(%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_active = 0;
+	fdm_active = NULL;
+	return 0;
+}
+
+void fadump_cleanup(void)
+{
+	/* Invalidate the registration only if dump is active. */
+	if (fw_dump.dump_active) {
+		init_fadump_mem_struct(&fdm,
+			fdm_active->cpu_state_data.destination_address);
+		fadump_invalidate_dump(&fdm);
+	}
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+	unsigned long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+		/*
+		 * exclude the dump reserve area. Will reuse it for next
+		 * fadump registration.
+		 */
+		if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
+			continue;
+
+		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+	}
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+	unsigned long reserved_area_start, reserved_area_end;
+	unsigned long destination_address;
+
+	mutex_lock(&fadump_mutex);
+	if (!fw_dump.dump_active) {
+		mutex_unlock(&fadump_mutex);
+		return;
+	}
+
+	destination_address = fdm_active->cpu_state_data.destination_address;
+	fadump_cleanup();
+	mutex_unlock(&fadump_mutex);
+
+	/*
+	 * Save the current reserved memory bounds we will require them
+	 * later for releasing the memory for general use.
+	 */
+	reserved_area_start = fw_dump.reserve_dump_area_start;
+	reserved_area_end = reserved_area_start +
+			fw_dump.reserve_dump_area_size;
+	/*
+	 * Setup reserve_dump_area_start and its size so that we can
+	 * reuse this reserved memory for Re-registration.
+	 */
+	fw_dump.reserve_dump_area_start = destination_address;
+	fw_dump.reserve_dump_area_size = get_fadump_area_size();
+
+	fadump_release_memory(reserved_area_start, reserved_area_end);
+	if (fw_dump.cpu_notes_buf) {
+		fadump_cpu_notes_buf_free(
+				(unsigned long)__va(fw_dump.cpu_notes_buf),
+				fw_dump.cpu_notes_buf_size);
+		fw_dump.cpu_notes_buf = 0;
+		fw_dump.cpu_notes_buf_size = 0;
+	}
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+}
+
+static ssize_t fadump_release_memory_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	if (!fw_dump.dump_active)
+		return -EPERM;
+
+	if (buf[0] == '1') {
+		/*
+		 * Take away the '/proc/vmcore'. We are releasing the dump
+		 * memory, hence it will not be valid anymore.
+		 */
+		vmcore_cleanup();
+		fadump_invalidate_release_mem();
+
+	} else
+		return -EINVAL;
+	return count;
+}
+
+static ssize_t fadump_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "%d\n", fw_dump.fadump_enabled);
+}
+
+static ssize_t fadump_register_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "%d\n", fw_dump.dump_registered);
+}
+
+static ssize_t fadump_register_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	int ret = 0;
+
+	if (!fw_dump.fadump_enabled || fdm_active)
+		return -EPERM;
+
+	mutex_lock(&fadump_mutex);
+
+	switch (buf[0]) {
+	case '0':
+		if (fw_dump.dump_registered == 0) {
+			ret = -EINVAL;
+			goto unlock_out;
+		}
+		/* Un-register Firmware-assisted dump */
+		fadump_unregister_dump(&fdm);
+		break;
+	case '1':
+		if (fw_dump.dump_registered == 1) {
+			ret = -EINVAL;
+			goto unlock_out;
+		}
+		/* Register Firmware-assisted dump */
+		register_fadump();
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+unlock_out:
+	mutex_unlock(&fadump_mutex);
+	return ret < 0 ? ret : count;
+}
+
+static int fadump_region_show(struct seq_file *m, void *private)
+{
+	const struct fadump_mem_struct *fdm_ptr;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	mutex_lock(&fadump_mutex);
+	if (fdm_active)
+		fdm_ptr = fdm_active;
+	else {
+		mutex_unlock(&fadump_mutex);
+		fdm_ptr = &fdm;
+	}
+
+	seq_printf(m,
+			"CPU : [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			fdm_ptr->cpu_state_data.destination_address,
+			fdm_ptr->cpu_state_data.destination_address +
+			fdm_ptr->cpu_state_data.source_len - 1,
+			fdm_ptr->cpu_state_data.source_len,
+			fdm_ptr->cpu_state_data.bytes_dumped);
+	seq_printf(m,
+			"HPTE: [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			fdm_ptr->hpte_region.destination_address,
+			fdm_ptr->hpte_region.destination_address +
+			fdm_ptr->hpte_region.source_len - 1,
+			fdm_ptr->hpte_region.source_len,
+			fdm_ptr->hpte_region.bytes_dumped);
+	seq_printf(m,
+			"DUMP: [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			fdm_ptr->rmr_region.destination_address,
+			fdm_ptr->rmr_region.destination_address +
+			fdm_ptr->rmr_region.source_len - 1,
+			fdm_ptr->rmr_region.source_len,
+			fdm_ptr->rmr_region.bytes_dumped);
+
+	if (!fdm_active ||
+		(fw_dump.reserve_dump_area_start ==
+		fdm_ptr->cpu_state_data.destination_address))
+		goto out;
+
+	/* Dump is active. Show reserved memory region. */
+	seq_printf(m,
+			"    : [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			(unsigned long long)fw_dump.reserve_dump_area_start,
+			fdm_ptr->cpu_state_data.destination_address - 1,
+			fdm_ptr->cpu_state_data.destination_address -
+			fw_dump.reserve_dump_area_start,
+			fdm_ptr->cpu_state_data.destination_address -
+			fw_dump.reserve_dump_area_start);
+out:
+	if (fdm_active)
+		mutex_unlock(&fadump_mutex);
+	return 0;
+}
+
+static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem,
+						0200, NULL,
+						fadump_release_memory_store);
+static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled,
+						0444, fadump_enabled_show,
+						NULL);
+static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered,
+						0644, fadump_register_show,
+						fadump_register_store);
+
+static int fadump_region_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, fadump_region_show, inode->i_private);
+}
+
+static const struct file_operations fadump_region_fops = {
+	.open    = fadump_region_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+static void fadump_init_files(void)
+{
+	struct dentry *debugfs_file;
+	int rc = 0;
+
+	rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr);
+	if (rc)
+		printk(KERN_ERR "fadump: unable to create sysfs file"
+			" fadump_enabled (%d)\n", rc);
+
+	rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr);
+	if (rc)
+		printk(KERN_ERR "fadump: unable to create sysfs file"
+			" fadump_registered (%d)\n", rc);
+
+	debugfs_file = debugfs_create_file("fadump_region", 0444,
+					powerpc_debugfs_root, NULL,
+					&fadump_region_fops);
+	if (!debugfs_file)
+		printk(KERN_ERR "fadump: unable to create debugfs file"
+				" fadump_region\n");
+
+	if (fw_dump.dump_active) {
+		rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr);
+		if (rc)
+			printk(KERN_ERR "fadump: unable to create sysfs file"
+				" fadump_release_mem (%d)\n", rc);
+	}
+	return;
+}
+
+/*
+ * Prepare for firmware-assisted dump.
+ */
+int __init setup_fadump(void)
+{
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_ERR "Firmware-assisted dump is not supported on"
+			" this hardware\n");
+		return 0;
+	}
+
+	fadump_show_config();
+	/*
+	 * If dump data is available then see if it is valid and prepare for
+	 * saving it to the disk.
+	 */
+	if (fw_dump.dump_active) {
+		/*
+		 * if dump process fails then invalidate the registration
+		 * and release memory before proceeding for re-registration.
+		 */
+		if (process_fadump(fdm_active) < 0)
+			fadump_invalidate_release_mem();
+	}
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	else if (fw_dump.reserve_dump_area_size)
+		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+	fadump_init_files();
+
+	return 1;
+}
+subsys_initcall(setup_fadump);
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index de369558bf0..9ad236e5d2c 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -26,7 +26,7 @@
 #include <asm/ptrace.h>
 
 #ifdef CONFIG_VSX
-#define REST_32FPVSRS(n,c,base)						\
+#define __REST_32FPVSRS(n,c,base)					\
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
@@ -35,7 +35,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
 2:	REST_32VSRS(n,c,base);						\
 3:
 
-#define SAVE_32FPVSRS(n,c,base)						\
+#define __SAVE_32FPVSRS(n,c,base)					\
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
@@ -44,9 +44,77 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
 2:	SAVE_32VSRS(n,c,base);						\
 3:
 #else
-#define REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
-#define SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
+#define __REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
+#define __SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
 #endif
+#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
+#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/* void do_load_up_transact_fpu(struct thread_struct *thread)
+ *
+ * This is similar to load_up_fpu but for the transactional version of the FP
+ * register set.  It doesn't mess with the task MSR or valid flags.
+ * Furthermore, we don't do lazy FP with TM currently.
+ */
+_GLOBAL(do_load_up_transact_fpu)
+	mfmsr	r6
+	ori	r5,r6,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r5,r5,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	SYNC
+	MTMSRD(r5)
+
+	addi	r7,r3,THREAD_TRANSACT_FPSTATE
+	lfd	fr0,FPSTATE_FPSCR(r7)
+	MTFSF_L(fr0)
+	REST_32FPVSRS(0, R4, R7)
+
+	/* FP/VSX off again */
+	MTMSRD(r6)
+	SYNC
+
+	blr
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * Enable use of the FPU, and VSX if possible, for the caller.
+ */
+_GLOBAL(fp_enable)
+	mfmsr	r3
+	ori	r3,r3,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r3,r3,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	SYNC
+	MTMSRD(r3)
+	isync			/* (not necessary for arch 2.02 and later) */
+	blr
+
+/*
+ * Load state from memory into FP registers including FPSCR.
+ * Assumes the caller has enabled FP in the MSR.
+ */
+_GLOBAL(load_fp_state)
+	lfd	fr0,FPSTATE_FPSCR(r3)
+	MTFSF_L(fr0)
+	REST_32FPVSRS(0, R4, R3)
+	blr
+
+/*
+ * Store FP state into memory, including FPSCR
+ * Assumes the caller has enabled FP in the MSR.
+ */
+_GLOBAL(store_fp_state)
+	SAVE_32FPVSRS(0, R4, R3)
+	mffs	fr0
+	stfd	fr0,FPSTATE_FPSCR(r3)
+	blr
 
 /*
  * This task wants to use the FPU now.
@@ -54,6 +122,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
  * and save its floating-point registers in its thread_struct.
  * Load up this task's FP registers from its thread_struct,
  * enable the FPU for the current task and return to the task.
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
  */
 _GLOBAL(load_up_fpu)
 	mfmsr	r5
@@ -79,9 +149,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	beq	1f
 	toreal(r4)
 	addi	r4,r4,THREAD		/* want last_task_used_math->thread */
-	SAVE_32FPVSRS(0, r5, r4)
+	addi	r10,r4,THREAD_FPSTATE
+	SAVE_32FPVSRS(0, R5, R10)
 	mffs	fr0
-	stfd	fr0,THREAD_FPSCR(r4)
+	stfd	fr0,FPSTATE_FPSCR(r10)
 	PPC_LL	r5,PT_REGS(r4)
 	toreal(r5)
 	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
@@ -92,7 +163,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif /* CONFIG_SMP */
 	/* enable use of FP after return */
 #ifdef CONFIG_PPC32
-	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
+	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
 	lwz	r4,THREAD_FPEXC_MODE(r5)
 	ori	r9,r9,MSR_FP		/* enable FP for current */
 	or	r9,r9,r4
@@ -104,9 +175,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	or	r12,r12,r4
 	std	r12,_MSR(r1)
 #endif
-	lfd	fr0,THREAD_FPSCR(r5)
+	addi	r10,r5,THREAD_FPSTATE
+	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, r4, r5)
+	REST_32FPVSRS(0, R4, R10)
 #ifndef CONFIG_SMP
 	subi	r4,r5,THREAD
 	fromreal(r4)
@@ -138,11 +210,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	PPC_LCMPI	0,r3,0
 	beqlr-				/* if no previous owner, done */
 	addi	r3,r3,THREAD	        /* want THREAD of task */
+	PPC_LL	r6,THREAD_FPSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)
-	PPC_LCMPI	0,r5,0
-	SAVE_32FPVSRS(0, r4 ,r3)
+	PPC_LCMPI	0,r6,0
+	bne	2f
+	addi	r6,r3,THREAD_FPSTATE
+2:	PPC_LCMPI	0,r5,0
+	SAVE_32FPVSRS(0, R4, R6)
 	mffs	fr0
-	stfd	fr0,THREAD_FPSCR(r3)
+	stfd	fr0,FPSTATE_FPSCR(r6)
 	beq	1f
 	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 	li	r3,MSR_FP|MSR_FE0|MSR_FE1
diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
index a92c79be272..f22e7e44fbf 100644
--- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S
+++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
@@ -176,6 +176,8 @@ skpinv:	addi	r6,r6,1				/* Increment */
 /* 7. Jump to KERNELBASE mapping */
 	lis	r6,(KERNELBASE & ~0xfff)@h
 	ori	r6,r6,(KERNELBASE & ~0xfff)@l
+	rlwinm	r7,r25,0,0x03ffffff
+	add	r6,r7,r6
 
 #elif defined(ENTRY_MAPPING_KEXEC_SETUP)
 /*
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index bf99cfa6bbf..d178834fe50 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -10,6 +10,8 @@
  *
  */
 
+#define pr_fmt(fmt) "ftrace-powerpc: " fmt
+
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
 #include <linux/uaccess.h>
@@ -63,11 +65,9 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
 		return -EINVAL;
 
 	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, &new, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, new))
 		return -EPERM;
 
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 
@@ -76,6 +76,7 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
  */
 static int test_24bit_addr(unsigned long ip, unsigned long addr)
 {
+	addr = ppc_function_entry((void *)addr);
 
 	/* use the create_branch to verify that this offset can be branched */
 	return create_branch((unsigned int *)ip, addr, 0);
@@ -106,11 +107,9 @@ __ftrace_make_nop(struct module *mod,
 		  struct dyn_ftrace *rec, unsigned long addr)
 {
 	unsigned int op;
-	unsigned int jmp[5];
-	unsigned long ptr;
+	unsigned long entry, ptr;
 	unsigned long ip = rec->ip;
-	unsigned long tramp;
-	int offset;
+	void *tramp;
 
 	/* read where this goes */
 	if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
@@ -118,106 +117,52 @@ __ftrace_make_nop(struct module *mod,
 
 	/* Make sure that that this is still a 24bit jump */
 	if (!is_bl_op(op)) {
-		printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+		pr_err("Not expected bl: opcode is %x\n", op);
 		return -EINVAL;
 	}
 
 	/* lets find where the pointer goes */
-	tramp = find_bl_target(ip, op);
+	tramp = (void *)find_bl_target(ip, op);
 
-	/*
-	 * On PPC64 the trampoline looks like:
-	 * 0x3d, 0x82, 0x00, 0x00,    addis   r12,r2, <high>
-	 * 0x39, 0x8c, 0x00, 0x00,    addi    r12,r12, <low>
-	 *   Where the bytes 2,3,6 and 7 make up the 32bit offset
-	 *   to the TOC that holds the pointer.
-	 *   to jump to.
-	 * 0xf8, 0x41, 0x00, 0x28,    std     r2,40(r1)
-	 * 0xe9, 0x6c, 0x00, 0x20,    ld      r11,32(r12)
-	 *   The actually address is 32 bytes from the offset
-	 *   into the TOC.
-	 * 0xe8, 0x4c, 0x00, 0x28,    ld      r2,40(r12)
-	 */
-
-	pr_devel("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
+	pr_devel("ip:%lx jumps to %p", ip, tramp);
 
-	/* Find where the trampoline jumps to */
-	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
-		printk(KERN_ERR "Failed to read %lx\n", tramp);
-		return -EFAULT;
-	}
-
-	pr_devel(" %08x %08x", jmp[0], jmp[1]);
-
-	/* verify that this is what we expect it to be */
-	if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
-	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
-	    (jmp[2] != 0xf8410028) ||
-	    (jmp[3] != 0xe96c0020) ||
-	    (jmp[4] != 0xe84c0028)) {
-		printk(KERN_ERR "Not a trampoline\n");
+	if (!is_module_trampoline(tramp)) {
+		pr_err("Not a trampoline\n");
 		return -EINVAL;
 	}
 
-	/* The bottom half is signed extended */
-	offset = ((unsigned)((unsigned short)jmp[0]) << 16) +
-		(int)((short)jmp[1]);
-
-	pr_devel(" %x ", offset);
-
-	/* get the address this jumps too */
-	tramp = mod->arch.toc + offset + 32;
-	pr_devel("toc: %lx", tramp);
-
-	if (probe_kernel_read(jmp, (void *)tramp, 8)) {
-		printk(KERN_ERR "Failed to read %lx\n", tramp);
+	if (module_trampoline_target(mod, tramp, &ptr)) {
+		pr_err("Failed to get trampoline target\n");
 		return -EFAULT;
 	}
 
-	pr_devel(" %08x %08x\n", jmp[0], jmp[1]);
-
-	ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
+	pr_devel("trampoline target %lx", ptr);
 
+	entry = ppc_global_function_entry((void *)addr);
 	/* This should match what was called */
-	if (ptr != ppc_function_entry((void *)addr)) {
-		printk(KERN_ERR "addr does not match %lx\n", ptr);
+	if (ptr != entry) {
+		pr_err("addr %lx does not match expected %lx\n", ptr, entry);
 		return -EINVAL;
 	}
 
 	/*
-	 * We want to nop the line, but the next line is
-	 *  0xe8, 0x41, 0x00, 0x28   ld r2,40(r1)
-	 * This needs to be turned to a nop too.
-	 */
-	if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	if (op != 0xe8410028) {
-		printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
-		return -EINVAL;
-	}
-
-	/*
-	 * Milton Miller pointed out that we can not blindly do nops.
-	 * If a task was preempted when calling a trace function,
-	 * the nops will remove the way to restore the TOC in r2
-	 * and the r2 TOC will get corrupted.
-	 */
-
-	/*
-	 * Replace:
-	 *   bl <tramp>  <==== will be replaced with "b 1f"
-	 *   ld r2,40(r1)
-	 *  1:
+	 * Our original call site looks like:
+	 *
+	 * bl <tramp>
+	 * ld r2,XX(r1)
+	 *
+	 * Milton Miller pointed out that we can not simply nop the branch.
+	 * If a task was preempted when calling a trace function, the nops
+	 * will remove the way to restore the TOC in r2 and the r2 TOC will
+	 * get corrupted.
+	 *
+	 * Use a b +8 to jump over the load.
 	 */
 	op = 0x48000008;	/* b +8 */
 
-	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, op))
 		return -EPERM;
 
-
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 
@@ -236,7 +181,7 @@ __ftrace_make_nop(struct module *mod,
 
 	/* Make sure that that this is still a 24bit jump */
 	if (!is_bl_op(op)) {
-		printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+		pr_err("Not expected bl: opcode is %x\n", op);
 		return -EINVAL;
 	}
 
@@ -245,9 +190,9 @@ __ftrace_make_nop(struct module *mod,
 
 	/*
 	 * On PPC32 the trampoline looks like:
-	 *  0x3d, 0x60, 0x00, 0x00  lis r11,sym@ha
-	 *  0x39, 0x6b, 0x00, 0x00  addi r11,r11,sym@l
-	 *  0x7d, 0x69, 0x03, 0xa6  mtctr r11
+	 *  0x3d, 0x80, 0x00, 0x00  lis r12,sym@ha
+	 *  0x39, 0x8c, 0x00, 0x00  addi r12,r12,sym@l
+	 *  0x7d, 0x89, 0x03, 0xa6  mtctr r12
 	 *  0x4e, 0x80, 0x04, 0x20  bctr
 	 */
 
@@ -255,18 +200,18 @@ __ftrace_make_nop(struct module *mod,
 
 	/* Find where the trampoline jumps to */
 	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
-		printk(KERN_ERR "Failed to read %lx\n", tramp);
+		pr_err("Failed to read %lx\n", tramp);
 		return -EFAULT;
 	}
 
 	pr_devel(" %08x %08x ", jmp[0], jmp[1]);
 
 	/* verify that this is what we expect it to be */
-	if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
-	    ((jmp[1] & 0xffff0000) != 0x396b0000) ||
-	    (jmp[2] != 0x7d6903a6) ||
+	if (((jmp[0] & 0xffff0000) != 0x3d800000) ||
+	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+	    (jmp[2] != 0x7d8903a6) ||
 	    (jmp[3] != 0x4e800420)) {
-		printk(KERN_ERR "Not a trampoline\n");
+		pr_err("Not a trampoline\n");
 		return -EINVAL;
 	}
 
@@ -278,19 +223,16 @@ __ftrace_make_nop(struct module *mod,
 	pr_devel(" %lx ", tramp);
 
 	if (tramp != addr) {
-		printk(KERN_ERR
-		       "Trampoline location %08lx does not match addr\n",
+		pr_err("Trampoline location %08lx does not match addr\n",
 		       tramp);
 		return -EINVAL;
 	}
 
 	op = PPC_INST_NOP;
 
-	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, op))
 		return -EPERM;
 
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 #endif /* PPC64 */
@@ -322,15 +264,13 @@ int ftrace_make_nop(struct module *mod,
 	 */
 	if (!rec->arch.mod) {
 		if (!mod) {
-			printk(KERN_ERR "No module loaded addr=%lx\n",
-			       addr);
+			pr_err("No module loaded addr=%lx\n", addr);
 			return -EFAULT;
 		}
 		rec->arch.mod = mod;
 	} else if (mod) {
 		if (mod != rec->arch.mod) {
-			printk(KERN_ERR
-			       "Record mod %p not equal to passed in mod %p\n",
+			pr_err("Record mod %p not equal to passed in mod %p\n",
 			       rec->arch.mod, mod);
 			return -EINVAL;
 		}
@@ -351,45 +291,42 @@ static int
 __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
 	unsigned int op[2];
-	unsigned long ip = rec->ip;
+	void *ip = (void *)rec->ip;
 
 	/* read where this goes */
-	if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
+	if (probe_kernel_read(op, ip, sizeof(op)))
 		return -EFAULT;
 
 	/*
-	 * It should be pointing to two nops or
-	 *  b +8; ld r2,40(r1)
+	 * We expect to see:
+	 *
+	 * b +8
+	 * ld r2,XX(r1)
+	 *
+	 * The load offset is different depending on the ABI. For simplicity
+	 * just mask it out when doing the compare.
 	 */
-	if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
-	    ((op[0] != PPC_INST_NOP) || (op[1] != PPC_INST_NOP))) {
-		printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
+	if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
+		pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
 		return -EINVAL;
 	}
 
 	/* If we never set up a trampoline to ftrace_caller, then bail */
 	if (!rec->arch.mod->arch.tramp) {
-		printk(KERN_ERR "No ftrace trampoline\n");
+		pr_err("No ftrace trampoline\n");
 		return -EINVAL;
 	}
 
-	/* create the branch to the trampoline */
-	op[0] = create_branch((unsigned int *)ip,
-			      rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
-	if (!op[0]) {
-		printk(KERN_ERR "REL24 out of range!\n");
+	/* Ensure branch is within 24 bits */
+	if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+		pr_err("Branch out of range\n");
 		return -EINVAL;
 	}
 
-	/* ld r2,40(r1) */
-	op[1] = 0xe8410028;
-
-	pr_devel("write to %lx\n", rec->ip);
-
-	if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
-		return -EPERM;
-
-	flush_icache_range(ip, ip + 8);
+	if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+		pr_err("REL24 out of range!\n");
+		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -406,13 +343,13 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 
 	/* It should be pointing to a nop */
 	if (op != PPC_INST_NOP) {
-		printk(KERN_ERR "Expected NOP but have %x\n", op);
+		pr_err("Expected NOP but have %x\n", op);
 		return -EINVAL;
 	}
 
 	/* If we never set up a trampoline to ftrace_caller, then bail */
 	if (!rec->arch.mod->arch.tramp) {
-		printk(KERN_ERR "No ftrace trampoline\n");
+		pr_err("No ftrace trampoline\n");
 		return -EINVAL;
 	}
 
@@ -420,17 +357,15 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	op = create_branch((unsigned int *)ip,
 			   rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
 	if (!op) {
-		printk(KERN_ERR "REL24 out of range!\n");
+		pr_err("REL24 out of range!\n");
 		return -EINVAL;
 	}
 
 	pr_devel("write to %lx\n", rec->ip);
 
-	if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+	if (patch_instruction((unsigned int *)ip, op))
 		return -EPERM;
 
-	flush_icache_range(ip, ip + 8);
-
 	return 0;
 }
 #endif /* CONFIG_PPC64 */
@@ -460,7 +395,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	 * already have a module defined.
 	 */
 	if (!rec->arch.mod) {
-		printk(KERN_ERR "No module loaded\n");
+		pr_err("No module loaded\n");
 		return -EINVAL;
 	}
 
@@ -484,13 +419,60 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ret;
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR;
+	int ret;
+
+	ret = ftrace_update_record(rec, enable);
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+	case FTRACE_UPDATE_MAKE_CALL:
+		return ftrace_make_call(rec, ftrace_addr);
+	case FTRACE_UPDATE_MAKE_NOP:
+		return ftrace_make_nop(NULL, rec, ftrace_addr);
+	}
+
+	return 0;
+}
+
+void ftrace_replace_code(int enable)
 {
-	/* caller expects data to be zero */
-	unsigned long *p = data;
+	struct ftrace_rec_iter *iter;
+	struct dyn_ftrace *rec;
+	int ret;
 
-	*p = 0;
+	for (iter = ftrace_rec_iter_start(); iter;
+	     iter = ftrace_rec_iter_next(iter)) {
+		rec = ftrace_rec_iter_record(iter);
+		ret = __ftrace_replace_code(rec, enable);
+		if (ret) {
+			ftrace_bug(ret, rec->ip);
+			return;
+		}
+	}
+}
+
+void arch_ftrace_update_code(int command)
+{
+	if (command & FTRACE_UPDATE_CALLS)
+		ftrace_replace_code(1);
+	else if (command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (command & FTRACE_START_FUNC_RET)
+		ftrace_enable_ftrace_graph_caller();
+	else if (command & FTRACE_STOP_FUNC_RET)
+		ftrace_disable_ftrace_graph_caller();
+}
 
+int __init ftrace_dyn_arch_init(void)
+{
 	return 0;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
@@ -587,18 +569,17 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	if (ftrace_push_return_trace(old, self_addr, &trace.depth, 0) == -EBUSY) {
-		*parent = old;
-		return;
-	}
-
 	trace.func = self_addr;
+	trace.depth = current->curr_ret_stack + 1;
 
 	/* Only trace if the calling function expects to */
 	if (!ftrace_graph_entry(&trace)) {
-		current->curr_ret_stack--;
 		*parent = old;
+		return;
 	}
+
+	if (ftrace_push_return_trace(old, self_addr, &trace.depth, 0) == -EBUSY)
+		*parent = old;
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 0654dba2c1f..dc0488b6f6e 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -395,7 +395,7 @@ DataAccess:
 	bl	hash_page
 1:	lwz	r5,_DSISR(r11)		/* get DSISR value */
 	mfspr	r4,SPRN_DAR
-	EXC_XFER_EE_LITE(0x300, handle_page_fault)
+	EXC_XFER_LITE(0x300, handle_page_fault)
 
 
 /* Instruction access exception. */
@@ -410,7 +410,7 @@ InstructionAccess:
 	bl	hash_page
 1:	mr	r4,r12
 	mr	r5,r9
-	EXC_XFER_EE_LITE(0x400, handle_page_fault)
+	EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* External interrupt */
 	EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index 872a6af83ba..7d7d8635227 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -394,7 +394,7 @@ label:
 	NORMAL_EXCEPTION_PROLOG
 	mr	r4,r12			/* Pass SRR0 as arg2 */
 	li	r5,0			/* Pass zero as arg3 */
-	EXC_XFER_EE_LITE(0x400, handle_page_fault)
+	EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* 0x0500 - External Interrupt Exception */
 	EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
@@ -430,30 +430,18 @@ label:
 	EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE)
 
 /* 0x1000 - Programmable Interval Timer (PIT) Exception */
-	START_EXCEPTION(0x1000, Decrementer)
-	NORMAL_EXCEPTION_PROLOG
-	lis	r0,TSR_PIS@h
-	mtspr	SPRN_TSR,r0		/* Clear the PIT exception */
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	EXC_XFER_LITE(0x1000, timer_interrupt)
-
-#if 0
-/* NOTE:
- * FIT and WDT handlers are not implemented yet.
- */
+	. = 0x1000
+	b Decrementer
 
 /* 0x1010 - Fixed Interval Timer (FIT) Exception
 */
-	STND_EXCEPTION(0x1010,	FITException,		unknown_exception)
+	. = 0x1010
+	b FITException
 
 /* 0x1020 - Watchdog Timer (WDT) Exception
 */
-#ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x1020, WDTException, WatchdogException)
-#else
-	CRITICAL_EXCEPTION(0x1020, WDTException, unknown_exception)
-#endif
-#endif
+	. = 0x1020
+	b WDTException
 
 /* 0x1100 - Data TLB Miss Exception
  * As the name implies, translation is not in the MMU, so search the
@@ -738,6 +726,29 @@ label:
 		(MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
 		NOCOPY, crit_transfer_to_handler, ret_from_crit_exc)
 
+	/* Programmable Interval Timer (PIT) Exception. (from 0x1000) */
+Decrementer:
+	NORMAL_EXCEPTION_PROLOG
+	lis	r0,TSR_PIS@h
+	mtspr	SPRN_TSR,r0		/* Clear the PIT exception */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_LITE(0x1000, timer_interrupt)
+
+	/* Fixed Interval Timer (FIT) Exception. (from 0x1010) */
+FITException:
+	NORMAL_EXCEPTION_PROLOG
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	EXC_XFER_EE(0x1010, unknown_exception)
+
+	/* Watchdog Timer (WDT) Exception. (from 0x1020) */
+WDTException:
+	CRITICAL_EXCEPTION_PROLOG;
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2,
+	                  (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)),
+			  NOCOPY, crit_transfer_to_handler,
+			  ret_from_crit_exc)
+
 /*
  * The other Data TLB exceptions bail out to this point
  * if they can't resolve the lightweight TLB fault.
@@ -747,7 +758,7 @@ DataAccess:
 	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
 	stw	r5,_ESR(r11)
 	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
-	EXC_XFER_EE_LITE(0x300, handle_page_fault)
+	EXC_XFER_LITE(0x300, handle_page_fault)
 
 /* Other PowerPC processors, namely those derived from the 6xx-series
  * have vectors from 0x2100 through 0x2F00 defined, but marked as reserved.
@@ -811,14 +822,6 @@ finish_tlb_load:
 	rfi			/* Should sync shadow TLBs */
 	b	.		/* prevent prefetch past rfi */
 
-/* extern void giveup_fpu(struct task_struct *prev)
- *
- * The PowerPC 4xx family of processors do not have an FPU, so this just
- * returns.
- */
-_ENTRY(giveup_fpu)
-	blr
-
 /* This is where the main kernel code starts.
  */
 start_here:
@@ -927,25 +930,6 @@ initial_mmu:
 	tlbwe	r4,r0,TLB_DATA		/* Load the data portion of the entry */
 	tlbwe	r3,r0,TLB_TAG		/* Load the tag portion of the entry */
 
-#if defined(CONFIG_SERIAL_TEXT_DEBUG) && defined(SERIAL_DEBUG_IO_BASE)
-
-	/* Load a TLB entry for the UART, so that ppc4xx_progress() can use
-	 * the UARTs nice and early.  We use a 4k real==virtual mapping. */
-
-	lis	r3,SERIAL_DEBUG_IO_BASE@h
-	ori	r3,r3,SERIAL_DEBUG_IO_BASE@l
-	mr	r4,r3
-	clrrwi	r4,r4,12
-	ori	r4,r4,(TLB_WR|TLB_I|TLB_M|TLB_G)
-
-	clrrwi	r3,r3,12
-	ori	r3,r3,(TLB_VALID | TLB_PAGESZ(PAGESZ_4K))
-
-	li	r0,0			/* TLB slot 0 */
-	tlbwe	r4,r0,TLB_DATA
-	tlbwe	r3,r0,TLB_TAG
-#endif /* CONFIG_SERIAL_DEBUG_TEXT && SERIAL_DEBUG_IO_BASE */
-
 	isync
 
 	/* Establish the exception vector base
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 7dd2981bcc5..c334f53453f 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -248,10 +248,11 @@ _ENTRY(_start);
 
 interrupt_base:
 	/* Critical Input Interrupt */
-	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
 
 	/* Machine Check Interrupt */
-	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
 	MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception)
 
 	/* Data Storage Interrupt */
@@ -261,7 +262,8 @@ interrupt_base:
 	INSTRUCTION_STORAGE_EXCEPTION
 
 	/* External Input Interrupt */
-	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+	EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \
+		  do_IRQ, EXC_XFER_LITE)
 
 	/* Alignment Interrupt */
 	ALIGNMENT_EXCEPTION
@@ -273,29 +275,32 @@ interrupt_base:
 #ifdef CONFIG_PPC_FPU
 	FP_UNAVAILABLE_EXCEPTION
 #else
-	EXCEPTION(0x2010, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \
+		  FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
 #endif
 	/* System Call Interrupt */
 	START_EXCEPTION(SystemCall)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL)
 	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
 
 	/* Auxiliary Processor Unavailable Interrupt */
-	EXCEPTION(0x2020, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \
+		  AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
 
 	/* Decrementer Interrupt */
 	DECREMENTER_EXCEPTION
 
 	/* Fixed Internal Timer Interrupt */
 	/* TODO: Add FIT support */
-	EXCEPTION(0x1010, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Watchdog Timer Interrupt */
 	/* TODO: Add watchdog support */
 #ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x1020, WatchdogTimer, WatchdogException)
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException)
 #else
-	CRITICAL_EXCEPTION(0x1020, WatchdogTimer, unknown_exception)
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception)
 #endif
 
 	/* Data TLB Error Interrupt */
@@ -764,6 +769,8 @@ finish_tlb_load_47x:
 	 */
 	DEBUG_CRIT_EXCEPTION
 
+interrupt_end:
+
 /*
  * Global functions
  */
@@ -777,24 +784,6 @@ _GLOBAL(__fixup_440A_mcheck)
 	sync
 	blr
 
-/*
- * extern void giveup_altivec(struct task_struct *prev)
- *
- * The 44x core does not have an AltiVec unit.
- */
-_GLOBAL(giveup_altivec)
-	blr
-
-/*
- * extern void giveup_fpu(struct task_struct *prev)
- *
- * The 44x core does not have an FPU.
- */
-#ifndef CONFIG_PPC_FPU
-_GLOBAL(giveup_fpu)
-	blr
-#endif
-
 _GLOBAL(set_context)
 
 #ifdef CONFIG_BDI_SWITCH
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 06c7251c1bf..a95145d7f61 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -23,6 +23,7 @@
  */
 
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <asm/reg.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@@ -32,13 +33,13 @@
 #include <asm/cputable.h>
 #include <asm/setup.h>
 #include <asm/hvcall.h>
-#include <asm/iseries/lpar_map.h>
 #include <asm/thread_info.h>
 #include <asm/firmware.h>
 #include <asm/page_64.h>
 #include <asm/irqflags.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/ptrace.h>
+#include <asm/hw_irq.h>
 
 /* The physical memory is laid out such that the secondary processor
  * spin code sits at 0x0000...0x00ff. On server, the vectors follow
@@ -57,10 +58,6 @@
  *	entry in r9 for debugging purposes
  *   2. Secondary processors enter at 0x60 with PIR in gpr3
  *
- *  For iSeries:
- *   1. The MMU is on (as it always is for iSeries)
- *   2. The kernel is entered at system_reset_iSeries
- *
  *  For Book3E processors:
  *   1. The MMU is on running in AS0 in a state defined in ePAPR
  *   2. The kernel is entered at __start
@@ -72,17 +69,18 @@ _stext:
 _GLOBAL(__start)
 	/* NOP this out unconditionally */
 BEGIN_FTR_SECTION
-	b	.__start_initialization_multiplatform
+	FIXUP_ENDIAN
+	b	__start_initialization_multiplatform
 END_FTR_SECTION(0, 1)
 
 	/* Catch branch to 0 in real mode */
 	trap
 
-	/* Secondary processors spin on this value until it becomes nonzero.
-	 * When it does it contains the real address of the descriptor
-	 * of the function that the cpu should jump to to continue
-	 * initialization.
+	/* Secondary processors spin on this value until it becomes non-zero.
+	 * When non-zero, it contains the real address of the function the cpu
+	 * should jump to.
 	 */
+	.balign 8
 	.globl  __secondary_hold_spinloop
 __secondary_hold_spinloop:
 	.llong	0x0
@@ -93,15 +91,6 @@ __secondary_hold_spinloop:
 __secondary_hold_acknowledge:
 	.llong	0x0
 
-#ifdef CONFIG_PPC_ISERIES
-	/*
-	 * At offset 0x20, there is a pointer to iSeries LPAR data.
-	 * This is required by the hypervisor
-	 */
-	. = 0x20
-	.llong hvReleaseData-KERNELBASE
-#endif /* CONFIG_PPC_ISERIES */
-
 #ifdef CONFIG_RELOCATABLE
 	/* This flag is set to 1 by a loader if the kernel should run
 	 * at the loaded address instead of the linked address.  This
@@ -128,6 +117,7 @@ __run_at_load:
  */
 	.globl	__secondary_hold
 __secondary_hold:
+	FIXUP_ENDIAN
 #ifndef CONFIG_PPC_BOOK3E
 	mfmsr	r24
 	ori	r24,r24,MSR_RI
@@ -135,6 +125,8 @@ __secondary_hold:
 #endif
 	/* Grab our physical cpu number */
 	mr	r24,r3
+	/* stash r4 for book3e */
+	mr	r25,r4
 
 	/* Tell the master cpu we're here */
 	/* Relocation is off & we are located at an address less */
@@ -142,16 +134,30 @@ __secondary_hold:
 	std	r24,__secondary_hold_acknowledge-_stext(0)
 	sync
 
+	li	r26,0
+#ifdef CONFIG_PPC_BOOK3E
+	tovirt(r26,r26)
+#endif
 	/* All secondary cpus wait here until told to start. */
-100:	ld	r4,__secondary_hold_spinloop-_stext(0)
-	cmpdi	0,r4,0
+100:	ld	r12,__secondary_hold_spinloop-_stext(r26)
+	cmpdi	0,r12,0
 	beq	100b
 
 #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC)
-	ld	r4,0(r4)		/* deref function descriptor */
-	mtctr	r4
+#ifdef CONFIG_PPC_BOOK3E
+	tovirt(r12,r12)
+#endif
+	mtctr	r12
 	mr	r3,r24
+	/*
+	 * it may be the case that other platforms have r4 right to
+	 * begin with, this gives us some safety in case it is not
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	mr	r4,r25
+#else
 	li	r4,0
+#endif
 	/* Make sure that patched code is visible */
 	isync
 	bctr
@@ -178,15 +184,16 @@ _GLOBAL(generic_secondary_thread_init)
 	mr	r24,r3
 
 	/* turn on 64-bit mode */
-	bl	.enable_64b_mode
+	bl	enable_64b_mode
 
 	/* get a valid TOC pointer, wherever we're mapped at */
-	bl	.relative_toc
+	bl	relative_toc
+	tovirt(r2,r2)
 
 #ifdef CONFIG_PPC_BOOK3E
 	/* Book3E initialization */
 	mr	r3,r24
-	bl	.book3e_secondary_thread_init
+	bl	book3e_secondary_thread_init
 #endif
 	b	generic_secondary_common_init
 
@@ -200,20 +207,22 @@ _GLOBAL(generic_secondary_thread_init)
  * as SCOM before entry).
  */
 _GLOBAL(generic_secondary_smp_init)
+	FIXUP_ENDIAN
 	mr	r24,r3
 	mr	r25,r4
 
 	/* turn on 64-bit mode */
-	bl	.enable_64b_mode
+	bl	enable_64b_mode
 
 	/* get a valid TOC pointer, wherever we're mapped at */
-	bl	.relative_toc
+	bl	relative_toc
+	tovirt(r2,r2)
 
 #ifdef CONFIG_PPC_BOOK3E
 	/* Book3E initialization */
 	mr	r3,r24
 	mr	r4,r25
-	bl	.book3e_secondary_core_init
+	bl	book3e_secondary_core_init
 #endif
 
 generic_secondary_common_init:
@@ -225,7 +234,7 @@ generic_secondary_common_init:
 	ld	r13,0(r13)		/* Get base vaddr of paca array	 */
 #ifndef CONFIG_SMP
 	addi	r13,r13,PACA_SIZE	/* know r13 if used accidentally */
-	b	.kexec_wait		/* wait for next kernel if !SMP	 */
+	b	kexec_wait		/* wait for next kernel if !SMP	 */
 #else
 	LOAD_REG_ADDR(r7, nr_cpu_ids)	/* Load nr_cpu_ids address       */
 	lwz	r7,0(r7)		/* also the max paca allocated 	 */
@@ -239,7 +248,7 @@ generic_secondary_common_init:
 	blt	1b
 
 	mr	r3,r24			/* not found, copy phys to r3	 */
-	b	.kexec_wait		/* next kernel might do better	 */
+	b	kexec_wait		/* next kernel might do better	 */
 
 2:	SET_PACA(r13)
 #ifdef CONFIG_PPC_BOOK3E
@@ -253,11 +262,13 @@ generic_secondary_common_init:
 	/* See if we need to call a cpu state restore handler */
 	LOAD_REG_ADDR(r23, cur_cpu_spec)
 	ld	r23,0(r23)
-	ld	r23,CPU_SPEC_RESTORE(r23)
-	cmpdi	0,r23,0
+	ld	r12,CPU_SPEC_RESTORE(r23)
+	cmpdi	0,r12,0
 	beq	3f
-	ld	r23,0(r23)
-	mtctr	r23
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	ld	r12,0(r12)
+#endif
+	mtctr	r12
 	bctrl
 
 3:	LOAD_REG_ADDR(r3, spinning_secondaries) /* Decrement spinning_secondaries */
@@ -288,7 +299,7 @@ generic_secondary_common_init:
  * Assumes we're mapped EA == RA if the MMU is on.
  */
 #ifdef CONFIG_PPC_BOOK3S
-_STATIC(__mmu_off)
+__mmu_off:
 	mfmsr	r3
 	andi.	r0,r3,MSR_IR|MSR_DR
 	beqlr
@@ -313,12 +324,12 @@ _STATIC(__mmu_off)
  *                 DT block, r4 is a physical pointer to the kernel itself
  *
  */
-_GLOBAL(__start_initialization_multiplatform)
+__start_initialization_multiplatform:
 	/* Make sure we are running in 64 bits mode */
-	bl	.enable_64b_mode
+	bl	enable_64b_mode
 
 	/* Get TOC pointer (current runtime address) */
-	bl	.relative_toc
+	bl	relative_toc
 
 	/* find out where we are now */
 	bcl	20,31,$+4
@@ -331,7 +342,7 @@ _GLOBAL(__start_initialization_multiplatform)
 	 */
 	cmpldi	cr0,r5,0
 	beq	1f
-	b	.__boot_from_prom		/* yes -> prom */
+	b	__boot_from_prom		/* yes -> prom */
 1:
 	/* Save parameters */
 	mr	r31,r3
@@ -343,8 +354,8 @@ _GLOBAL(__start_initialization_multiplatform)
 #endif
 
 #ifdef CONFIG_PPC_BOOK3E
-	bl	.start_initialization_book3e
-	b	.__after_prom_start
+	bl	start_initialization_book3e
+	b	__after_prom_start
 #else
 	/* Setup some critical 970 SPRs before switching MMU off */
 	mfspr	r0,SPRN_PVR
@@ -357,15 +368,15 @@ _GLOBAL(__start_initialization_multiplatform)
 	beq	1f
 	cmpwi	r0,0x45		/* 970GX */
 	bne	2f
-1:	bl	.__cpu_preinit_ppc970
+1:	bl	__cpu_preinit_ppc970
 2:
 
 	/* Switch off MMU if not already off */
-	bl	.__mmu_off
-	b	.__after_prom_start
+	bl	__mmu_off
+	b	__after_prom_start
 #endif /* CONFIG_PPC_BOOK3E */
 
-_INIT_STATIC(__boot_from_prom)
+__boot_from_prom:
 #ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
 	/* Save parameters */
 	mr	r31,r3
@@ -384,7 +395,7 @@ _INIT_STATIC(__boot_from_prom)
 #ifdef CONFIG_RELOCATABLE
 	/* Relocate code for where we are now */
 	mr	r3,r26
-	bl	.relocate
+	bl	relocate
 #endif
 
 	/* Restore parameters */
@@ -396,14 +407,14 @@ _INIT_STATIC(__boot_from_prom)
 
 	/* Do all of the interaction with OF client interface */
 	mr	r8,r26
-	bl	.prom_init
+	bl	prom_init
 #endif /* #CONFIG_PPC_OF_BOOT_TRAMPOLINE */
 
 	/* We never return. We also hit that trap if trying to boot
 	 * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */
 	trap
 
-_STATIC(__after_prom_start)
+__after_prom_start:
 #ifdef CONFIG_RELOCATABLE
 	/* process relocations for the final address of the kernel */
 	lis	r25,PAGE_OFFSET@highest	/* compute virtual base of kernel */
@@ -413,7 +424,7 @@ _STATIC(__after_prom_start)
 	bne	1f
 	add	r25,r25,r26
 1:	mr	r3,r25
-	bl	.relocate
+	bl	relocate
 #endif
 
 /*
@@ -435,7 +446,7 @@ _STATIC(__after_prom_start)
 	tovirt(r6,r6)			/* on booke, we already run at PAGE_OFFSET */
 #endif
 
-#ifdef CONFIG_CRASH_DUMP
+#ifdef CONFIG_RELOCATABLE
 /*
  * Check if the kernel has to be running as relocatable kernel based on the
  * variable __run_at_load, if it is set the kernel is treated as relocatable
@@ -445,29 +456,31 @@ _STATIC(__after_prom_start)
 	cmplwi	cr0,r7,1
 	bne	3f
 
-	li	r5,__end_interrupts - _stext	/* just copy interrupts */
+	/* just copy interrupts */
+	LOAD_REG_IMMEDIATE(r5, __end_interrupts - _stext)
 	b	5f
 3:
 #endif
 	lis	r5,(copy_to_here - _stext)@ha
 	addi	r5,r5,(copy_to_here - _stext)@l /* # bytes of memory to copy */
 
-	bl	.copy_and_flush		/* copy the first n bytes	 */
+	bl	copy_and_flush		/* copy the first n bytes	 */
 					/* this includes the code being	 */
 					/* executed here.		 */
 	addis	r8,r3,(4f - _stext)@ha	/* Jump to the copy of this code */
-	addi	r8,r8,(4f - _stext)@l	/* that we just made */
-	mtctr	r8
+	addi	r12,r8,(4f - _stext)@l	/* that we just made */
+	mtctr	r12
 	bctr
 
+.balign 8
 p_end:	.llong	_end - _stext
 
 4:	/* Now copy the rest of the kernel up to _end */
 	addis	r5,r26,(p_end - _stext)@ha
 	ld	r5,(p_end - _stext)@l(r5)	/* get _end */
-5:	bl	.copy_and_flush		/* copy the rest */
+5:	bl	copy_and_flush		/* copy the rest */
 
-9:	b	.start_here_multiplatform
+9:	b	start_here_multiplatform
 
 /*
  * Copy routine used to copy the kernel to start at physical address 0
@@ -502,6 +515,7 @@ _GLOBAL(copy_and_flush)
 	sync
 	addi	r5,r5,8
 	addi	r6,r6,8
+	isync
 	blr
 
 .align 8
@@ -530,7 +544,7 @@ __secondary_start_pmac_0:
 	
 _GLOBAL(pmac_secondary_start)
 	/* turn on 64-bit mode */
-	bl	.enable_64b_mode
+	bl	enable_64b_mode
 
 	li	r0,0
 	mfspr	r3,SPRN_HID4
@@ -542,10 +556,11 @@ _GLOBAL(pmac_secondary_start)
 	slbia
 
 	/* get TOC pointer (real address) */
-	bl	.relative_toc
+	bl	relative_toc
+	tovirt(r2,r2)
 
 	/* Copy some CPU settings from CPU 0 */
-	bl	.__restore_cpu_ppc970
+	bl	__restore_cpu_ppc970
 
 	/* pSeries do that early though I don't think we really need it */
 	mfmsr	r3
@@ -564,7 +579,8 @@ _GLOBAL(pmac_secondary_start)
 	 */
 	li	r0,0
 	stb	r0,PACASOFTIRQEN(r13)
-	stb	r0,PACAHARDIRQEN(r13)
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
 
 	/* Create a temp kernel stack for use before relocation is on.	*/
 	ld	r1,PACAEMERGSP(r13)
@@ -582,7 +598,7 @@ _GLOBAL(pmac_secondary_start)
  *   1. Processor number
  *   2. Segment table pointer (virtual address)
  * On entry the following are set:
- *   r1	       = stack pointer.  vaddr for iSeries, raddr (temp stack) for pSeries
+ *   r1	       = stack pointer (real addr of temp stack)
  *   r24       = cpu# (in Linux terms)
  *   r13       = paca virtual address
  *   SPRG_PACA = paca virtual address
@@ -595,7 +611,7 @@ __secondary_start:
 	/* Set thread priority to MEDIUM */
 	HMT_MEDIUM
 
-	/* Initialize the kernel stack.  Just a repeat for iSeries.	 */
+	/* Initialize the kernel stack */
 	LOAD_REG_ADDR(r3, current_set)
 	sldi	r28,r24,3		/* get current_set[cpu#]	 */
 	ldx	r14,r3,r28
@@ -603,7 +619,7 @@ __secondary_start:
 	std	r14,PACAKSAVE(r13)
 
 	/* Do early setup for that CPU (stab, slb, hash table pointer) */
-	bl	.early_setup_secondary
+	bl	early_setup_secondary
 
 	/*
 	 * setup the new stack pointer, but *don't* use this until
@@ -615,20 +631,16 @@ __secondary_start:
 	li	r7,0
 	mtlr	r7
 
+	/* Mark interrupts soft and hard disabled (they might be enabled
+	 * in the PACA when doing hotplug)
+	 */
+	stb	r7,PACASOFTIRQEN(r13)
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+
 	/* enable MMU and jump to start_secondary */
-	LOAD_REG_ADDR(r3, .start_secondary_prolog)
+	LOAD_REG_ADDR(r3, start_secondary_prolog)
 	LOAD_REG_IMMEDIATE(r4, MSR_KERNEL)
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	ori	r4,r4,MSR_EE
-	li	r8,1
-	stb	r8,PACAHARDIRQEN(r13)
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif
-BEGIN_FW_FTR_SECTION
-	stb	r7,PACAHARDIRQEN(r13)
-END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
-	stb	r7,PACASOFTIRQEN(r13)
 
 	mtspr	SPRN_SRR0,r3
 	mtspr	SPRN_SRR1,r4
@@ -640,11 +652,11 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
  * zero the stack back-chain pointer and get the TOC virtual address
  * before going into C code.
  */
-_GLOBAL(start_secondary_prolog)
+start_secondary_prolog:
 	ld	r2,PACATOC(r13)
 	li	r3,0
 	std	r3,0(r1)		/* Zero the stack frame pointer	*/
-	bl	.start_secondary
+	bl	start_secondary
 	b	.
 /*
  * Reset stack pointer and call start_secondary
@@ -655,14 +667,14 @@ _GLOBAL(start_secondary_resume)
 	ld	r1,PACAKSAVE(r13)	/* Reload kernel stack pointer */
 	li	r3,0
 	std	r3,0(r1)		/* Zero the stack frame pointer	*/
-	bl	.start_secondary
+	bl	start_secondary
 	b	.
 #endif
 
 /*
  * This subroutine clobbers r11 and r12
  */
-_GLOBAL(enable_64b_mode)
+enable_64b_mode:
 	mfmsr	r11			/* grab the current MSR */
 #ifdef CONFIG_PPC_BOOK3E
 	oris	r11,r11,0x8000		/* CM bit set, we'll set ICM later */
@@ -680,6 +692,13 @@ _GLOBAL(enable_64b_mode)
  * This puts the TOC pointer into r2, offset by 0x8000 (as expected
  * by the toolchain).  It computes the correct value for wherever we
  * are running at the moment, using position-independent code.
+ *
+ * Note: The compiler constructs pointers using offsets from the
+ * TOC in -mcmodel=medium mode. After we relocate to 0 but before
+ * the MMU is on we need our TOC to be a virtual address otherwise
+ * these pointers will be real addresses which may get stored and
+ * accessed later with the MMU on. We use tovirt() at the call
+ * sites to handle this.
  */
 _GLOBAL(relative_toc)
 	mflr	r0
@@ -690,14 +709,16 @@ _GLOBAL(relative_toc)
 	mtlr	r0
 	blr
 
+.balign 8
 p_toc:	.llong	__toc_start + 0x8000 - 0b
 
 /*
  * This is where the main kernel code starts.
  */
-_INIT_STATIC(start_here_multiplatform)
-	/* set up the TOC (real address) */
-	bl	.relative_toc
+start_here_multiplatform:
+	/* set up the TOC */
+	bl      relative_toc
+	tovirt(r2,r2)
 
 	/* Clear out the BSS. It may have been done in prom_init,
 	 * already but that's irrelevant since prom_init will soon
@@ -719,6 +740,7 @@ _INIT_STATIC(start_here_multiplatform)
 
 #ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
 	/* Setup OPAL entry */
+	LOAD_REG_ADDR(r11, opal)
 	std	r28,0(r11);
 	std	r29,8(r11);
 #endif
@@ -754,9 +776,9 @@ _INIT_STATIC(start_here_multiplatform)
 
 	/* Restore parameters passed from prom_init/kexec */
 	mr	r3,r31
-	bl	.early_setup		/* also sets r13 and SPRG_PACA */
+	bl	early_setup		/* also sets r13 and SPRG_PACA */
 
-	LOAD_REG_ADDR(r3, .start_here_common)
+	LOAD_REG_ADDR(r3, start_here_common)
 	ld	r4,PACAKMSR(r13)
 	mtspr	SPRN_SRR0,r3
 	mtspr	SPRN_SRR1,r4
@@ -764,30 +786,27 @@ _INIT_STATIC(start_here_multiplatform)
 	b	.	/* prevent speculative execution */
 	
 	/* This is where all platforms converge execution */
-_INIT_GLOBAL(start_here_common)
+
+start_here_common:
 	/* relocation is on at this point */
 	std	r1,PACAKSAVE(r13)
 
 	/* Load the TOC (virtual address) */
 	ld	r2,PACATOC(r13)
 
-	bl	.setup_system
-
-	/* Load up the kernel context */
-5:
-	li	r5,0
-	stb	r5,PACASOFTIRQEN(r13)	/* Soft Disabled */
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
-	mfmsr	r5
-	ori	r5,r5,MSR_EE		/* Hard Enabled on iSeries*/
-	mtmsrd	r5
-	li	r5,1
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif
-	stb	r5,PACAHARDIRQEN(r13)	/* Hard Disabled on others */
+	/* Do more system initializations in virtual mode */
+	bl	setup_system
+
+	/* Mark interrupts soft and hard disabled (they might be enabled
+	 * in the PACA when doing hotplug)
+	 */
+	li	r0,0
+	stb	r0,PACASOFTIRQEN(r13)
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
 
-	bl	.start_kernel
+	/* Generic kernel entry */
+	bl	start_kernel
 
 	/* Not reached */
 	BUG_OPCODE
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index b68cb173ba2..7ee876d2adb 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -220,7 +220,7 @@ DataAccess:
 	mfspr	r4,SPRN_DAR
 	li	r10,0x00f0
 	mtspr	SPRN_DAR,r10	/* Tag DAR, to be used in DTLB Error */
-	EXC_XFER_EE_LITE(0x300, handle_page_fault)
+	EXC_XFER_LITE(0x300, handle_page_fault)
 
 /* Instruction access exception.
  * This is "never generated" by the MPC8xx.  We jump to it for other
@@ -231,7 +231,7 @@ InstructionAccess:
 	EXCEPTION_PROLOG
 	mr	r4,r12
 	mr	r5,r9
-	EXC_XFER_EE_LITE(0x400, handle_page_fault)
+	EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* External interrupt */
 	EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
@@ -691,10 +691,6 @@ modified_instr:
 	b	151b
 #endif
 
-	.globl	giveup_fpu
-giveup_fpu:
-	blr
-
 /*
  * This is where the main kernel code starts.
  */
@@ -862,6 +858,9 @@ initial_mmu:
 	addis	r11, r11, 0x0080	/* Add 8M */
 	mtspr	SPRN_MD_RPN, r11
 
+	addi	r10, r10, 0x0100
+	mtspr	SPRN_MD_CTR, r10
+
 	addis	r8, r8, 0x0080		/* Add 8M */
 	mtspr	SPRN_MD_EPN, r8
 	mtspr	SPRN_MD_TWC, r9
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index fc921bf62e1..a620203f7de 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -2,6 +2,9 @@
 #define __HEAD_BOOKE_H__
 
 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+
 /*
  * Macros used for common Book-e exception handling
  */
@@ -28,14 +31,15 @@
  */
 #define THREAD_NORMSAVE(offset)	(THREAD_NORMSAVES + (offset * 4))
 
-#define NORMAL_EXCEPTION_PROLOG						     \
+#define NORMAL_EXCEPTION_PROLOG(intno)						     \
 	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
 	mfspr	r10, SPRN_SPRG_THREAD;					     \
 	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
 	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
 	mfcr	r13;			/* save CR in r13 for now	   */\
-	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel    */\
-	andi.	r11,r11,MSR_PR;						     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	DO_KVM	BOOKE_INTERRUPT_##intno SPRN_SRR1;			     \
+	andi.	r11, r11, MSR_PR;	/* check whether user or kernel    */\
 	mr	r11, r1;						     \
 	beq	1f;							     \
 	/* if from user, start at top of this thread's kernel stack */       \
@@ -113,7 +117,7 @@
  * registers as the normal prolog above. Instead we use a portion of the
  * critical/machine check exception stack at low physical addresses.
  */
-#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \
+#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \
 	mtspr	SPRN_SPRG_WSCRATCH_##exc_level,r8;			     \
 	BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \
 	stw	r9,GPR9(r8);		/* save various registers	   */\
@@ -121,8 +125,9 @@
 	stw	r10,GPR10(r8);						     \
 	stw	r11,GPR11(r8);						     \
 	stw	r9,_CCR(r8);		/* save CR on stack		   */\
-	mfspr	r10,exc_level_srr1;	/* check whether user or kernel    */\
-	andi.	r10,r10,MSR_PR;						     \
+	mfspr	r11,exc_level_srr1;	/* check whether user or kernel    */\
+	DO_KVM	BOOKE_INTERRUPT_##intno exc_level_srr1;		             \
+	andi.	r11,r11,MSR_PR;						     \
 	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
 	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
 	addi	r11,r11,EXC_LVL_FRAME_OVERHEAD;	/* allocate stack frame    */\
@@ -162,12 +167,30 @@
 	SAVE_4GPRS(3, r11);						     \
 	SAVE_2GPRS(7, r11)
 
-#define CRITICAL_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1)
+#define CRITICAL_EXCEPTION_PROLOG(intno) \
+		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1)
 #define DEBUG_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1)
+		EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1)
 #define MCHECK_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1)
+		EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \
+			SPRN_MCSRR0, SPRN_MCSRR1)
+
+/*
+ * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite
+ * being delivered to the host.  This exception can only happen
+ * inside a KVM guest -- so we just handle up to the DO_KVM rather
+ * than try to fit this into one of the existing prolog macros.
+ */
+#define GUEST_DOORBELL_EXCEPTION \
+	START_EXCEPTION(GuestDoorbell);					     \
+	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
+	mfspr	r10, SPRN_SPRG_THREAD;					     \
+	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
+	mfcr	r13;			/* save CR in r13 for now	   */\
+	DO_KVM	BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1;			     \
+	trap
 
 /*
  * Exception vectors.
@@ -176,21 +199,16 @@
         .align 5;              						     \
 label:
 
-#define FINISH_EXCEPTION(func)					\
-	bl	transfer_to_handler_full;			\
-	.long	func;						\
-	.long	ret_from_except_full
-
-#define EXCEPTION(n, label, hdlr, xfer)				\
+#define EXCEPTION(n, intno, label, hdlr, xfer)			\
 	START_EXCEPTION(label);					\
-	NORMAL_EXCEPTION_PROLOG;				\
+	NORMAL_EXCEPTION_PROLOG(intno);				\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
 	xfer(n, hdlr)
 
-#define CRITICAL_EXCEPTION(n, label, hdlr)			\
-	START_EXCEPTION(label);					\
-	CRITICAL_EXCEPTION_PROLOG;				\
-	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+#define CRITICAL_EXCEPTION(n, intno, label, hdlr)			\
+	START_EXCEPTION(label);						\
+	CRITICAL_EXCEPTION_PROLOG(intno);				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
 			  NOCOPY, crit_transfer_to_handler, \
 			  ret_from_crit_exc)
@@ -263,13 +281,13 @@ label:
 	andis.	r10,r10,(DBSR_IC|DBSR_BT)@h;				      \
 	beq+	2f;							      \
 									      \
-	lis	r10,KERNELBASE@h;	/* check if exception in vectors */   \
-	ori	r10,r10,KERNELBASE@l;					      \
+	lis	r10,interrupt_base@h;	/* check if exception in vectors */   \
+	ori	r10,r10,interrupt_base@l;				      \
 	cmplw	r12,r10;						      \
 	blt+	2f;			/* addr below exception vectors */    \
 									      \
-	lis	r10,DebugDebug@h;					      \
-	ori	r10,r10,DebugDebug@l;					      \
+	lis	r10,interrupt_end@h;					      \
+	ori	r10,r10,interrupt_end@l;				      \
 	cmplw	r12,r10;						      \
 	bgt+	2f;			/* addr above exception vectors */    \
 									      \
@@ -302,7 +320,7 @@ label:
 
 #define DEBUG_CRIT_EXCEPTION						      \
 	START_EXCEPTION(DebugCrit);					      \
-	CRITICAL_EXCEPTION_PROLOG;					      \
+	CRITICAL_EXCEPTION_PROLOG(DEBUG);				      \
 									      \
 	/*								      \
 	 * If there is a single step or branch-taken exception in an	      \
@@ -316,13 +334,13 @@ label:
 	andis.	r10,r10,(DBSR_IC|DBSR_BT)@h;				      \
 	beq+	2f;							      \
 									      \
-	lis	r10,KERNELBASE@h;	/* check if exception in vectors */   \
-	ori	r10,r10,KERNELBASE@l;					      \
+	lis	r10,interrupt_base@h;	/* check if exception in vectors */   \
+	ori	r10,r10,interrupt_base@l;				      \
 	cmplw	r12,r10;						      \
 	blt+	2f;			/* addr below exception vectors */    \
 									      \
-	lis	r10,DebugCrit@h;					      \
-	ori	r10,r10,DebugCrit@l;					      \
+	lis	r10,interrupt_end@h;					      \
+	ori	r10,r10,interrupt_end@l;				      \
 	cmplw	r12,r10;						      \
 	bgt+	2f;			/* addr above exception vectors */    \
 									      \
@@ -355,24 +373,24 @@ label:
 
 #define DATA_STORAGE_EXCEPTION						      \
 	START_EXCEPTION(DataStorage)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE);		      \
 	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r5,_ESR(r11);						      \
 	mfspr	r4,SPRN_DEAR;		/* Grab the DEAR */		      \
-	EXC_XFER_EE_LITE(0x0300, handle_page_fault)
+	EXC_XFER_LITE(0x0300, handle_page_fault)
 
 #define INSTRUCTION_STORAGE_EXCEPTION					      \
 	START_EXCEPTION(InstructionStorage)				      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(INST_STORAGE);		      \
 	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r5,_ESR(r11);						      \
 	mr      r4,r12;                 /* Pass SRR0 as arg2 */		      \
 	li      r5,0;                   /* Pass zero as arg3 */		      \
-	EXC_XFER_EE_LITE(0x0400, handle_page_fault)
+	EXC_XFER_LITE(0x0400, handle_page_fault)
 
 #define ALIGNMENT_EXCEPTION						      \
 	START_EXCEPTION(Alignment)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(ALIGNMENT);		      \
 	mfspr   r4,SPRN_DEAR;           /* Grab the DEAR and save it */	      \
 	stw     r4,_DEAR(r11);						      \
 	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -380,7 +398,7 @@ label:
 
 #define PROGRAM_EXCEPTION						      \
 	START_EXCEPTION(Program)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(PROGRAM);		      \
 	mfspr	r4,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r4,_ESR(r11);						      \
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -388,7 +406,7 @@ label:
 
 #define DECREMENTER_EXCEPTION						      \
 	START_EXCEPTION(Decrementer)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(DECREMENTER);		      \
 	lis     r0,TSR_DIS@h;           /* Setup the DEC interrupt mask */    \
 	mtspr   SPRN_TSR,r0;		/* Clear the DEC interrupt */	      \
 	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -396,7 +414,7 @@ label:
 
 #define FP_UNAVAILABLE_EXCEPTION					      \
 	START_EXCEPTION(FloatingPointUnavailable)			      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL);		      \
 	beq	1f;							      \
 	bl	load_up_fpu;		/* if from user, just load it up */   \
 	b	fast_exception_return;					      \
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index d5d78c4ceef..b497188a94a 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -65,29 +65,78 @@ _ENTRY(_start);
 	nop
 
 	/* Translate device tree address to physical, save in r30/r31 */
-	mfmsr	r16
-	mfspr	r17,SPRN_PID
-	rlwinm	r17,r17,16,0x3fff0000	/* turn PID into MAS6[SPID] */
-	rlwimi	r17,r16,28,0x00000001	/* turn MSR[DS] into MAS6[SAS] */
-	mtspr	SPRN_MAS6,r17
-
-	tlbsx	0,r3			/* must succeed */
-
-	mfspr	r16,SPRN_MAS1
-	mfspr	r20,SPRN_MAS3
-	rlwinm	r17,r16,25,0x1f		/* r17 = log2(page size) */
-	li	r18,1024
-	slw	r18,r18,r17		/* r18 = page size */
-	addi	r18,r18,-1
-	and	r19,r3,r18		/* r19 = page offset */
-	andc	r31,r20,r18		/* r31 = page base */
-	or	r31,r31,r19		/* r31 = devtree phys addr */
-	mfspr	r30,SPRN_MAS7
+	bl	get_phys_addr
+	mr	r30,r3
+	mr	r31,r4
 
 	li	r25,0			/* phys kernel start (low) */
 	li	r24,0			/* CPU number */
 	li	r23,0			/* phys kernel start (high) */
 
+#ifdef CONFIG_RELOCATABLE
+	LOAD_REG_ADDR_PIC(r3, _stext)	/* Get our current runtime base */
+
+	/* Translate _stext address to physical, save in r23/r25 */
+	bl	get_phys_addr
+	mr	r23,r3
+	mr	r25,r4
+
+	bl	0f
+0:	mflr	r8
+	addis	r3,r8,(is_second_reloc - 0b)@ha
+	lwz	r19,(is_second_reloc - 0b)@l(r3)
+
+	/* Check if this is the second relocation. */
+	cmpwi	r19,1
+	bne	1f
+
+	/*
+	 * For the second relocation, we already get the real memstart_addr
+	 * from device tree. So we will map PAGE_OFFSET to memstart_addr,
+	 * then the virtual address of start kernel should be:
+	 *          PAGE_OFFSET + (kernstart_addr - memstart_addr)
+	 * Since the offset between kernstart_addr and memstart_addr should
+	 * never be beyond 1G, so we can just use the lower 32bit of them
+	 * for the calculation.
+	 */
+	lis	r3,PAGE_OFFSET@h
+
+	addis	r4,r8,(kernstart_addr - 0b)@ha
+	addi	r4,r4,(kernstart_addr - 0b)@l
+	lwz	r5,4(r4)
+
+	addis	r6,r8,(memstart_addr - 0b)@ha
+	addi	r6,r6,(memstart_addr - 0b)@l
+	lwz	r7,4(r6)
+
+	subf	r5,r7,r5
+	add	r3,r3,r5
+	b	2f
+
+1:
+	/*
+	 * We have the runtime (virutal) address of our base.
+	 * We calculate our shift of offset from a 64M page.
+	 * We could map the 64M page we belong to at PAGE_OFFSET and
+	 * get going from there.
+	 */
+	lis	r4,KERNELBASE@h
+	ori	r4,r4,KERNELBASE@l
+	rlwinm	r6,r25,0,0x3ffffff		/* r6 = PHYS_START % 64M */
+	rlwinm	r5,r4,0,0x3ffffff		/* r5 = KERNELBASE % 64M */
+	subf	r3,r5,r6			/* r3 = r6 - r5 */
+	add	r3,r4,r3			/* Required Virtual Address */
+
+2:	bl	relocate
+
+	/*
+	 * For the second relocation, we already set the right tlb entries
+	 * for the kernel space, so skip the code in fsl_booke_entry_mapping.S
+	*/
+	cmpwi	r19,1
+	beq	set_ivor
+#endif
+
 /* We try to not make any assumptions about how the boot loader
  * setup or used the TLBs.  We invalidate all mappings from the
  * boot loader and load a single entry in TLB1[0] to map the
@@ -113,6 +162,7 @@ _ENTRY(__early_start)
 #include "fsl_booke_entry_mapping.S"
 #undef ENTRY_MAPPING_BOOT_SETUP
 
+set_ivor:
 	/* Establish the interrupt vector offsets */
 	SET_IVOR(0,  CriticalInput);
 	SET_IVOR(1,  MachineCheck);
@@ -166,8 +216,7 @@ _ENTRY(__early_start)
 	/* Check to see if we're the second processor, and jump
 	 * to the secondary_start code if so
 	 */
-	lis	r24, boot_cpuid@h
-	ori	r24, r24, boot_cpuid@l
+	LOAD_REG_ADDR_PIC(r24, boot_cpuid)
 	lwz	r24, 0(r24)
 	cmpwi	r24, -1
 	mfspr   r24,SPRN_PIR
@@ -192,11 +241,23 @@ _ENTRY(__early_start)
 	li	r0,0
 	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
 
-	rlwinm  r22,r1,0,0,31-THREAD_SHIFT      /* current thread_info */
+	CURRENT_THREAD_INFO(r22, r1)
 	stw	r24, TI_CPU(r22)
 
 	bl	early_init
 
+#ifdef CONFIG_RELOCATABLE
+	mr	r3,r30
+	mr	r4,r31
+#ifdef CONFIG_PHYS_64BIT
+	mr	r5,r23
+	mr	r6,r25
+#else
+	mr	r5,r25
+#endif
+	bl	relocate_init
+#endif
+
 #ifdef CONFIG_DYNAMIC_MEMSTART
 	lis	r3,kernstart_addr@ha
 	la	r3,kernstart_addr@l(r3)
@@ -301,25 +362,26 @@ _ENTRY(__early_start)
 
 interrupt_base:
 	/* Critical Input Interrupt */
-	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
 
 	/* Machine Check Interrupt */
 #ifdef CONFIG_E200
 	/* no RFMCI, MCSRRs on E200 */
-	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
 #else
 	MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
 #endif
 
 	/* Data Storage Interrupt */
 	START_EXCEPTION(DataStorage)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
 	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
 	stw	r5,_ESR(r11)
 	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
 	andis.	r10,r5,(ESR_ILK|ESR_DLK)@h
 	bne	1f
-	EXC_XFER_EE_LITE(0x0300, handle_page_fault)
+	EXC_XFER_LITE(0x0300, handle_page_fault)
 1:
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	EXC_XFER_EE_LITE(0x0300, CacheLockingException)
@@ -328,7 +390,7 @@ interrupt_base:
 	INSTRUCTION_STORAGE_EXCEPTION
 
 	/* External Input Interrupt */
-	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+	EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE)
 
 	/* Alignment Interrupt */
 	ALIGNMENT_EXCEPTION
@@ -342,32 +404,36 @@ interrupt_base:
 #else
 #ifdef CONFIG_E200
 	/* E200 treats 'normal' floating point instructions as FP Unavail exception */
-	EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE)
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  program_check_exception, EXC_XFER_EE)
 #else
-	EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 #endif
 #endif
 
 	/* System Call Interrupt */
 	START_EXCEPTION(SystemCall)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(SYSCALL)
 	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
 
 	/* Auxiliary Processor Unavailable Interrupt */
-	EXCEPTION(0x2900, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Decrementer Interrupt */
 	DECREMENTER_EXCEPTION
 
 	/* Fixed Internal Timer Interrupt */
 	/* TODO: Add FIT support */
-	EXCEPTION(0x3100, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x3100, FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Watchdog Timer Interrupt */
 #ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x3200, WatchdogTimer, WatchdogException)
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException)
 #else
-	CRITICAL_EXCEPTION(0x3200, WatchdogTimer, unknown_exception)
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception)
 #endif
 
 	/* Data TLB Error Interrupt */
@@ -375,10 +441,16 @@ interrupt_base:
 	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
 	mfspr	r10, SPRN_SPRG_THREAD
 	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
 	stw	r12, THREAD_NORMSAVE(1)(r10)
 	stw	r13, THREAD_NORMSAVE(2)(r10)
 	mfcr	r13
 	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1
 	mfspr	r10, SPRN_DEAR		/* Get faulting address */
 
 	/* If we are faulting a kernel address, we have to use the
@@ -463,10 +535,16 @@ interrupt_base:
 	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
 	mfspr	r10, SPRN_SPRG_THREAD
 	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
 	stw	r12, THREAD_NORMSAVE(1)(r10)
 	stw	r13, THREAD_NORMSAVE(2)(r10)
 	mfcr	r13
 	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1
 	mfspr	r10, SPRN_SRR0		/* Get faulting address */
 
 	/* If we are faulting a kernel address, we have to use the
@@ -538,36 +616,58 @@ interrupt_base:
 #ifdef CONFIG_SPE
 	/* SPE Unavailable */
 	START_EXCEPTION(SPEUnavailable)
-	NORMAL_EXCEPTION_PROLOG
-	bne	load_up_spe
-	addi	r3,r1,STACK_FRAME_OVERHEAD
+	NORMAL_EXCEPTION_PROLOG(SPE_ALTIVEC_UNAVAIL)
+	beq	1f
+	bl	load_up_spe
+	b	fast_exception_return
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	EXC_XFER_EE_LITE(0x2010, KernelSPE)
 #else
-	EXCEPTION(0x2020, SPEUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2020, SPE_ALTIVEC_UNAVAIL, SPEUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 #endif /* CONFIG_SPE */
 
 	/* SPE Floating Point Data */
 #ifdef CONFIG_SPE
-	EXCEPTION(0x2030, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE);
+	EXCEPTION(0x2030, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData,
+		  SPEFloatingPointException, EXC_XFER_EE)
 
 	/* SPE Floating Point Round */
-	EXCEPTION(0x2050, SPEFloatingPointRound, SPEFloatingPointRoundException, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  SPEFloatingPointRoundException, EXC_XFER_EE)
 #else
-	EXCEPTION(0x2040, SPEFloatingPointData, unknown_exception, EXC_XFER_EE)
-	EXCEPTION(0x2050, SPEFloatingPointRound, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2040, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData,
+		  unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  unknown_exception, EXC_XFER_EE)
 #endif /* CONFIG_SPE */
 
 	/* Performance Monitor */
-	EXCEPTION(0x2060, PerformanceMonitor, performance_monitor_exception, EXC_XFER_STD)
+	EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \
+		  performance_monitor_exception, EXC_XFER_STD)
 
-	EXCEPTION(0x2070, Doorbell, doorbell_exception, EXC_XFER_STD)
+	EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD)
 
-	CRITICAL_EXCEPTION(0x2080, CriticalDoorbell, unknown_exception)
+	CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \
+			   CriticalDoorbell, unknown_exception)
 
 	/* Debug Interrupt */
 	DEBUG_DEBUG_EXCEPTION
 	DEBUG_CRIT_EXCEPTION
 
+	GUEST_DOORBELL_EXCEPTION
+
+	CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \
+			   unknown_exception)
+
+	/* Hypercall */
+	EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE)
+
+	/* Embedded Hypervisor Privilege */
+	EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE)
+
+interrupt_end:
+
 /*
  * Local functions
  */
@@ -743,7 +843,7 @@ tlb_write_entry:
 /* Note that the SPE support is closely modeled after the AltiVec
  * support.  Changes to one are likely to be applicable to the
  * other!  */
-load_up_spe:
+_GLOBAL(load_up_spe)
 /*
  * Disable SPE for the task which had SPE previously,
  * and save its SPE registers in its thread_struct.
@@ -791,20 +891,7 @@ load_up_spe:
 	subi	r4,r5,THREAD
 	stw	r4,last_task_used_spe@l(r3)
 #endif /* !CONFIG_SMP */
-	/* restore registers and return */
-2:	REST_4GPRS(3, r11)
-	lwz	r10,_CCR(r11)
-	REST_GPR(1, r11)
-	mtcr	r10
-	lwz	r10,_LINK(r11)
-	mtlr	r10
-	REST_GPR(10, r11)
-	mtspr	SPRN_SRR1,r9
-	mtspr	SPRN_SRR0,r12
-	REST_GPR(9, r11)
-	REST_GPR(12, r11)
-	lwz	r11,GPR11(r11)
-	rfi
+	blr
 
 /*
  * SPE unavailable trap from kernel - print a message, but let
@@ -830,6 +917,33 @@ KernelSPE:
 #endif /* CONFIG_SPE */
 
 /*
+ * Translate the effec addr in r3 to phys addr. The phys addr will be put
+ * into r3(higher 32bit) and r4(lower 32bit)
+ */
+get_phys_addr:
+	mfmsr	r8
+	mfspr	r9,SPRN_PID
+	rlwinm	r9,r9,16,0x3fff0000	/* turn PID into MAS6[SPID] */
+	rlwimi	r9,r8,28,0x00000001	/* turn MSR[DS] into MAS6[SAS] */
+	mtspr	SPRN_MAS6,r9
+
+	tlbsx	0,r3			/* must succeed */
+
+	mfspr	r8,SPRN_MAS1
+	mfspr	r12,SPRN_MAS3
+	rlwinm	r9,r8,25,0x1f		/* r9 = log2(page size) */
+	li	r10,1024
+	slw	r10,r10,r9		/* r10 = page size */
+	addi	r10,r10,-1
+	and	r11,r3,r10		/* r11 = page offset */
+	andc	r4,r12,r10		/* r4 = page base */
+	or	r4,r4,r11		/* r4 = devtree phys addr */
+#ifdef CONFIG_PHYS_64BIT
+	mfspr	r3,SPRN_MAS7
+#endif
+	blr
+
+/*
  * Global functions
  */
 
@@ -874,12 +988,17 @@ _GLOBAL(__setup_e500mc_ivors)
 	sync
 	blr
 
-/*
- * extern void giveup_altivec(struct task_struct *prev)
- *
- * The e500 core does not have an AltiVec unit.
- */
-_GLOBAL(giveup_altivec)
+/* setup ehv ivors for */
+_GLOBAL(__setup_ehv_ivors)
+	li	r3,GuestDoorbell@l
+	mtspr	SPRN_IVOR38,r3
+	li	r3,CriticalGuestDoorbell@l
+	mtspr	SPRN_IVOR39,r3
+	li	r3,Hypercall@l
+	mtspr	SPRN_IVOR40,r3
+	li	r3,Ehvpriv@l
+	mtspr	SPRN_IVOR41,r3
+	sync
 	blr
 
 #ifdef CONFIG_SPE
@@ -917,16 +1036,6 @@ _GLOBAL(giveup_spe)
 #endif /* CONFIG_SPE */
 
 /*
- * extern void giveup_fpu(struct task_struct *prev)
- *
- * Not all FSL Book-E cores have an FPU
- */
-#ifndef CONFIG_PPC_FPU
-_GLOBAL(giveup_fpu)
-	blr
-#endif
-
-/*
  * extern void abort(void)
  *
  * At present, this routine just applies a system reset.
@@ -1004,28 +1113,68 @@ _GLOBAL(flush_dcache_L1)
 
 	blr
 
+/* Flush L1 d-cache, invalidate and disable d-cache and i-cache */
+_GLOBAL(__flush_disable_L1)
+	mflr	r10
+	bl	flush_dcache_L1	/* Flush L1 d-cache */
+	mtlr	r10
+
+	mfspr	r4, SPRN_L1CSR0	/* Invalidate and disable d-cache */
+	li	r5, 2
+	rlwimi	r4, r5, 0, 3
+
+	msync
+	isync
+	mtspr	SPRN_L1CSR0, r4
+	isync
+
+1:	mfspr	r4, SPRN_L1CSR0	/* Wait for the invalidate to finish */
+	andi.	r4, r4, 2
+	bne	1b
+
+	mfspr	r4, SPRN_L1CSR1	/* Invalidate and disable i-cache */
+	li	r5, 2
+	rlwimi	r4, r5, 0, 3
+
+	mtspr	SPRN_L1CSR1, r4
+	isync
+
+	blr
+
 #ifdef CONFIG_SMP
 /* When we get here, r24 needs to hold the CPU # */
 	.globl __secondary_start
 __secondary_start:
-	lis	r3,__secondary_hold_acknowledge@h
-	ori	r3,r3,__secondary_hold_acknowledge@l
-	stw	r24,0(r3)
-
-	li	r3,0
-	mr	r4,r24		/* Why? */
-	bl	call_setup_cpu
-
-	lis	r3,tlbcam_index@ha
-	lwz	r3,tlbcam_index@l(r3)
+	LOAD_REG_ADDR_PIC(r3, tlbcam_index)
+	lwz	r3,0(r3)
 	mtctr	r3
 	li	r26,0		/* r26 safe? */
 
+	bl	switch_to_as1
+	mr	r27,r3		/* tlb entry */
 	/* Load each CAM entry */
 1:	mr	r3,r26
 	bl	loadcam_entry
 	addi	r26,r26,1
 	bdnz	1b
+	mr	r3,r27		/* tlb entry */
+	LOAD_REG_ADDR_PIC(r4, memstart_addr)
+	lwz	r4,0(r4)
+	mr	r5,r25		/* phys kernel start */
+	rlwinm	r5,r5,0,~0x3ffffff	/* aligned 64M */
+	subf	r4,r5,r4	/* memstart_addr - phys kernel start */
+	li	r5,0		/* no device tree */
+	li	r6,0		/* not boot cpu */
+	bl	restore_to_as0
+
+
+	lis	r3,__secondary_hold_acknowledge@h
+	ori	r3,r3,__secondary_hold_acknowledge@l
+	stw	r24,0(r3)
+
+	li	r3,0
+	mr	r4,r24		/* Why? */
+	bl	call_setup_cpu
 
 	/* get current_thread_info and current */
 	lis	r1,secondary_ti@ha
@@ -1062,6 +1211,112 @@ __secondary_hold_acknowledge:
 #endif
 
 /*
+ * Create a tlb entry with the same effective and physical address as
+ * the tlb entry used by the current running code. But set the TS to 1.
+ * Then switch to the address space 1. It will return with the r3 set to
+ * the ESEL of the new created tlb.
+ */
+_GLOBAL(switch_to_as1)
+	mflr	r5
+
+	/* Find a entry not used */
+	mfspr	r3,SPRN_TLB1CFG
+	andi.	r3,r3,0xfff
+	mfspr	r4,SPRN_PID
+	rlwinm	r4,r4,16,0x3fff0000	/* turn PID into MAS6[SPID] */
+	mtspr	SPRN_MAS6,r4
+1:	lis	r4,0x1000		/* Set MAS0(TLBSEL) = 1 */
+	addi	r3,r3,-1
+	rlwimi	r4,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r4
+	tlbre
+	mfspr	r4,SPRN_MAS1
+	andis.	r4,r4,MAS1_VALID@h
+	bne	1b
+
+	/* Get the tlb entry used by the current running code */
+	bl	0f
+0:	mflr	r4
+	tlbsx	0,r4
+
+	mfspr	r4,SPRN_MAS1
+	ori	r4,r4,MAS1_TS		/* Set the TS = 1 */
+	mtspr	SPRN_MAS1,r4
+
+	mfspr	r4,SPRN_MAS0
+	rlwinm	r4,r4,0,~MAS0_ESEL_MASK
+	rlwimi	r4,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r4
+	tlbwe
+	isync
+	sync
+
+	mfmsr	r4
+	ori	r4,r4,MSR_IS | MSR_DS
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r4
+	sync
+	rfi
+
+/*
+ * Restore to the address space 0 and also invalidate the tlb entry created
+ * by switch_to_as1.
+ * r3 - the tlb entry which should be invalidated
+ * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0)
+ * r5 - device tree virtual address. If r4 is 0, r5 is ignored.
+ * r6 - boot cpu
+*/
+_GLOBAL(restore_to_as0)
+	mflr	r0
+
+	bl	0f
+0:	mflr	r9
+	addi	r9,r9,1f - 0b
+
+	/*
+	 * We may map the PAGE_OFFSET in AS0 to a different physical address,
+	 * so we need calculate the right jump and device tree address based
+	 * on the offset passed by r4.
+	 */
+	add	r9,r9,r4
+	add	r5,r5,r4
+	add	r0,r0,r4
+
+2:	mfmsr	r7
+	li	r8,(MSR_IS | MSR_DS)
+	andc	r7,r7,r8
+
+	mtspr	SPRN_SRR0,r9
+	mtspr	SPRN_SRR1,r7
+	sync
+	rfi
+
+	/* Invalidate the temporary tlb entry for AS1 */
+1:	lis	r9,0x1000		/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r9,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r9
+	tlbre
+	mfspr	r9,SPRN_MAS1
+	rlwinm	r9,r9,0,2,31		/* Clear MAS1 Valid and IPPROT */
+	mtspr	SPRN_MAS1,r9
+	tlbwe
+	isync
+
+	cmpwi	r4,0
+	cmpwi	cr1,r6,0
+	cror	eq,4*cr1+eq,eq
+	bne	3f			/* offset != 0 && is_boot_cpu */
+	mtlr	r0
+	blr
+
+	/*
+	 * The PAGE_OFFSET will map to a different physical address,
+	 * jump to _start to do another relocation again.
+	*/
+3:	mr	r3,r5
+	bl	_start
+
+/*
  * We put a few things here that have to be page-aligned. This stuff
  * goes at the beginning of the data segment, which is page-aligned.
  */
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 2bc0584be81..0bb5918faaa 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -28,7 +28,6 @@
 #include <linux/percpu.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/hw_breakpoint.h>
@@ -73,7 +72,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 	 * If so, DABR will be populated in single_step_dabr_instruction().
 	 */
 	if (current->thread.last_hit_ubp != bp)
-		set_dabr(info->address | info->type | DABR_TRANSLATION);
+		__set_breakpoint(info);
 
 	return 0;
 }
@@ -97,7 +96,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 	}
 
 	*slot = NULL;
-	set_dabr(0);
+	hw_breakpoint_disable();
 }
 
 /*
@@ -111,7 +110,7 @@ void arch_unregister_hw_breakpoint(struct perf_event *bp)
 	 * and the single_step_dabr_instruction(), then cleanup the breakpoint
 	 * restoration variables to prevent dangling pointers.
 	 */
-	if (bp->ctx->task)
+	if (bp->ctx && bp->ctx->task)
 		bp->ctx->task->thread.last_hit_ubp = NULL;
 }
 
@@ -127,19 +126,13 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
 
 int arch_bp_generic_fields(int type, int *gen_bp_type)
 {
-	switch (type) {
-	case DABR_DATA_READ:
-		*gen_bp_type = HW_BREAKPOINT_R;
-		break;
-	case DABR_DATA_WRITE:
-		*gen_bp_type = HW_BREAKPOINT_W;
-		break;
-	case (DABR_DATA_WRITE | DABR_DATA_READ):
-		*gen_bp_type = (HW_BREAKPOINT_W | HW_BREAKPOINT_R);
-		break;
-	default:
+	*gen_bp_type = 0;
+	if (type & HW_BRK_TYPE_READ)
+		*gen_bp_type |= HW_BREAKPOINT_R;
+	if (type & HW_BRK_TYPE_WRITE)
+		*gen_bp_type |= HW_BREAKPOINT_W;
+	if (*gen_bp_type == 0)
 		return -EINVAL;
-	}
 	return 0;
 }
 
@@ -148,26 +141,26 @@ int arch_bp_generic_fields(int type, int *gen_bp_type)
  */
 int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
-	int ret = -EINVAL;
+	int ret = -EINVAL, length_max;
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
 	if (!bp)
 		return ret;
 
-	switch (bp->attr.bp_type) {
-	case HW_BREAKPOINT_R:
-		info->type = DABR_DATA_READ;
-		break;
-	case HW_BREAKPOINT_W:
-		info->type = DABR_DATA_WRITE;
-		break;
-	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
-		info->type = (DABR_DATA_READ | DABR_DATA_WRITE);
-		break;
-	default:
+	info->type = HW_BRK_TYPE_TRANSLATE;
+	if (bp->attr.bp_type & HW_BREAKPOINT_R)
+		info->type |= HW_BRK_TYPE_READ;
+	if (bp->attr.bp_type & HW_BREAKPOINT_W)
+		info->type |= HW_BRK_TYPE_WRITE;
+	if (info->type == HW_BRK_TYPE_TRANSLATE)
+		/* must set alteast read or write */
 		return ret;
-	}
-
+	if (!(bp->attr.exclude_user))
+		info->type |= HW_BRK_TYPE_USER;
+	if (!(bp->attr.exclude_kernel))
+		info->type |= HW_BRK_TYPE_KERNEL;
+	if (!(bp->attr.exclude_hv))
+		info->type |= HW_BRK_TYPE_HYP;
 	info->address = bp->attr.bp_addr;
 	info->len = bp->attr.bp_len;
 
@@ -177,8 +170,16 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	 * HW_BREAKPOINT_ALIGN by rounding off to the lower address, the
 	 * 'symbolsize' should satisfy the check below.
 	 */
+	length_max = 8; /* DABR */
+	if (cpu_has_feature(CPU_FTR_DAWR)) {
+		length_max = 512 ; /* 64 doublewords */
+		/* DAWR region can't cross 512 boundary */
+		if ((bp->attr.bp_addr >> 10) != 
+		    ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10))
+			return -EINVAL;
+	}
 	if (info->len >
-	    (HW_BREAKPOINT_LEN - (info->address & HW_BREAKPOINT_ALIGN)))
+	    (length_max - (info->address & HW_BREAKPOINT_ALIGN)))
 		return -EINVAL;
 	return 0;
 }
@@ -197,7 +198,7 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
 
 	info = counter_arch_bp(tsk->thread.last_hit_ubp);
 	regs->msr &= ~MSR_SE;
-	set_dabr(info->address | info->type | DABR_TRANSLATION);
+	__set_breakpoint(info);
 	tsk->thread.last_hit_ubp = NULL;
 }
 
@@ -215,7 +216,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	unsigned long dar = regs->dar;
 
 	/* Disable breakpoints during exception handling */
-	set_dabr(0);
+	hw_breakpoint_disable();
 
 	/*
 	 * The counter may be concurrently released but that can only
@@ -248,12 +249,14 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	 * we still need to single-step the instruction, but we don't
 	 * generate an event.
 	 */
-	info->extraneous_interrupt = !((bp->attr.bp_addr <= dar) &&
-			(dar - bp->attr.bp_addr < bp->attr.bp_len));
+	info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
+	if (!((bp->attr.bp_addr <= dar) &&
+	      (dar - bp->attr.bp_addr < bp->attr.bp_len)))
+		info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
 
 	/* Do not emulate user-space instructions, instead single-step them */
 	if (user_mode(regs)) {
-		bp->ctx->task->thread.last_hit_ubp = bp;
+		current->thread.last_hit_ubp = bp;
 		regs->msr |= MSR_SE;
 		goto out;
 	}
@@ -278,10 +281,10 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	 * As a policy, the callback is invoked in a 'trigger-after-execute'
 	 * fashion
 	 */
-	if (!info->extraneous_interrupt)
+	if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
 		perf_bp_event(bp, regs);
 
-	set_dabr(info->address | info->type | DABR_TRANSLATION);
+	__set_breakpoint(info);
 out:
 	rcu_read_unlock();
 	return rc;
@@ -294,7 +297,7 @@ int __kprobes single_step_dabr_instruction(struct die_args *args)
 {
 	struct pt_regs *regs = args->regs;
 	struct perf_event *bp = NULL;
-	struct arch_hw_breakpoint *bp_info;
+	struct arch_hw_breakpoint *info;
 
 	bp = current->thread.last_hit_ubp;
 	/*
@@ -304,16 +307,16 @@ int __kprobes single_step_dabr_instruction(struct die_args *args)
 	if (!bp)
 		return NOTIFY_DONE;
 
-	bp_info = counter_arch_bp(bp);
+	info = counter_arch_bp(bp);
 
 	/*
 	 * We shall invoke the user-defined callback function in the single
 	 * stepping handler to confirm to 'trigger-after-execute' semantics
 	 */
-	if (!bp_info->extraneous_interrupt)
+	if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
 		perf_bp_event(bp, regs);
 
-	set_dabr(bp_info->address | bp_info->type | DABR_TRANSLATION);
+	__set_breakpoint(info);
 	current->thread.last_hit_ubp = NULL;
 
 	/*
diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c
index d39ae606ff8..1114d13ac19 100644
--- a/arch/powerpc/kernel/ibmebus.c
+++ b/arch/powerpc/kernel/ibmebus.c
@@ -47,7 +47,6 @@
 #include <linux/stat.h>
 #include <linux/of_platform.h>
 #include <asm/ibmebus.h>
-#include <asm/abs_addr.h>
 
 static struct device ibmebus_bus_device = { /* fake "parent" device */
 	.init_name = "ibmebus",
@@ -65,7 +64,8 @@ static struct of_device_id __initdata ibmebus_matches[] = {
 static void *ibmebus_alloc_coherent(struct device *dev,
 				    size_t size,
 				    dma_addr_t *dma_handle,
-				    gfp_t flag)
+				    gfp_t flag,
+				    struct dma_attrs *attrs)
 {
 	void *mem;
 
@@ -77,7 +77,8 @@ static void *ibmebus_alloc_coherent(struct device *dev,
 
 static void ibmebus_free_coherent(struct device *dev,
 				  size_t size, void *vaddr,
-				  dma_addr_t dma_handle)
+				  dma_addr_t dma_handle,
+				  struct dma_attrs *attrs)
 {
 	kfree(vaddr);
 }
@@ -136,8 +137,8 @@ static u64 ibmebus_dma_get_required_mask(struct device *dev)
 }
 
 static struct dma_map_ops ibmebus_dma_ops = {
-	.alloc_coherent     = ibmebus_alloc_coherent,
-	.free_coherent      = ibmebus_free_coherent,
+	.alloc              = ibmebus_alloc_coherent,
+	.free               = ibmebus_free_coherent,
 	.map_sg             = ibmebus_map_sg,
 	.unmap_sg           = ibmebus_unmap_sg,
 	.dma_supported      = ibmebus_dma_supported,
@@ -204,7 +205,7 @@ static int ibmebus_create_devices(const struct of_device_id *matches)
 	return ret;
 }
 
-int ibmebus_register_driver(struct of_platform_driver *drv)
+int ibmebus_register_driver(struct platform_driver *drv)
 {
 	/* If the driver uses devices that ibmebus doesn't know, add them */
 	ibmebus_create_devices(drv->driver.of_match_table);
@@ -214,7 +215,7 @@ int ibmebus_register_driver(struct of_platform_driver *drv)
 }
 EXPORT_SYMBOL(ibmebus_register_driver);
 
-void ibmebus_unregister_driver(struct of_platform_driver *drv)
+void ibmebus_unregister_driver(struct platform_driver *drv)
 {
 	driver_unregister(&drv->driver);
 }
@@ -291,6 +292,7 @@ out:
 		return rc;
 	return count;
 }
+static BUS_ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe);
 
 static ssize_t ibmebus_store_remove(struct bus_type *bus,
 				    const char *buf, size_t count)
@@ -316,13 +318,14 @@ static ssize_t ibmebus_store_remove(struct bus_type *bus,
 		return -ENODEV;
 	}
 }
+static BUS_ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove);
 
-
-static struct bus_attribute ibmebus_bus_attrs[] = {
-	__ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe),
-	__ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove),
-	__ATTR_NULL
+static struct attribute *ibmbus_bus_attrs[] = {
+	&bus_attr_probe.attr,
+	&bus_attr_remove.attr,
+	NULL,
 };
+ATTRIBUTE_GROUPS(ibmbus_bus);
 
 static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv)
 {
@@ -337,11 +340,10 @@ static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv)
 static int ibmebus_bus_device_probe(struct device *dev)
 {
 	int error = -ENODEV;
-	struct of_platform_driver *drv;
+	struct platform_driver *drv;
 	struct platform_device *of_dev;
-	const struct of_device_id *match;
 
-	drv = to_of_platform_driver(dev->driver);
+	drv = to_platform_driver(dev->driver);
 	of_dev = to_platform_device(dev);
 
 	if (!drv->probe)
@@ -349,9 +351,8 @@ static int ibmebus_bus_device_probe(struct device *dev)
 
 	of_dev_get(of_dev);
 
-	match = of_match_device(drv->driver.of_match_table, dev);
-	if (match)
-		error = drv->probe(of_dev, match);
+	if (of_driver_match_device(dev, dev->driver))
+		error = drv->probe(of_dev);
 	if (error)
 		of_dev_put(of_dev);
 
@@ -361,7 +362,7 @@ static int ibmebus_bus_device_probe(struct device *dev)
 static int ibmebus_bus_device_remove(struct device *dev)
 {
 	struct platform_device *of_dev = to_platform_device(dev);
-	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+	struct platform_driver *drv = to_platform_driver(dev->driver);
 
 	if (dev->driver && drv->remove)
 		drv->remove(of_dev);
@@ -371,7 +372,7 @@ static int ibmebus_bus_device_remove(struct device *dev)
 static void ibmebus_bus_device_shutdown(struct device *dev)
 {
 	struct platform_device *of_dev = to_platform_device(dev);
-	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+	struct platform_driver *drv = to_platform_driver(dev->driver);
 
 	if (dev->driver && drv->shutdown)
 		drv->shutdown(of_dev);
@@ -418,7 +419,7 @@ struct device_attribute ibmebus_bus_device_attrs[] = {
 static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg)
 {
 	struct platform_device *of_dev = to_platform_device(dev);
-	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+	struct platform_driver *drv = to_platform_driver(dev->driver);
 	int ret = 0;
 
 	if (dev->driver && drv->suspend)
@@ -429,7 +430,7 @@ static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg)
 static int ibmebus_bus_legacy_resume(struct device *dev)
 {
 	struct platform_device *of_dev = to_platform_device(dev);
-	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+	struct platform_driver *drv = to_platform_driver(dev->driver);
 	int ret = 0;
 
 	if (dev->driver && drv->resume)
@@ -713,8 +714,8 @@ static struct dev_pm_ops ibmebus_bus_dev_pm_ops = {
 
 struct bus_type ibmebus_bus_type = {
 	.name      = "ibmebus",
-	.uevent    = of_device_uevent,
-	.bus_attrs = ibmebus_bus_attrs,
+	.uevent    = of_device_uevent_modalias,
+	.bus_groups = ibmbus_bus_groups,
 	.match     = ibmebus_bus_bus_match,
 	.probe     = ibmebus_bus_device_probe,
 	.remove    = ibmebus_bus_device_remove,
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 0a48bf5db6c..d7216c9abda 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -26,18 +26,13 @@
 #include <linux/sysctl.h>
 #include <linux/tick.h>
 
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/cputable.h>
 #include <asm/time.h>
 #include <asm/machdep.h>
+#include <asm/runlatch.h>
 #include <asm/smp.h>
 
-#ifdef CONFIG_HOTPLUG_CPU
-#define cpu_should_die()	cpu_is_offline(smp_processor_id())
-#else
-#define cpu_should_die()	0
-#endif
 
 unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(cpuidle_disable);
@@ -50,87 +45,39 @@ static int __init powersave_off(char *arg)
 }
 __setup("powersave=off", powersave_off);
 
-/*
- * The body of the idle task.
- */
-void cpu_idle(void)
+#ifdef CONFIG_HOTPLUG_CPU
+void arch_cpu_idle_dead(void)
 {
-	if (ppc_md.idle_loop)
-		ppc_md.idle_loop();	/* doesn't return */
-
-	set_thread_flag(TIF_POLLING_NRFLAG);
-	while (1) {
-		tick_nohz_idle_enter();
-		rcu_idle_enter();
-
-		while (!need_resched() && !cpu_should_die()) {
-			ppc64_runlatch_off();
-
-			if (ppc_md.power_save) {
-				clear_thread_flag(TIF_POLLING_NRFLAG);
-				/*
-				 * smp_mb is so clearing of TIF_POLLING_NRFLAG
-				 * is ordered w.r.t. need_resched() test.
-				 */
-				smp_mb();
-				local_irq_disable();
-
-				/* Don't trace irqs off for idle */
-				stop_critical_timings();
-
-				/* check again after disabling irqs */
-				if (!need_resched() && !cpu_should_die())
-					ppc_md.power_save();
-
-				start_critical_timings();
-
-				local_irq_enable();
-				set_thread_flag(TIF_POLLING_NRFLAG);
-
-			} else {
-				/*
-				 * Go into low thread priority and possibly
-				 * low power mode.
-				 */
-				HMT_low();
-				HMT_very_low();
-			}
-		}
-
-		HMT_medium();
-		ppc64_runlatch_on();
-		rcu_idle_exit();
-		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		if (cpu_should_die())
-			cpu_die();
-		schedule();
-		preempt_disable();
-	}
+	sched_preempt_enable_no_resched();
+	cpu_die();
 }
+#endif
 
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
- * idle loop and start using the new idle loop.
- * Required while changing idle handler on SMP systems.
- * Caller must have changed idle handler to the new value before the call.
- * This window may be larger on shared systems.
- */
-void cpu_idle_wait(void)
+void arch_cpu_idle(void)
 {
-	int cpu;
-	smp_mb();
-
-	/* kick all the CPUs so that they exit out of old idle routine */
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		if (cpu != smp_processor_id())
-			smp_send_reschedule(cpu);
+	ppc64_runlatch_off();
+
+	if (ppc_md.power_save) {
+		ppc_md.power_save();
+		/*
+		 * Some power_save functions return with
+		 * interrupts enabled, some don't.
+		 */
+		if (irqs_disabled())
+			local_irq_enable();
+	} else {
+		local_irq_enable();
+		/*
+		 * Go into low thread priority and possibly
+		 * low power mode.
+		 */
+		HMT_low();
+		HMT_very_low();
 	}
-	put_online_cpus();
+
+	HMT_medium();
+	ppc64_runlatch_on();
 }
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
 int powersave_nap;
 
@@ -138,7 +85,7 @@ int powersave_nap;
 /*
  * Register the sysctl to set/clear powersave_nap.
  */
-static ctl_table powersave_nap_ctl_table[]={
+static struct ctl_table powersave_nap_ctl_table[] = {
 	{
 		.procname	= "powersave-nap",
 		.data		= &powersave_nap,
@@ -148,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={
 	},
 	{}
 };
-static ctl_table powersave_nap_sysctl_root[] = {
+static struct ctl_table powersave_nap_sysctl_root[] = {
 	{
 		.procname	= "kernel",
 		.mode		= 0555,
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index 15c611de1ee..1686916cc7f 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -135,7 +135,7 @@ BEGIN_FTR_SECTION
 	DSSALL
 	sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-	rlwinm	r9,r1,0,0,31-THREAD_SHIFT	/* current thread_info */
+	CURRENT_THREAD_INFO(r9, r1)
 	lwz	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
 	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
 	stw	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
@@ -158,7 +158,7 @@ _GLOBAL(power_save_ppc32_restore)
 	stw	r9,_NIP(r11)		/* make it do a blr */
 
 #ifdef CONFIG_SMP
-	rlwinm	r12,r11,0,0,31-THREAD_SHIFT
+	CURRENT_THREAD_INFO(r12, r11)
 	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
 	slwi	r11,r11,2
 #else
diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S
index 16c002d6bdf..48c21acef91 100644
--- a/arch/powerpc/kernel/idle_book3e.S
+++ b/arch/powerpc/kernel/idle_book3e.S
@@ -16,11 +16,13 @@
 #include <asm/ppc-opcode.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
+#include <asm/epapr_hcalls.h>
 
 /* 64-bit version only for now */
 #ifdef CONFIG_PPC64
 
-_GLOBAL(book3e_idle)
+.macro BOOK3E_IDLE name loop
+_GLOBAL(\name)
 	/* Save LR for later */
 	mflr	r0
 	std	r0,16(r1)
@@ -29,43 +31,30 @@ _GLOBAL(book3e_idle)
 	wrteei	0
 
 	/* Now check if an interrupt came in while we were soft disabled
-	 * since we may otherwise lose it (doorbells etc...). We know
-	 * that since PACAHARDIRQEN will have been cleared in that case.
+	 * since we may otherwise lose it (doorbells etc...).
 	 */
-	lbz	r3,PACAHARDIRQEN(r13)
+	lbz	r3,PACAIRQHAPPENED(r13)
 	cmpwi	cr0,r3,0
-	beqlr
+	bnelr
 
-	/* Now we are going to mark ourselves as soft and hard enables in
+	/* Now we are going to mark ourselves as soft and hard enabled in
 	 * order to be able to take interrupts while asleep. We inform lockdep
 	 * of that. We don't actually turn interrupts on just yet tho.
 	 */
 #ifdef CONFIG_TRACE_IRQFLAGS
 	stdu    r1,-128(r1)
-	bl	.trace_hardirqs_on
+	bl	trace_hardirqs_on
+	addi    r1,r1,128
 #endif
 	li	r0,1
 	stb	r0,PACASOFTIRQEN(r13)
-	stb	r0,PACAHARDIRQEN(r13)
 	
 	/* Interrupts will make use return to LR, so get something we want
 	 * in there
 	 */
 	bl	1f
 
-	/* Hard disable interrupts again */
-	wrteei	0
-
-	/* Mark them off again in the PACA as well */
-	li	r0,0
-	stb	r0,PACASOFTIRQEN(r13)
-	stb	r0,PACAHARDIRQEN(r13)
-
-	/* Tell lockdep about it */
-#ifdef CONFIG_TRACE_IRQFLAGS
-	bl	.trace_hardirqs_off
-	addi    r1,r1,128
-#endif
+	/* And return (interrupts are on) */
 	ld	r0,16(r1)
 	mtlr	r0
 	blr
@@ -73,14 +62,40 @@ _GLOBAL(book3e_idle)
 1:	/* Let's set the _TLF_NAPPING flag so interrupts make us return
 	 * to the right spot
 	*/
-	clrrdi	r11,r1,THREAD_SHIFT
+	CURRENT_THREAD_INFO(r11, r1)
 	ld	r10,TI_LOCAL_FLAGS(r11)
 	ori	r10,r10,_TLF_NAPPING
 	std	r10,TI_LOCAL_FLAGS(r11)
 
 	/* We can now re-enable hard interrupts and go to sleep */
 	wrteei	1
-1:	PPC_WAIT(0)
+	\loop
+
+.endm
+
+.macro BOOK3E_IDLE_LOOP
+1:
+	PPC_WAIT(0)
 	b	1b
+.endm
+
+/* epapr_ev_idle_start below is patched with the proper hcall
+   opcodes during kernel initialization */
+.macro EPAPR_EV_IDLE_LOOP
+idle_loop:
+	LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+	li      r3, -1
+	nop
+	nop
+	nop
+	b       idle_loop
+.endm
+
+BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP
+
+BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP
 
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 4f0ab85f378..15448668988 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -21,7 +21,7 @@
 	.text
 
 _GLOBAL(e500_idle)
-	rlwinm	r3,r1,0,0,31-THREAD_SHIFT	/* current thread_info */
+	CURRENT_THREAD_INFO(r3, r1)
 	lwz	r4,TI_LOCAL_FLAGS(r3)	/* set napping bit */
 	ori	r4,r4,_TLF_NAPPING	/* so when we take an exception */
 	stw	r4,TI_LOCAL_FLAGS(r3)	/* it will return to our caller */
@@ -96,7 +96,7 @@ _GLOBAL(power_save_ppc32_restore)
 	stw	r9,_NIP(r11)		/* make it do a blr */
 
 #ifdef CONFIG_SMP
-	rlwinm	r12,r1,0,0,31-THREAD_SHIFT
+	CURRENT_THREAD_INFO(r12, r1)
 	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
 	slwi	r11,r11,2
 #else
diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
index ba319547860..f57a19348bd 100644
--- a/arch/powerpc/kernel/idle_power4.S
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -14,6 +14,7 @@
 #include <asm/thread_info.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
+#include <asm/irqflags.h>
 
 #undef DEBUG
 
@@ -29,19 +30,36 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	cmpwi	0,r4,0
 	beqlr
 
-	/* Go to NAP now */
+	/* Hard disable interrupts */
 	mfmsr	r7
 	rldicl	r0,r7,48,1
 	rotldi	r0,r0,16
-	mtmsrd	r0,1			/* hard-disable interrupts */
+	mtmsrd	r0,1
+
+	/* Check if something happened while soft-disabled */
+	lbz	r0,PACAIRQHAPPENED(r13)
+	cmpwi	cr0,r0,0
+	bnelr
+
+	/* Soft-enable interrupts */
+#ifdef CONFIG_TRACE_IRQFLAGS
+	mflr	r0
+	std	r0,16(r1)
+	stdu    r1,-128(r1)
+	bl	trace_hardirqs_on
+	addi    r1,r1,128
+	ld	r0,16(r1)
+	mtlr	r0
+	mfmsr	r7
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
 	li	r0,1
 	stb	r0,PACASOFTIRQEN(r13)	/* we'll hard-enable shortly */
-	stb	r0,PACAHARDIRQEN(r13)
 BEGIN_FTR_SECTION
 	DSSALL
 	sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-	clrrdi	r9,r1,THREAD_SHIFT	/* current thread_info */
+	CURRENT_THREAD_INFO(r9, r1)
 	ld	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
 	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
 	std	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index fcdff198da4..5cf3d367190 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -1,5 +1,5 @@
 /*
- *  This file contains the power_save function for 970-family CPUs.
+ *  This file contains the power_save function for Power7 CPUs.
  *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License
@@ -15,18 +15,37 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/ppc-opcode.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/opal.h>
 
 #undef DEBUG
 
-	.text
+/* Idle state entry routines */
 
-_GLOBAL(power7_idle)
-	/* Now check if user or arch enabled NAP mode */
-	LOAD_REG_ADDRBASE(r3,powersave_nap)
-	lwz	r4,ADDROFF(powersave_nap)(r3)
-	cmpwi	0,r4,0
-	beqlr
+#define	IDLE_STATE_ENTER_SEQ(IDLE_INST)				\
+	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
+	std	r0,0(r1);					\
+	ptesync;						\
+	ld	r0,0(r1);					\
+1:	cmp	cr0,r0,r0;					\
+	bne	1b;						\
+	IDLE_INST;						\
+	b	.
 
+	.text
+
+/*
+ * Pass requested state in r3:
+ * 	0 - nap
+ * 	1 - sleep
+ *
+ * To check IRQ_HAPPENED in r4
+ * 	0 - don't check
+ * 	1 - check
+ */
+_GLOBAL(power7_powersave_common)
+	/* Use r3 to pass state nap/sleep/winkle */
 	/* NAP is a state loss, we create a regs frame on the
 	 * stack, fill it up with the state we care about and
 	 * stick a pointer to it in PACAR1. We really only
@@ -43,7 +62,7 @@ _GLOBAL(power7_idle)
 	/* Make sure FPU, VSX etc... are flushed as we may lose
 	 * state when going to nap mode
 	 */
-	bl	.discard_lazy_cpu_state
+	bl	discard_lazy_cpu_state
 #endif /* CONFIG_SMP */
 
 	/* Hard disable interrupts */
@@ -51,27 +70,96 @@ _GLOBAL(power7_idle)
 	rldicl	r9,r9,48,1
 	rotldi	r9,r9,16
 	mtmsrd	r9,1			/* hard-disable interrupts */
+
+	/* Check if something happened while soft-disabled */
+	lbz	r0,PACAIRQHAPPENED(r13)
+	cmpwi	cr0,r0,0
+	beq	1f
+	cmpwi	cr0,r4,0
+	beq	1f
+	addi	r1,r1,INT_FRAME_SIZE
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+1:	/* We mark irqs hard disabled as this is the state we'll
+	 * be in when returning and we need to tell arch_local_irq_restore()
+	 * about it
+	 */
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+
+	/* We haven't lost state ... yet */
 	li	r0,0
-	stb	r0,PACASOFTIRQEN(r13)	/* we'll hard-enable shortly */
-	stb	r0,PACAHARDIRQEN(r13)
 	stb	r0,PACA_NAPSTATELOST(r13)
 
 	/* Continue saving state */
 	SAVE_GPR(2, r1)
 	SAVE_NVGPRS(r1)
-	mfcr	r3
-	std	r3,_CCR(r1)
+	mfcr	r4
+	std	r4,_CCR(r1)
 	std	r9,_MSR(r1)
 	std	r1,PACAR1(r13)
 
-	/* Magic NAP mode enter sequence */
-	std	r0,0(r1)
-	ptesync
-	ld	r0,0(r1)
-1:	cmp	cr0,r0,r0
-	bne	1b
-	PPC_NAP
-	b	.
+_GLOBAL(power7_enter_nap_mode)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* Tell KVM we're napping */
+	li	r4,KVM_HWTHREAD_IN_NAP
+	stb	r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
+	cmpwi	cr0,r3,1
+	beq	2f
+	IDLE_STATE_ENTER_SEQ(PPC_NAP)
+	/* No return */
+2:	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+	/* No return */
+
+_GLOBAL(power7_idle)
+	/* Now check if user or arch enabled NAP mode */
+	LOAD_REG_ADDRBASE(r3,powersave_nap)
+	lwz	r4,ADDROFF(powersave_nap)(r3)
+	cmpwi	0,r4,0
+	beqlr
+	li	r3, 1
+	/* fall through */
+
+_GLOBAL(power7_nap)
+	mr	r4,r3
+	li	r3,0
+	b	power7_powersave_common
+	/* No return */
+
+_GLOBAL(power7_sleep)
+	li	r3,1
+	li	r4,1
+	b	power7_powersave_common
+	/* No return */
+
+_GLOBAL(power7_wakeup_tb_loss)
+	ld	r2,PACATOC(r13);
+	ld	r1,PACAR1(r13)
+
+	/* Time base re-sync */
+	li	r0,OPAL_RESYNC_TIMEBASE
+	LOAD_REG_ADDR(r11,opal);
+	ld	r12,8(r11);
+	ld	r2,0(r11);
+	mtctr	r12
+	bctrl
+
+	/* TODO: Check r3 for failure */
+
+	REST_NVGPRS(r1)
+	REST_GPR(2, r1)
+	ld	r3,_CCR(r1)
+	ld	r4,_MSR(r1)
+	ld	r5,_NIP(r1)
+	addi	r1,r1,INT_FRAME_SIZE
+	mtcr	r3
+	mfspr	r3,SPRN_SRR1		/* Return SRR1 */
+	mtspr	SPRN_SRR1,r4
+	mtspr	SPRN_SRR0,r5
+	rfid
 
 _GLOBAL(power7_wakeup_loss)
 	ld	r1,PACAR1(r13)
@@ -89,7 +177,7 @@ _GLOBAL(power7_wakeup_loss)
 _GLOBAL(power7_wakeup_noloss)
 	lbz	r0,PACA_NAPSTATELOST(r13)
 	cmpwi	r0,0
-	bne	.power7_wakeup_loss
+	bne	power7_wakeup_loss
 	ld	r1,PACAR1(r13)
 	ld	r4,_MSR(r1)
 	ld	r5,_NIP(r1)
diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c
deleted file mode 100644
index d076d465dbd..00000000000
--- a/arch/powerpc/kernel/init_task.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <linux/mm.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-#include <asm/uaccess.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is 16384-byte aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
-	{ INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-
-EXPORT_SYMBOL(init_task);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 12d329bcbb9..24b968f8e4d 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -53,8 +53,10 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
 	return NULL;
 }
 
+#ifdef CONFIG_PPC_INDIRECT_MMIO
 struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 {
+	unsigned hugepage_shift;
 	struct iowa_bus *bus;
 	int token;
 
@@ -70,11 +72,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 		if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
 			return NULL;
 
-		ptep = find_linux_pte(init_mm.pgd, vaddr);
+		ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+						 &hugepage_shift);
 		if (ptep == NULL)
 			paddr = 0;
-		else
+		else {
+			/*
+			 * we don't have hugepages backing iomem
+			 */
+			WARN_ON(hugepage_shift);
 			paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+		}
 		bus = iowa_pci_find(vaddr, paddr);
 
 		if (bus == NULL)
@@ -83,13 +91,25 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 
 	return bus;
 }
+#else /* CONFIG_PPC_INDIRECT_MMIO */
+struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
+{
+	return NULL;
+}
+#endif /* !CONFIG_PPC_INDIRECT_MMIO */
 
+#ifdef CONFIG_PPC_INDIRECT_PIO
 struct iowa_bus *iowa_pio_find_bus(unsigned long port)
 {
 	unsigned long vaddr = (unsigned long)pci_io_base + port;
 	return iowa_pci_find(vaddr, 0);
 }
-
+#else
+struct iowa_bus *iowa_pio_find_bus(unsigned long port)
+{
+	return NULL;
+}
+#endif
 
 #define DEF_PCI_AC_RET(name, ret, at, al, space, aa)		\
 static ret iowa_##name at					\
@@ -118,7 +138,7 @@ static void iowa_##name at					\
 #undef DEF_PCI_AC_RET
 #undef DEF_PCI_AC_NORET
 
-static const struct ppc_pci_io __devinitconst iowa_pci_io = {
+static const struct ppc_pci_io iowa_pci_io = {
 
 #define DEF_PCI_AC_RET(name, ret, at, al, space, aa)	.name = iowa_##name,
 #define DEF_PCI_AC_NORET(name, at, al, space, aa)	.name = iowa_##name,
@@ -130,6 +150,7 @@ static const struct ppc_pci_io __devinitconst iowa_pci_io = {
 
 };
 
+#ifdef CONFIG_PPC_INDIRECT_MMIO
 static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
 				  unsigned long flags, void *caller)
 {
@@ -144,9 +165,12 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
 	}
 	return res;
 }
+#else /* CONFIG_PPC_INDIRECT_MMIO */
+#define iowa_ioremap NULL
+#endif /* !CONFIG_PPC_INDIRECT_MMIO */
 
 /* Enable IO workaround */
-static void __devinit io_workaround_init(void)
+static void io_workaround_init(void)
 {
 	static int io_workaround_inited;
 
@@ -158,9 +182,8 @@ static void __devinit io_workaround_init(void)
 }
 
 /* Register new bus to support workaround */
-void __devinit iowa_register_bus(struct pci_controller *phb,
-			struct ppc_pci_io *ops,
-			int (*initfunc)(struct iowa_bus *, void *), void *data)
+void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops,
+		       int (*initfunc)(struct iowa_bus *, void *), void *data)
 {
 	struct iowa_bus *bus;
 	struct device_node *np = phb->dn;
diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c
index 886381f32c3..2a2b4aeab80 100644
--- a/arch/powerpc/kernel/io.c
+++ b/arch/powerpc/kernel/io.c
@@ -25,6 +25,9 @@
 #include <asm/firmware.h>
 #include <asm/bug.h>
 
+/* See definition in io.h */
+bool isa_io_special;
+
 void _insb(const volatile u8 __iomem *port, void *buf, long count)
 {
 	u8 *tbuf = buf;
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index 97a3715ac8b..12e48d56f77 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -3,7 +3,6 @@
  *
  * (C) Copyright 2004 Linus Torvalds
  */
-#include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/mm.h>
 #include <linux/export.h>
@@ -24,7 +23,7 @@ unsigned int ioread16(void __iomem *addr)
 }
 unsigned int ioread16be(void __iomem *addr)
 {
-	return in_be16(addr);
+	return readw_be(addr);
 }
 unsigned int ioread32(void __iomem *addr)
 {
@@ -32,7 +31,7 @@ unsigned int ioread32(void __iomem *addr)
 }
 unsigned int ioread32be(void __iomem *addr)
 {
-	return in_be32(addr);
+	return readl_be(addr);
 }
 EXPORT_SYMBOL(ioread8);
 EXPORT_SYMBOL(ioread16);
@@ -50,7 +49,7 @@ void iowrite16(u16 val, void __iomem *addr)
 }
 void iowrite16be(u16 val, void __iomem *addr)
 {
-	out_be16(addr, val);
+	writew_be(val, addr);
 }
 void iowrite32(u32 val, void __iomem *addr)
 {
@@ -58,7 +57,7 @@ void iowrite32(u32 val, void __iomem *addr)
 }
 void iowrite32be(u32 val, void __iomem *addr)
 {
-	out_be32(addr, val);
+	writel_be(val, addr);
 }
 EXPORT_SYMBOL(iowrite8);
 EXPORT_SYMBOL(iowrite16);
@@ -76,15 +75,15 @@ EXPORT_SYMBOL(iowrite32be);
  */
 void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
 {
-	_insb((u8 __iomem *) addr, dst, count);
+	readsb(addr, dst, count);
 }
 void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
 {
-	_insw_ns((u16 __iomem *) addr, dst, count);
+	readsw(addr, dst, count);
 }
 void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
 {
-	_insl_ns((u32 __iomem *) addr, dst, count);
+	readsl(addr, dst, count);
 }
 EXPORT_SYMBOL(ioread8_rep);
 EXPORT_SYMBOL(ioread16_rep);
@@ -92,15 +91,15 @@ EXPORT_SYMBOL(ioread32_rep);
 
 void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
 {
-	_outsb((u8 __iomem *) addr, src, count);
+	writesb(addr, src, count);
 }
 void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
 {
-	_outsw_ns((u16 __iomem *) addr, src, count);
+	writesw(addr, src, count);
 }
 void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
 {
-	_outsl_ns((u32 __iomem *) addr, src, count);
+	writesl(addr, src, count);
 }
 EXPORT_SYMBOL(iowrite8_rep);
 EXPORT_SYMBOL(iowrite16_rep);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0cfcf98aafc..88e3ec6e1d9 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,12 +33,20 @@
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
+#include <linux/hash.h>
+#include <linux/fault-inject.h>
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/kdump.h>
+#include <asm/fadump.h>
+#include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -57,6 +65,114 @@ static int __init setup_iommu(char *str)
 
 __setup("iommu=", setup_iommu);
 
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
+ * with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+	return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
+#ifdef CONFIG_FAIL_IOMMU
+
+static DECLARE_FAULT_ATTR(fail_iommu);
+
+static int __init setup_fail_iommu(char *str)
+{
+	return setup_fault_attr(&fail_iommu, str);
+}
+__setup("fail_iommu=", setup_fail_iommu);
+
+static bool should_fail_iommu(struct device *dev)
+{
+	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
+}
+
+static int __init fail_iommu_debugfs(void)
+{
+	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
+						       NULL, &fail_iommu);
+
+	return PTR_ERR_OR_ZERO(dir);
+}
+late_initcall(fail_iommu_debugfs);
+
+static ssize_t fail_iommu_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
+}
+
+static ssize_t fail_iommu_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	int i;
+
+	if (count > 0 && sscanf(buf, "%d", &i) > 0)
+		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
+
+	return count;
+}
+
+static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show,
+		   fail_iommu_store);
+
+static int fail_iommu_bus_notify(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		if (device_create_file(dev, &dev_attr_fail_iommu))
+			pr_warn("Unable to create IOMMU fault injection sysfs "
+				"entries\n");
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		device_remove_file(dev, &dev_attr_fail_iommu);
+	}
+
+	return 0;
+}
+
+static struct notifier_block fail_iommu_bus_notifier = {
+	.notifier_call = fail_iommu_bus_notify
+};
+
+static int __init fail_iommu_setup(void)
+{
+#ifdef CONFIG_PCI
+	bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
+#endif
+#ifdef CONFIG_IBMVIO
+	bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
+#endif
+
+	return 0;
+}
+/*
+ * Must execute after PCI and VIO subsystem have initialised but before
+ * devices are probed.
+ */
+arch_initcall(fail_iommu_setup);
+#else
+static inline bool should_fail_iommu(struct device *dev)
+{
+	return false;
+}
+#endif
+
 static unsigned long iommu_range_alloc(struct device *dev,
 				       struct iommu_table *tbl,
                                        unsigned long npages,
@@ -70,6 +186,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	int pass = 0;
 	unsigned long align_mask;
 	unsigned long boundary_size;
+	unsigned long flags;
+	unsigned int pool_nr;
+	struct iommu_pool *pool;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -82,59 +201,83 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		return DMA_ERROR_CODE;
 	}
 
-	if (handle && *handle)
-		start = *handle;
+	if (should_fail_iommu(dev))
+		return DMA_ERROR_CODE;
+
+	/*
+	 * We don't need to disable preemption here because any CPU can
+	 * safely use any IOMMU pool.
+	 */
+	pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
+
+	if (largealloc)
+		pool = &(tbl->large_pool);
 	else
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+		pool = &(tbl->pools[pool_nr]);
+
+	spin_lock_irqsave(&(pool->lock), flags);
 
-	/* Use only half of the table for small allocs (15 pages or less) */
-	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+again:
+	if ((pass == 0) && handle && *handle &&
+	    (*handle >= pool->start) && (*handle < pool->end))
+		start = *handle;
+	else
+		start = pool->hint;
 
-	if (largealloc && start < tbl->it_halfpoint)
-		start = tbl->it_halfpoint;
+	limit = pool->end;
 
 	/* The case below can happen if we have a small segment appended
 	 * to a large, or when the previous alloc was at the very end of
 	 * the available space. If so, go back to the initial start.
 	 */
 	if (start >= limit)
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+		start = pool->start;
 
 	if (limit + tbl->it_offset > mask) {
 		limit = mask - tbl->it_offset + 1;
 		/* If we're constrained on address range, first try
 		 * at the masked hint to avoid O(n) search complexity,
-		 * but on second pass, start at 0.
+		 * but on second pass, start at 0 in pool 0.
 		 */
-		if ((start & mask) >= limit || pass > 0)
-			start = 0;
-		else
+		if ((start & mask) >= limit || pass > 0) {
+			spin_unlock(&(pool->lock));
+			pool = &(tbl->pools[0]);
+			spin_lock(&(pool->lock));
+			start = pool->start;
+		} else {
 			start &= mask;
+		}
 	}
 
 	if (dev)
 		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-				      1 << IOMMU_PAGE_SHIFT);
+				      1 << tbl->it_page_shift);
 	else
-		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT);
+		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
 	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
-	n = iommu_area_alloc(tbl->it_map, limit, start, npages,
-			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
-			     align_mask);
+	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
+			     boundary_size >> tbl->it_page_shift, align_mask);
 	if (n == -1) {
-		if (likely(pass < 2)) {
-			/* First failure, just rescan the half of the table.
-			 * Second failure, rescan the other half of the table.
-			 */
-			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
-			limit = pass ? tbl->it_size : limit;
+		if (likely(pass == 0)) {
+			/* First try the pool from the start */
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
+		} else if (pass <= tbl->nr_pools) {
+			/* Now try scanning all the other pools */
+			spin_unlock(&(pool->lock));
+			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+			pool = &tbl->pools[pool_nr];
+			spin_lock(&(pool->lock));
+			pool->hint = pool->start;
 			pass++;
 			goto again;
+
 		} else {
-			/* Third failure, give up */
+			/* Give up */
+			spin_unlock_irqrestore(&(pool->lock), flags);
 			return DMA_ERROR_CODE;
 		}
 	}
@@ -144,10 +287,10 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	/* Bump the hint to a new block for small allocs. */
 	if (largealloc) {
 		/* Don't bump to new block to avoid fragmentation */
-		tbl->it_largehint = end;
+		pool->hint = end;
 	} else {
 		/* Overflow will be taken care of at the next allocation */
-		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		pool->hint = (end + tbl->it_blocksize - 1) &
 		                ~(tbl->it_blocksize - 1);
 	}
 
@@ -155,6 +298,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	if (handle)
 		*handle = end;
 
+	spin_unlock_irqrestore(&(pool->lock), flags);
+
 	return n;
 }
 
@@ -164,26 +309,22 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 			      unsigned long mask, unsigned int align_order,
 			      struct dma_attrs *attrs)
 {
-	unsigned long entry, flags;
+	unsigned long entry;
 	dma_addr_t ret = DMA_ERROR_CODE;
 	int build_fail;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
 
-	if (unlikely(entry == DMA_ERROR_CODE)) {
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	if (unlikely(entry == DMA_ERROR_CODE))
 		return DMA_ERROR_CODE;
-	}
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
-	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
+	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
 	build_fail = ppc_md.tce_build(tbl, entry, npages,
-	                              (unsigned long)page & IOMMU_PAGE_MASK,
-	                              direction, attrs);
+				      (unsigned long)page &
+				      IOMMU_PAGE_MASK(tbl), direction, attrs);
 
 	/* ppc_md.tce_build() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
@@ -192,8 +333,6 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	 */
 	if (unlikely(build_fail)) {
 		__iommu_free(tbl, ret, npages);
-
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
 		return DMA_ERROR_CODE;
 	}
 
@@ -201,20 +340,18 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	/* Make sure updates are seen by hardware */
 	mb();
 
 	return ret;
 }
 
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
-			 unsigned int npages)
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+			     unsigned int npages)
 {
 	unsigned long entry, free_entry;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	entry = dma_addr >> tbl->it_page_shift;
 	free_entry = entry - tbl->it_offset;
 
 	if (((free_entry + npages) > tbl->it_size) ||
@@ -230,20 +367,57 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
 			WARN_ON(1);
 		}
-		return;
+
+		return false;
 	}
 
+	return true;
+}
+
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+				   unsigned long entry)
+{
+	struct iommu_pool *p;
+	unsigned long largepool_start = tbl->large_pool.start;
+
+	/* The large pool is the last pool at the top of the table */
+	if (entry >= largepool_start) {
+		p = &tbl->large_pool;
+	} else {
+		unsigned int pool_nr = entry / tbl->poolsize;
+
+		BUG_ON(pool_nr > tbl->nr_pools);
+		p = &tbl->pools[pool_nr];
+	}
+
+	return p;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+	unsigned long flags;
+	struct iommu_pool *pool;
+
+	entry = dma_addr >> tbl->it_page_shift;
+	free_entry = entry - tbl->it_offset;
+
+	pool = get_pool(tbl, free_entry);
+
+	if (!iommu_free_check(tbl, dma_addr, npages))
+		return;
+
 	ppc_md.tce_free(tbl, entry, npages);
+
+	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
+	spin_unlock_irqrestore(&(pool->lock), flags);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 		unsigned int npages)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	__iommu_free(tbl, dma_addr, npages);
 
 	/* Make sure TLB cache is flushed if the HW needs it. We do
@@ -252,8 +426,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -262,7 +434,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		 struct dma_attrs *attrs)
 {
 	dma_addr_t dma_next = 0, dma_addr;
-	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
 	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
@@ -284,8 +455,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 	DBG("sg mapping %d elements:\n", nelems);
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	max_seg_size = dma_get_max_seg_size(dev);
 	for_each_sg(sglist, s, nelems, i) {
 		unsigned long vaddr, npages, entry, slen;
@@ -298,13 +467,13 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		}
 		/* Allocate iommu entries for that segment */
 		vaddr = (unsigned long) sg_virt(s);
-		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE);
+		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
 		align = 0;
-		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE &&
+		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
 		    (vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+			align = PAGE_SHIFT - tbl->it_page_shift;
 		entry = iommu_range_alloc(dev, tbl, npages, &handle,
-					  mask >> IOMMU_PAGE_SHIFT, align);
+					  mask >> tbl->it_page_shift, align);
 
 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
 
@@ -319,16 +488,16 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 		/* Convert entry to a dma_addr_t */
 		entry += tbl->it_offset;
-		dma_addr = entry << IOMMU_PAGE_SHIFT;
-		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
+		dma_addr = entry << tbl->it_page_shift;
+		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
 
 		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
 		build_fail = ppc_md.tce_build(tbl, entry, npages,
-		                              vaddr & IOMMU_PAGE_MASK,
-		                              direction, attrs);
+					      vaddr & IOMMU_PAGE_MASK(tbl),
+					      direction, attrs);
 		if(unlikely(build_fail))
 			goto failure;
 
@@ -368,8 +537,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	DBG("mapped %d elements:\n", outcount);
 
 	/* For the sake of iommu_unmap_sg, we clear out the length in the
@@ -391,9 +558,9 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		if (s->dma_length != 0) {
 			unsigned long vaddr, npages;
 
-			vaddr = s->dma_address & IOMMU_PAGE_MASK;
+			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
-						 IOMMU_PAGE_SIZE);
+						 IOMMU_PAGE_SIZE(tbl));
 			__iommu_free(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
@@ -401,7 +568,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		if (s == outs)
 			break;
 	}
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 	return 0;
 }
 
@@ -411,15 +577,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
-	unsigned long flags;
 
 	BUG_ON(direction == DMA_NONE);
 
 	if (!tbl)
 		return;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	sg = sglist;
 	while (nelems--) {
 		unsigned int npages;
@@ -428,7 +591,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		if (sg->dma_length == 0)
 			break;
 		npages = iommu_num_pages(dma_handle, sg->dma_length,
-					 IOMMU_PAGE_SIZE);
+					 IOMMU_PAGE_SIZE(tbl));
 		__iommu_free(tbl, dma_handle, npages);
 		sg = sg_next(sg);
 	}
@@ -439,13 +602,16 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
 {
-	if (!is_kdump_kernel()) {
+	/*
+	 * In case of firmware assisted dump system goes through clean
+	 * reboot process at the time of system crash. Hence it's safe to
+	 * clear the TCE entries if firmware assisted dump is active.
+	 */
+	if (!is_kdump_kernel() || is_fadump_active()) {
 		/* Clear the table in case firmware left allocations in it */
 		ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
 		return;
@@ -488,14 +654,13 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned long sz;
 	static int welcomed = 0;
 	struct page *page;
-
-	/* Set aside 1/4 of the table for large allocations. */
-	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+	unsigned int i;
+	struct iommu_pool *p;
 
 	/* number of bytes needed for the bitmap */
-	sz = (tbl->it_size + 7) >> 3;
+	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
-	page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
+	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
 	if (!page)
 		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
 	tbl->it_map = page_address(page);
@@ -509,9 +674,28 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	tbl->it_hint = 0;
-	tbl->it_largehint = tbl->it_halfpoint;
-	spin_lock_init(&tbl->it_lock);
+	/* We only split the IOMMU table if we have 1GB or more of space */
+	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
+		tbl->nr_pools = IOMMU_NR_POOLS;
+	else
+		tbl->nr_pools = 1;
+
+	/* We reserve the top 1/4 of the table for large allocations */
+	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
+
+	for (i = 0; i < tbl->nr_pools; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
 
 	iommu_table_clear(tbl);
 
@@ -526,7 +710,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
-	unsigned long bitmap_sz, i;
+	unsigned long bitmap_sz;
 	unsigned int order;
 
 	if (!tbl || !tbl->it_map) {
@@ -535,18 +719,26 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 		return;
 	}
 
-	/* verify that table contains no entries */
-	/* it_size is in entries, and we're examining 64 at a time */
-	for (i = 0; i < (tbl->it_size/64); i++) {
-		if (tbl->it_map[i] != 0) {
-			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
-				__func__, node_name);
-			break;
-		}
+	/*
+	 * In case we have reserved the first bit, we should not emit
+	 * the warning below.
+	 */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+#ifdef CONFIG_IOMMU_API
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
 	}
+#endif
+
+	/* verify that table contains no entries */
+	if (!bitmap_empty(tbl->it_map, tbl->it_size))
+		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
 
 	/* calculate bitmap size in bytes */
-	bitmap_sz = (tbl->it_size + 7) / 8;
+	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
 	/* free bitmap */
 	order = get_order(bitmap_sz);
@@ -575,16 +767,16 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 
 	vaddr = page_address(page) + offset;
 	uaddr = (unsigned long)vaddr;
-	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE);
+	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
 
 	if (tbl) {
 		align = 0;
-		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && size >= PAGE_SIZE &&
+		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+			align = PAGE_SHIFT - tbl->it_page_shift;
 
 		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
-					 mask >> IOMMU_PAGE_SHIFT, align,
+					 mask >> tbl->it_page_shift, align,
 					 attrs);
 		if (dma_handle == DMA_ERROR_CODE) {
 			if (printk_ratelimit())  {
@@ -593,7 +785,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 					 npages);
 			}
 		} else
-			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
+			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
 	}
 
 	return dma_handle;
@@ -608,7 +800,8 @@ void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
 	BUG_ON(direction == DMA_NONE);
 
 	if (tbl) {
-		npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE);
+		npages = iommu_num_pages(dma_handle, size,
+					 IOMMU_PAGE_SIZE(tbl));
 		iommu_free(tbl, dma_handle, npages);
 	}
 }
@@ -652,10 +845,10 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 	memset(ret, 0, size);
 
 	/* Set up tces to cover the allocated range */
-	nio_pages = size >> IOMMU_PAGE_SHIFT;
-	io_order = get_iommu_order(size);
+	nio_pages = size >> tbl->it_page_shift;
+	io_order = get_iommu_order(size, tbl);
 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
-			      mask >> IOMMU_PAGE_SHIFT, io_order, NULL);
+			      mask >> tbl->it_page_shift, io_order, NULL);
 	if (mapping == DMA_ERROR_CODE) {
 		free_pages((unsigned long)ret, order);
 		return NULL;
@@ -671,9 +864,311 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		unsigned int nio_pages;
 
 		size = PAGE_ALIGN(size);
-		nio_pages = size >> IOMMU_PAGE_SHIFT;
+		nio_pages = size >> tbl->it_page_shift;
 		iommu_free(tbl, dma_handle, nio_pages);
 		size = PAGE_ALIGN(size);
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+void iommu_register_group(struct iommu_table *tbl,
+		int pci_domain_number, unsigned long pe_num)
+{
+	struct iommu_group *grp;
+	char *name;
+
+	grp = iommu_group_alloc();
+	if (IS_ERR(grp)) {
+		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
+				PTR_ERR(grp));
+		return;
+	}
+	tbl->it_group = grp;
+	iommu_group_set_iommudata(grp, tbl, group_release);
+	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
+			pci_domain_number, pe_num);
+	if (!name)
+		return;
+	iommu_group_set_name(grp, name);
+	kfree(name);
+}
+
+enum dma_data_direction iommu_tce_direction(unsigned long tce)
+{
+	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
+		return DMA_BIDIRECTIONAL;
+	else if (tce & TCE_PCI_READ)
+		return DMA_TO_DEVICE;
+	else if (tce & TCE_PCI_WRITE)
+		return DMA_FROM_DEVICE;
+	else
+		return DMA_NONE;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_direction);
+
+void iommu_flush_tce(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+EXPORT_SYMBOL_GPL(iommu_flush_tce);
+
+int iommu_tce_clear_param_check(struct iommu_table *tbl,
+		unsigned long ioba, unsigned long tce_value,
+		unsigned long npages)
+{
+	/* ppc_md.tce_free() does not support any value but 0 */
+	if (tce_value)
+		return -EINVAL;
+
+	if (ioba & ~IOMMU_PAGE_MASK(tbl))
+		return -EINVAL;
+
+	ioba >>= tbl->it_page_shift;
+	if (ioba < tbl->it_offset)
+		return -EINVAL;
+
+	if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
+
+int iommu_tce_put_param_check(struct iommu_table *tbl,
+		unsigned long ioba, unsigned long tce)
+{
+	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return -EINVAL;
+
+	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
+		return -EINVAL;
+
+	if (ioba & ~IOMMU_PAGE_MASK(tbl))
+		return -EINVAL;
+
+	ioba >>= tbl->it_page_shift;
+	if (ioba < tbl->it_offset)
+		return -EINVAL;
+
+	if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
+
+unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long oldtce;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+		ppc_md.tce_free(tbl, entry, 1);
+	else
+		oldtce = 0;
+
+	spin_unlock(&(pool->lock));
+
+	return oldtce;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	unsigned long oldtce;
+	struct page *page;
+
+	for ( ; pages; --pages, ++entry) {
+		oldtce = iommu_clear_tce(tbl, entry);
+		if (!oldtce)
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+		WARN_ON(!page);
+		if (page) {
+			if (oldtce & TCE_PCI_WRITE)
+				SetPageDirty(page);
+			put_page(page);
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+
+/*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+ */
+int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+		unsigned long hwaddr, enum dma_data_direction direction)
+{
+	int ret = -EBUSY;
+	unsigned long oldtce;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+	/* Add new entry if it is not busy */
+	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+
+	spin_unlock(&(pool->lock));
+
+	/* if (unlikely(ret))
+		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
+			__func__, hwaddr, entry << IOMMU_PAGE_SHIFT(tbl),
+				hwaddr, ret); */
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+		unsigned long tce)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (unlikely(ret != 1)) {
+		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT(tbl), ret); */
+		return -EFAULT;
+	}
+	hwaddr = (unsigned long) page_address(page) + offset;
+
+	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+	if (ret)
+		put_page(page);
+
+	if (ret < 0)
+		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+			__func__, entry << tbl->it_page_shift, tce, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+	unsigned long sz = (tbl->it_size + 7) >> 3;
+
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+		pr_err("iommu_tce: it_map is not empty");
+		return -EBUSY;
+	}
+
+	memset(tbl->it_map, 0xff, sz);
+	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+	/*
+	 * Disable iommu bypass, otherwise the user can DMA to all of
+	 * our physical memory via the bypass window instead of just
+	 * the pages that has been explicitly mapped into the iommu
+	 */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, false);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+	unsigned long sz = (tbl->it_size + 7) >> 3;
+
+	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+	memset(tbl->it_map, 0, sz);
+
+	/* Restore bit#0 set by iommu_init_table() */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	/* The kernel owns the device now, we can restore the iommu bypass */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, true);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+int iommu_add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl || !tbl->it_group) {
+		pr_debug("iommu_tce: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("iommu_tce: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
+		pr_err("iommu_tce: unsupported iommu page size.");
+		pr_err("%s has not been added\n", dev_name(dev));
+		return -EINVAL;
+	}
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		pr_err("iommu_tce: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_add_device);
+
+void iommu_del_device(struct device *dev)
+{
+	/*
+	 * Some devices might not have IOMMU table and group
+	 * and we needn't detach them from the associated
+	 * IOMMU groups
+	 */
+	if (!dev->iommu_group) {
+		pr_debug("iommu_tce: skipping device %s with no tbl\n",
+			 dev_name(dev));
+		return;
+	}
+
+	iommu_group_remove_device(dev);
+}
+EXPORT_SYMBOL_GPL(iommu_del_device);
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 701d4aceb4f..248ee7e5beb 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -57,7 +57,6 @@
 #include <linux/of_irq.h>
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <asm/io.h>
 #include <asm/pgtable.h>
 #include <asm/irq.h>
@@ -67,6 +66,7 @@
 #include <asm/machdep.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
+#include <asm/debug.h>
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
@@ -93,20 +93,16 @@ extern int tau_interrupts(int);
 
 #ifdef CONFIG_PPC64
 
-#ifndef CONFIG_SPARSE_IRQ
-EXPORT_SYMBOL(irq_desc);
-#endif
-
 int distribute_irqs = 1;
 
-static inline notrace unsigned long get_hard_enabled(void)
+static inline notrace unsigned long get_irq_happened(void)
 {
-	unsigned long enabled;
+	unsigned long happened;
 
 	__asm__ __volatile__("lbz %0,%1(13)"
-	: "=r" (enabled) : "i" (offsetof(struct paca_struct, hard_enabled)));
+	: "=r" (happened) : "i" (offsetof(struct paca_struct, irq_happened)));
 
-	return enabled;
+	return happened;
 }
 
 static inline notrace void set_soft_enabled(unsigned long enable)
@@ -115,84 +111,232 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-static inline notrace void decrementer_check_overflow(void)
+static inline notrace int decrementer_check_overflow(void)
 {
-	u64 now = get_tb_or_rtc();
-	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
-
-	if (now >= *next_tb)
-		set_dec(1);
+ 	u64 now = get_tb_or_rtc();
+ 	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+ 
+	return now >= *next_tb;
 }
 
-notrace void arch_local_irq_restore(unsigned long en)
+/* This is called whenever we are re-enabling interrupts
+ * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if
+ * there's an EE, DEC or DBELL to generate.
+ *
+ * This is called in two contexts: From arch_local_irq_restore()
+ * before soft-enabling interrupts, and from the exception exit
+ * path when returning from an interrupt from a soft-disabled to
+ * a soft enabled context. In both case we have interrupts hard
+ * disabled.
+ *
+ * We take care of only clearing the bits we handled in the
+ * PACA irq_happened field since we can only re-emit one at a
+ * time and we don't want to "lose" one.
+ */
+notrace unsigned int __check_irq_replay(void)
 {
 	/*
-	 * get_paca()->soft_enabled = en;
-	 * Is it ever valid to use local_irq_restore(0) when soft_enabled is 1?
-	 * That was allowed before, and in such a case we do need to take care
-	 * that gcc will set soft_enabled directly via r13, not choose to use
-	 * an intermediate register, lest we're preempted to a different cpu.
+	 * We use local_paca rather than get_paca() to avoid all
+	 * the debug_smp_processor_id() business in this low level
+	 * function
+	 */
+	unsigned char happened = local_paca->irq_happened;
+
+	/* Clear bit 0 which we wouldn't clear otherwise */
+	local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+
+	/*
+	 * Force the delivery of pending soft-disabled interrupts on PS3.
+	 * Any HV call will have this side effect.
+	 */
+	if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+		u64 tmp, tmp2;
+		lv1_get_version_info(&tmp, &tmp2);
+	}
+
+	/*
+	 * We may have missed a decrementer interrupt. We check the
+	 * decrementer itself rather than the paca irq_happened field
+	 * in case we also had a rollover while hard disabled
+	 */
+	local_paca->irq_happened &= ~PACA_IRQ_DEC;
+	if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow())
+		return 0x900;
+
+	/* Finally check if an external interrupt happened */
+	local_paca->irq_happened &= ~PACA_IRQ_EE;
+	if (happened & PACA_IRQ_EE)
+		return 0x500;
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* Finally check if an EPR external interrupt happened
+	 * this bit is typically set if we need to handle another
+	 * "edge" interrupt from within the MPIC "EPR" handler
 	 */
+	local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
+	if (happened & PACA_IRQ_EE_EDGE)
+		return 0x500;
+
+	local_paca->irq_happened &= ~PACA_IRQ_DBELL;
+	if (happened & PACA_IRQ_DBELL)
+		return 0x280;
+#else
+	local_paca->irq_happened &= ~PACA_IRQ_DBELL;
+	if (happened & PACA_IRQ_DBELL) {
+		if (cpu_has_feature(CPU_FTR_HVMODE))
+			return 0xe80;
+		return 0xa00;
+	}
+#endif /* CONFIG_PPC_BOOK3E */
+
+	/* There should be nothing left ! */
+	BUG_ON(local_paca->irq_happened != 0);
+
+	return 0;
+}
+
+notrace void arch_local_irq_restore(unsigned long en)
+{
+	unsigned char irq_happened;
+	unsigned int replay;
+
+	/* Write the new soft-enabled value */
 	set_soft_enabled(en);
 	if (!en)
 		return;
+	/*
+	 * From this point onward, we can take interrupts, preempt,
+	 * etc... unless we got hard-disabled. We check if an event
+	 * happened. If none happened, we know we can just return.
+	 *
+	 * We may have preempted before the check below, in which case
+	 * we are checking the "new" CPU instead of the old one. This
+	 * is only a problem if an event happened on the "old" CPU.
+	 *
+	 * External interrupt events will have caused interrupts to
+	 * be hard-disabled, so there is no problem, we
+	 * cannot have preempted.
+	 */
+	irq_happened = get_irq_happened();
+	if (!irq_happened)
+		return;
 
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (firmware_has_feature(FW_FEATURE_ISERIES)) {
+	/*
+	 * We need to hard disable to get a trusted value from
+	 * __check_irq_replay(). We also need to soft-disable
+	 * again to avoid warnings in there due to the use of
+	 * per-cpu variables.
+	 *
+	 * We know that if the value in irq_happened is exactly 0x01
+	 * then we are already hard disabled (there are other less
+	 * common cases that we'll ignore for now), so we skip the
+	 * (expensive) mtmsrd.
+	 */
+	if (unlikely(irq_happened != PACA_IRQ_HARD_DIS))
+		__hard_irq_disable();
+#ifdef CONFIG_TRACE_IRQFLAGS
+	else {
 		/*
-		 * Do we need to disable preemption here?  Not really: in the
-		 * unlikely event that we're preempted to a different cpu in
-		 * between getting r13, loading its lppaca_ptr, and loading
-		 * its any_int, we might call iseries_handle_interrupts without
-		 * an interrupt pending on the new cpu, but that's no disaster,
-		 * is it?  And the business of preempting us off the old cpu
-		 * would itself involve a local_irq_restore which handles the
-		 * interrupt to that cpu.
-		 *
-		 * But use "local_paca->lppaca_ptr" instead of "get_lppaca()"
-		 * to avoid any preemption checking added into get_paca().
+		 * We should already be hard disabled here. We had bugs
+		 * where that wasn't the case so let's dbl check it and
+		 * warn if we are wrong. Only do that when IRQ tracing
+		 * is enabled as mfmsr() can be costly.
 		 */
-		if (local_paca->lppaca_ptr->int_dword.any_int)
-			iseries_handle_interrupts();
+		if (WARN_ON(mfmsr() & MSR_EE))
+			__hard_irq_disable();
 	}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_TRACE_IRQFLAG */
+
+	set_soft_enabled(0);
+
+	/*
+	 * Check if anything needs to be re-emitted. We haven't
+	 * soft-enabled yet to avoid warnings in decrementer_check_overflow
+	 * accessing per-cpu variables
+	 */
+	replay = __check_irq_replay();
+
+	/* We can soft-enable now */
+	set_soft_enabled(1);
 
 	/*
-	 * if (get_paca()->hard_enabled) return;
-	 * But again we need to take care that gcc gets hard_enabled directly
-	 * via r13, not choose to use an intermediate register, lest we're
-	 * preempted to a different cpu in between the two instructions.
+	 * And replay if we have to. This will return with interrupts
+	 * hard-enabled.
 	 */
-	if (get_hard_enabled())
+	if (replay) {
+		__replay_interrupt(replay);
 		return;
+	}
+
+	/* Finally, let's ensure we are hard enabled */
+	__hard_irq_enable();
+}
+EXPORT_SYMBOL(arch_local_irq_restore);
 
+/*
+ * This is specifically called by assembly code to re-enable interrupts
+ * if they are currently disabled. This is typically called before
+ * schedule() or do_signal() when returning to userspace. We do it
+ * in C to avoid the burden of dealing with lockdep etc...
+ *
+ * NOTE: This is called with interrupts hard disabled but not marked
+ * as such in paca->irq_happened, so we need to resync this.
+ */
+void notrace restore_interrupts(void)
+{
+	if (irqs_disabled()) {
+		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+		local_irq_enable();
+	} else
+		__hard_irq_enable();
+}
+
+/*
+ * This is a helper to use when about to go into idle low-power
+ * when the latter has the side effect of re-enabling interrupts
+ * (such as calling H_CEDE under pHyp).
+ *
+ * You call this function with interrupts soft-disabled (this is
+ * already the case when ppc_md.power_save is called). The function
+ * will return whether to enter power save or just return.
+ *
+ * In the former case, it will have notified lockdep of interrupts
+ * being re-enabled and generally sanitized the lazy irq state,
+ * and in the latter case it will leave with interrupts hard
+ * disabled and marked as such, so the local_irq_enable() call
+ * in arch_cpu_idle() will properly re-enable everything.
+ */
+bool prep_irq_for_idle(void)
+{
 	/*
-	 * Need to hard-enable interrupts here.  Since currently disabled,
-	 * no need to take further asm precautions against preemption; but
-	 * use local_paca instead of get_paca() to avoid preemption checking.
+	 * First we need to hard disable to ensure no interrupt
+	 * occurs before we effectively enter the low power state
 	 */
-	local_paca->hard_enabled = en;
+	hard_irq_disable();
 
 	/*
-	 * Trigger the decrementer if we have a pending event. Some processors
-	 * only trigger on edge transitions of the sign bit. We might also
-	 * have disabled interrupts long enough that the decrementer wrapped
-	 * to positive.
+	 * If anything happened while we were soft-disabled,
+	 * we return now and do not enter the low power state.
 	 */
-	decrementer_check_overflow();
+	if (lazy_irq_pending())
+		return false;
+
+	/* Tell lockdep we are about to re-enable */
+	trace_hardirqs_on();
 
 	/*
-	 * Force the delivery of pending soft-disabled interrupts on PS3.
-	 * Any HV call will have this side effect.
+	 * Mark interrupts as soft-enabled and clear the
+	 * PACA_IRQ_HARD_DIS from the pending mask since we
+	 * are about to hard enable as well as a side effect
+	 * of entering the low power state.
 	 */
-	if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
-		u64 tmp, tmp2;
-		lv1_get_version_info(&tmp, &tmp2);
-	}
+	local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+	local_paca->soft_enabled = 1;
 
-	__hard_irq_enable();
+	/* Tell the caller to enter the low power state */
+	return true;
 }
-EXPORT_SYMBOL(arch_local_irq_restore);
+
 #endif /* CONFIG_PPC64 */
 
 int arch_show_interrupts(struct seq_file *p, int prec)
@@ -210,15 +354,20 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 
 	seq_printf(p, "%*s: ", prec, "LOC");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs);
-        seq_printf(p, "  Local timer interrupts\n");
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event);
+        seq_printf(p, "  Local timer interrupts for timer event device\n");
+
+	seq_printf(p, "%*s: ", prec, "LOC");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others);
+        seq_printf(p, "  Local timer interrupts for others\n");
 
 	seq_printf(p, "%*s: ", prec, "SPU");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs);
 	seq_printf(p, "  Spurious interrupts\n");
 
-	seq_printf(p, "%*s: ", prec, "CNT");
+	seq_printf(p, "%*s: ", prec, "PMI");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs);
 	seq_printf(p, "  Performance monitoring interrupts\n");
@@ -228,6 +377,15 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions);
 	seq_printf(p, "  Machine check exceptions\n");
 
+#ifdef CONFIG_PPC_DOORBELL
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		seq_printf(p, "%*s: ", prec, "DBL");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs);
+		seq_printf(p, "  Doorbell interrupts\n");
+	}
+#endif
+
 	return 0;
 }
 
@@ -236,11 +394,15 @@ int arch_show_interrupts(struct seq_file *p, int prec)
  */
 u64 arch_irq_stat_cpu(unsigned int cpu)
 {
-	u64 sum = per_cpu(irq_stat, cpu).timer_irqs;
+	u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event;
 
 	sum += per_cpu(irq_stat, cpu).pmu_irqs;
 	sum += per_cpu(irq_stat, cpu).mce_exceptions;
 	sum += per_cpu(irq_stat, cpu).spurious_irqs;
+	sum += per_cpu(irq_stat, cpu).timer_irqs_others;
+#ifdef CONFIG_PPC_DOORBELL
+	sum += per_cpu(irq_stat, cpu).doorbell_irqs;
+#endif
 
 	return sum;
 }
@@ -256,14 +418,10 @@ void migrate_irqs(void)
 
 	alloc_cpumask_var(&mask, GFP_KERNEL);
 
-	for_each_irq(irq) {
+	for_each_irq_desc(irq, desc) {
 		struct irq_data *data;
 		struct irq_chip *chip;
 
-		desc = irq_to_desc(irq);
-		if (!desc)
-			continue;
-
 		data = irq_desc_get_irq_data(desc);
 		if (irqd_is_per_cpu(data))
 			continue;
@@ -289,50 +447,6 @@ void migrate_irqs(void)
 }
 #endif
 
-static inline void handle_one_irq(unsigned int irq)
-{
-	struct thread_info *curtp, *irqtp;
-	unsigned long saved_sp_limit;
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-	if (!desc)
-		return;
-
-	/* Switch to the irq stack to handle this */
-	curtp = current_thread_info();
-	irqtp = hardirq_ctx[smp_processor_id()];
-
-	if (curtp == irqtp) {
-		/* We're already on the irq stack, just handle it */
-		desc->handle_irq(irq, desc);
-		return;
-	}
-
-	saved_sp_limit = current->thread.ksp_limit;
-
-	irqtp->task = curtp->task;
-	irqtp->flags = 0;
-
-	/* Copy the softirq bits in preempt_count so that the
-	 * softirq checks work in the hardirq context. */
-	irqtp->preempt_count = (irqtp->preempt_count & ~SOFTIRQ_MASK) |
-			       (curtp->preempt_count & SOFTIRQ_MASK);
-
-	current->thread.ksp_limit = (unsigned long)irqtp +
-		_ALIGN_UP(sizeof(struct thread_info), 16);
-
-	call_handle_irq(irq, desc, irqtp, desc->handle_irq);
-	current->thread.ksp_limit = saved_sp_limit;
-	irqtp->task = NULL;
-
-	/* Set any flag that may have been set on the
-	 * alternate stack
-	 */
-	if (irqtp->flags)
-		set_bits(irqtp->flags, &curtp->flags);
-}
-
 static inline void check_stack_overflow(void)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
@@ -349,37 +463,72 @@ static inline void check_stack_overflow(void)
 #endif
 }
 
-void do_IRQ(struct pt_regs *regs)
+void __do_irq(struct pt_regs *regs)
 {
-	struct pt_regs *old_regs = set_irq_regs(regs);
 	unsigned int irq;
 
-	trace_irq_entry(regs);
-
 	irq_enter();
 
+	trace_irq_entry(regs);
+
 	check_stack_overflow();
 
+	/*
+	 * Query the platform PIC for the interrupt & ack it.
+	 *
+	 * This will typically lower the interrupt line to the CPU
+	 */
 	irq = ppc_md.get_irq();
 
-	if (irq != NO_IRQ && irq != NO_IRQ_IGNORE)
-		handle_one_irq(irq);
-	else if (irq != NO_IRQ_IGNORE)
+	/* We can hard enable interrupts now to allow perf interrupts */
+	may_hard_irq_enable();
+
+	/* And finally process it */
+	if (unlikely(irq == NO_IRQ))
 		__get_cpu_var(irq_stat).spurious_irqs++;
+	else
+		generic_handle_irq(irq);
+
+	trace_irq_exit(regs);
 
 	irq_exit();
-	set_irq_regs(old_regs);
+}
+
+void do_IRQ(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct thread_info *curtp, *irqtp, *sirqtp;
 
-#ifdef CONFIG_PPC_ISERIES
-	if (firmware_has_feature(FW_FEATURE_ISERIES) &&
-			get_lppaca()->int_dword.fields.decr_int) {
-		get_lppaca()->int_dword.fields.decr_int = 0;
-		/* Signal a fake decrementer interrupt */
-		timer_interrupt(regs);
+	/* Switch to the irq stack to handle this */
+	curtp = current_thread_info();
+	irqtp = hardirq_ctx[raw_smp_processor_id()];
+	sirqtp = softirq_ctx[raw_smp_processor_id()];
+
+	/* Already there ? */
+	if (unlikely(curtp == irqtp || curtp == sirqtp)) {
+		__do_irq(regs);
+		set_irq_regs(old_regs);
+		return;
 	}
-#endif
 
-	trace_irq_exit(regs);
+	/* Prepare the thread_info in the irq stack */
+	irqtp->task = curtp->task;
+	irqtp->flags = 0;
+
+	/* Copy the preempt_count so that the [soft]irq checks work. */
+	irqtp->preempt_count = curtp->preempt_count;
+
+	/* Switch stack and call */
+	call_do_irq(regs, irqtp);
+
+	/* Restore stack limit */
+	irqtp->task = NULL;
+
+	/* Copy back updates to the thread_info */
+	if (irqtp->flags)
+		set_bits(irqtp->flags, &curtp->flags);
+
+	set_irq_regs(old_regs);
 }
 
 void __init init_IRQ(void)
@@ -406,8 +555,13 @@ void exc_lvl_ctx_init(void)
 #ifdef CONFIG_PPC64
 		cpu_nr = i;
 #else
+#ifdef CONFIG_SMP
 		cpu_nr = get_hard_smp_processor_id(i);
+#else
+		cpu_nr = 0;
+#endif
 #endif
+
 		memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
 		tp = critirq_ctx[cpu_nr];
 		tp->cpu = cpu_nr;
@@ -440,28 +594,22 @@ void irq_ctx_init(void)
 		memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
 		tp = softirq_ctx[i];
 		tp->cpu = i;
-		tp->preempt_count = 0;
 
 		memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
 		tp = hardirq_ctx[i];
 		tp->cpu = i;
-		tp->preempt_count = HARDIRQ_OFFSET;
 	}
 }
 
-static inline void do_softirq_onstack(void)
+void do_softirq_own_stack(void)
 {
 	struct thread_info *curtp, *irqtp;
-	unsigned long saved_sp_limit = current->thread.ksp_limit;
 
 	curtp = current_thread_info();
 	irqtp = softirq_ctx[smp_processor_id()];
 	irqtp->task = curtp->task;
 	irqtp->flags = 0;
-	current->thread.ksp_limit = (unsigned long)irqtp +
-				    _ALIGN_UP(sizeof(struct thread_info), 16);
 	call_do_softirq(irqtp);
-	current->thread.ksp_limit = saved_sp_limit;
 	irqtp->task = NULL;
 
 	/* Set any flag that may have been set on the
@@ -471,430 +619,19 @@ static inline void do_softirq_onstack(void)
 		set_bits(irqtp->flags, &curtp->flags);
 }
 
-void do_softirq(void)
-{
-	unsigned long flags;
-
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-
-	if (local_softirq_pending())
-		do_softirq_onstack();
-
-	local_irq_restore(flags);
-}
-
-
-/*
- * IRQ controller and virtual interrupts
- */
-
-/* The main irq map itself is an array of NR_IRQ entries containing the
- * associate host and irq number. An entry with a host of NULL is free.
- * An entry can be allocated if it's free, the allocator always then sets
- * hwirq first to the host's invalid irq number and then fills ops.
- */
-struct irq_map_entry {
-	irq_hw_number_t	hwirq;
-	struct irq_host	*host;
-};
-
-static LIST_HEAD(irq_hosts);
-static DEFINE_RAW_SPINLOCK(irq_big_lock);
-static DEFINE_MUTEX(revmap_trees_mutex);
-static struct irq_map_entry irq_map[NR_IRQS];
-static unsigned int irq_virq_count = NR_IRQS;
-static struct irq_host *irq_default_host;
-
-irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
-{
-	return irq_map[d->irq].hwirq;
-}
-EXPORT_SYMBOL_GPL(irqd_to_hwirq);
-
 irq_hw_number_t virq_to_hw(unsigned int virq)
 {
-	return irq_map[virq].hwirq;
+	struct irq_data *irq_data = irq_get_irq_data(virq);
+	return WARN_ON(!irq_data) ? 0 : irq_data->hwirq;
 }
 EXPORT_SYMBOL_GPL(virq_to_hw);
 
-bool virq_is_host(unsigned int virq, struct irq_host *host)
-{
-	return irq_map[virq].host == host;
-}
-EXPORT_SYMBOL_GPL(virq_is_host);
-
-static int default_irq_host_match(struct irq_host *h, struct device_node *np)
-{
-	return h->of_node != NULL && h->of_node == np;
-}
-
-struct irq_host *irq_alloc_host(struct device_node *of_node,
-				unsigned int revmap_type,
-				unsigned int revmap_arg,
-				struct irq_host_ops *ops,
-				irq_hw_number_t inval_irq)
-{
-	struct irq_host *host;
-	unsigned int size = sizeof(struct irq_host);
-	unsigned int i;
-	unsigned int *rmap;
-	unsigned long flags;
-
-	/* Allocate structure and revmap table if using linear mapping */
-	if (revmap_type == IRQ_HOST_MAP_LINEAR)
-		size += revmap_arg * sizeof(unsigned int);
-	host = kzalloc(size, GFP_KERNEL);
-	if (host == NULL)
-		return NULL;
-
-	/* Fill structure */
-	host->revmap_type = revmap_type;
-	host->inval_irq = inval_irq;
-	host->ops = ops;
-	host->of_node = of_node_get(of_node);
-
-	if (host->ops->match == NULL)
-		host->ops->match = default_irq_host_match;
-
-	raw_spin_lock_irqsave(&irq_big_lock, flags);
-
-	/* If it's a legacy controller, check for duplicates and
-	 * mark it as allocated (we use irq 0 host pointer for that
-	 */
-	if (revmap_type == IRQ_HOST_MAP_LEGACY) {
-		if (irq_map[0].host != NULL) {
-			raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-			of_node_put(host->of_node);
-			kfree(host);
-			return NULL;
-		}
-		irq_map[0].host = host;
-	}
-
-	list_add(&host->link, &irq_hosts);
-	raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-
-	/* Additional setups per revmap type */
-	switch(revmap_type) {
-	case IRQ_HOST_MAP_LEGACY:
-		/* 0 is always the invalid number for legacy */
-		host->inval_irq = 0;
-		/* setup us as the host for all legacy interrupts */
-		for (i = 1; i < NUM_ISA_INTERRUPTS; i++) {
-			irq_map[i].hwirq = i;
-			smp_wmb();
-			irq_map[i].host = host;
-			smp_wmb();
-
-			/* Legacy flags are left to default at this point,
-			 * one can then use irq_create_mapping() to
-			 * explicitly change them
-			 */
-			ops->map(host, i, i);
-
-			/* Clear norequest flags */
-			irq_clear_status_flags(i, IRQ_NOREQUEST);
-		}
-		break;
-	case IRQ_HOST_MAP_LINEAR:
-		rmap = (unsigned int *)(host + 1);
-		for (i = 0; i < revmap_arg; i++)
-			rmap[i] = NO_IRQ;
-		host->revmap_data.linear.size = revmap_arg;
-		smp_wmb();
-		host->revmap_data.linear.revmap = rmap;
-		break;
-	case IRQ_HOST_MAP_TREE:
-		INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL);
-		break;
-	default:
-		break;
-	}
-
-	pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host);
-
-	return host;
-}
-
-struct irq_host *irq_find_host(struct device_node *node)
-{
-	struct irq_host *h, *found = NULL;
-	unsigned long flags;
-
-	/* We might want to match the legacy controller last since
-	 * it might potentially be set to match all interrupts in
-	 * the absence of a device node. This isn't a problem so far
-	 * yet though...
-	 */
-	raw_spin_lock_irqsave(&irq_big_lock, flags);
-	list_for_each_entry(h, &irq_hosts, link)
-		if (h->ops->match(h, node)) {
-			found = h;
-			break;
-		}
-	raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-	return found;
-}
-EXPORT_SYMBOL_GPL(irq_find_host);
-
-void irq_set_default_host(struct irq_host *host)
-{
-	pr_debug("irq: Default host set to @0x%p\n", host);
-
-	irq_default_host = host;
-}
-
-void irq_set_virq_count(unsigned int count)
-{
-	pr_debug("irq: Trying to set virq count to %d\n", count);
-
-	BUG_ON(count < NUM_ISA_INTERRUPTS);
-	if (count < NR_IRQS)
-		irq_virq_count = count;
-}
-
-static int irq_setup_virq(struct irq_host *host, unsigned int virq,
-			    irq_hw_number_t hwirq)
-{
-	int res;
-
-	res = irq_alloc_desc_at(virq, 0);
-	if (res != virq) {
-		pr_debug("irq: -> allocating desc failed\n");
-		goto error;
-	}
-
-	/* map it */
-	smp_wmb();
-	irq_map[virq].hwirq = hwirq;
-	smp_mb();
-
-	if (host->ops->map(host, virq, hwirq)) {
-		pr_debug("irq: -> mapping failed, freeing\n");
-		goto errdesc;
-	}
-
-	irq_clear_status_flags(virq, IRQ_NOREQUEST);
-
-	return 0;
-
-errdesc:
-	irq_free_descs(virq, 1);
-error:
-	irq_free_virt(virq, 1);
-	return -1;
-}
-
-unsigned int irq_create_direct_mapping(struct irq_host *host)
-{
-	unsigned int virq;
-
-	if (host == NULL)
-		host = irq_default_host;
-
-	BUG_ON(host == NULL);
-	WARN_ON(host->revmap_type != IRQ_HOST_MAP_NOMAP);
-
-	virq = irq_alloc_virt(host, 1, 0);
-	if (virq == NO_IRQ) {
-		pr_debug("irq: create_direct virq allocation failed\n");
-		return NO_IRQ;
-	}
-
-	pr_debug("irq: create_direct obtained virq %d\n", virq);
-
-	if (irq_setup_virq(host, virq, virq))
-		return NO_IRQ;
-
-	return virq;
-}
-
-unsigned int irq_create_mapping(struct irq_host *host,
-				irq_hw_number_t hwirq)
-{
-	unsigned int virq, hint;
-
-	pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq);
-
-	/* Look for default host if nececssary */
-	if (host == NULL)
-		host = irq_default_host;
-	if (host == NULL) {
-		printk(KERN_WARNING "irq_create_mapping called for"
-		       " NULL host, hwirq=%lx\n", hwirq);
-		WARN_ON(1);
-		return NO_IRQ;
-	}
-	pr_debug("irq: -> using host @%p\n", host);
-
-	/* Check if mapping already exists */
-	virq = irq_find_mapping(host, hwirq);
-	if (virq != NO_IRQ) {
-		pr_debug("irq: -> existing mapping on virq %d\n", virq);
-		return virq;
-	}
-
-	/* Get a virtual interrupt number */
-	if (host->revmap_type == IRQ_HOST_MAP_LEGACY) {
-		/* Handle legacy */
-		virq = (unsigned int)hwirq;
-		if (virq == 0 || virq >= NUM_ISA_INTERRUPTS)
-			return NO_IRQ;
-		return virq;
-	} else {
-		/* Allocate a virtual interrupt number */
-		hint = hwirq % irq_virq_count;
-		virq = irq_alloc_virt(host, 1, hint);
-		if (virq == NO_IRQ) {
-			pr_debug("irq: -> virq allocation failed\n");
-			return NO_IRQ;
-		}
-	}
-
-	if (irq_setup_virq(host, virq, hwirq))
-		return NO_IRQ;
-
-	pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n",
-		hwirq, host->of_node ? host->of_node->full_name : "null", virq);
-
-	return virq;
-}
-EXPORT_SYMBOL_GPL(irq_create_mapping);
-
-unsigned int irq_create_of_mapping(struct device_node *controller,
-				   const u32 *intspec, unsigned int intsize)
-{
-	struct irq_host *host;
-	irq_hw_number_t hwirq;
-	unsigned int type = IRQ_TYPE_NONE;
-	unsigned int virq;
-
-	if (controller == NULL)
-		host = irq_default_host;
-	else
-		host = irq_find_host(controller);
-	if (host == NULL) {
-		printk(KERN_WARNING "irq: no irq host found for %s !\n",
-		       controller->full_name);
-		return NO_IRQ;
-	}
-
-	/* If host has no translation, then we assume interrupt line */
-	if (host->ops->xlate == NULL)
-		hwirq = intspec[0];
-	else {
-		if (host->ops->xlate(host, controller, intspec, intsize,
-				     &hwirq, &type))
-			return NO_IRQ;
-	}
-
-	/* Create mapping */
-	virq = irq_create_mapping(host, hwirq);
-	if (virq == NO_IRQ)
-		return virq;
-
-	/* Set type if specified and different than the current one */
-	if (type != IRQ_TYPE_NONE &&
-	    type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
-		irq_set_irq_type(virq, type);
-	return virq;
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-
-void irq_dispose_mapping(unsigned int virq)
-{
-	struct irq_host *host;
-	irq_hw_number_t hwirq;
-
-	if (virq == NO_IRQ)
-		return;
-
-	host = irq_map[virq].host;
-	if (WARN_ON(host == NULL))
-		return;
-
-	/* Never unmap legacy interrupts */
-	if (host->revmap_type == IRQ_HOST_MAP_LEGACY)
-		return;
-
-	irq_set_status_flags(virq, IRQ_NOREQUEST);
-
-	/* remove chip and handler */
-	irq_set_chip_and_handler(virq, NULL, NULL);
-
-	/* Make sure it's completed */
-	synchronize_irq(virq);
-
-	/* Tell the PIC about it */
-	if (host->ops->unmap)
-		host->ops->unmap(host, virq);
-	smp_mb();
-
-	/* Clear reverse map */
-	hwirq = irq_map[virq].hwirq;
-	switch(host->revmap_type) {
-	case IRQ_HOST_MAP_LINEAR:
-		if (hwirq < host->revmap_data.linear.size)
-			host->revmap_data.linear.revmap[hwirq] = NO_IRQ;
-		break;
-	case IRQ_HOST_MAP_TREE:
-		mutex_lock(&revmap_trees_mutex);
-		radix_tree_delete(&host->revmap_data.tree, hwirq);
-		mutex_unlock(&revmap_trees_mutex);
-		break;
-	}
-
-	/* Destroy map */
-	smp_mb();
-	irq_map[virq].hwirq = host->inval_irq;
-
-	irq_free_descs(virq, 1);
-	/* Free it */
-	irq_free_virt(virq, 1);
-}
-EXPORT_SYMBOL_GPL(irq_dispose_mapping);
-
-unsigned int irq_find_mapping(struct irq_host *host,
-			      irq_hw_number_t hwirq)
-{
-	unsigned int i;
-	unsigned int hint = hwirq % irq_virq_count;
-
-	/* Look for default host if nececssary */
-	if (host == NULL)
-		host = irq_default_host;
-	if (host == NULL)
-		return NO_IRQ;
-
-	/* legacy -> bail early */
-	if (host->revmap_type == IRQ_HOST_MAP_LEGACY)
-		return hwirq;
-
-	/* Slow path does a linear search of the map */
-	if (hint < NUM_ISA_INTERRUPTS)
-		hint = NUM_ISA_INTERRUPTS;
-	i = hint;
-	do  {
-		if (irq_map[i].host == host &&
-		    irq_map[i].hwirq == hwirq)
-			return i;
-		i++;
-		if (i >= irq_virq_count)
-			i = NUM_ISA_INTERRUPTS;
-	} while(i != hint);
-	return NO_IRQ;
-}
-EXPORT_SYMBOL_GPL(irq_find_mapping);
-
 #ifdef CONFIG_SMP
 int irq_choose_cpu(const struct cpumask *mask)
 {
 	int cpuid;
 
-	if (cpumask_equal(mask, cpu_all_mask)) {
+	if (cpumask_equal(mask, cpu_online_mask)) {
 		static int irq_rover;
 		static DEFINE_RAW_SPINLOCK(irq_rover_lock);
 		unsigned long flags;
@@ -925,232 +662,11 @@ int irq_choose_cpu(const struct cpumask *mask)
 }
 #endif
 
-unsigned int irq_radix_revmap_lookup(struct irq_host *host,
-				     irq_hw_number_t hwirq)
-{
-	struct irq_map_entry *ptr;
-	unsigned int virq;
-
-	if (WARN_ON_ONCE(host->revmap_type != IRQ_HOST_MAP_TREE))
-		return irq_find_mapping(host, hwirq);
-
-	/*
-	 * The ptr returned references the static global irq_map.
-	 * but freeing an irq can delete nodes along the path to
-	 * do the lookup via call_rcu.
-	 */
-	rcu_read_lock();
-	ptr = radix_tree_lookup(&host->revmap_data.tree, hwirq);
-	rcu_read_unlock();
-
-	/*
-	 * If found in radix tree, then fine.
-	 * Else fallback to linear lookup - this should not happen in practice
-	 * as it means that we failed to insert the node in the radix tree.
-	 */
-	if (ptr)
-		virq = ptr - irq_map;
-	else
-		virq = irq_find_mapping(host, hwirq);
-
-	return virq;
-}
-
-void irq_radix_revmap_insert(struct irq_host *host, unsigned int virq,
-			     irq_hw_number_t hwirq)
-{
-	if (WARN_ON(host->revmap_type != IRQ_HOST_MAP_TREE))
-		return;
-
-	if (virq != NO_IRQ) {
-		mutex_lock(&revmap_trees_mutex);
-		radix_tree_insert(&host->revmap_data.tree, hwirq,
-				  &irq_map[virq]);
-		mutex_unlock(&revmap_trees_mutex);
-	}
-}
-
-unsigned int irq_linear_revmap(struct irq_host *host,
-			       irq_hw_number_t hwirq)
-{
-	unsigned int *revmap;
-
-	if (WARN_ON_ONCE(host->revmap_type != IRQ_HOST_MAP_LINEAR))
-		return irq_find_mapping(host, hwirq);
-
-	/* Check revmap bounds */
-	if (unlikely(hwirq >= host->revmap_data.linear.size))
-		return irq_find_mapping(host, hwirq);
-
-	/* Check if revmap was allocated */
-	revmap = host->revmap_data.linear.revmap;
-	if (unlikely(revmap == NULL))
-		return irq_find_mapping(host, hwirq);
-
-	/* Fill up revmap with slow path if no mapping found */
-	if (unlikely(revmap[hwirq] == NO_IRQ))
-		revmap[hwirq] = irq_find_mapping(host, hwirq);
-
-	return revmap[hwirq];
-}
-
-unsigned int irq_alloc_virt(struct irq_host *host,
-			    unsigned int count,
-			    unsigned int hint)
-{
-	unsigned long flags;
-	unsigned int i, j, found = NO_IRQ;
-
-	if (count == 0 || count > (irq_virq_count - NUM_ISA_INTERRUPTS))
-		return NO_IRQ;
-
-	raw_spin_lock_irqsave(&irq_big_lock, flags);
-
-	/* Use hint for 1 interrupt if any */
-	if (count == 1 && hint >= NUM_ISA_INTERRUPTS &&
-	    hint < irq_virq_count && irq_map[hint].host == NULL) {
-		found = hint;
-		goto hint_found;
-	}
-
-	/* Look for count consecutive numbers in the allocatable
-	 * (non-legacy) space
-	 */
-	for (i = NUM_ISA_INTERRUPTS, j = 0; i < irq_virq_count; i++) {
-		if (irq_map[i].host != NULL)
-			j = 0;
-		else
-			j++;
-
-		if (j == count) {
-			found = i - count + 1;
-			break;
-		}
-	}
-	if (found == NO_IRQ) {
-		raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-		return NO_IRQ;
-	}
- hint_found:
-	for (i = found; i < (found + count); i++) {
-		irq_map[i].hwirq = host->inval_irq;
-		smp_wmb();
-		irq_map[i].host = host;
-	}
-	raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-	return found;
-}
-
-void irq_free_virt(unsigned int virq, unsigned int count)
-{
-	unsigned long flags;
-	unsigned int i;
-
-	WARN_ON (virq < NUM_ISA_INTERRUPTS);
-	WARN_ON (count == 0 || (virq + count) > irq_virq_count);
-
-	if (virq < NUM_ISA_INTERRUPTS) {
-		if (virq + count < NUM_ISA_INTERRUPTS)
-			return;
-		count  =- NUM_ISA_INTERRUPTS - virq;
-		virq = NUM_ISA_INTERRUPTS;
-	}
-
-	if (count > irq_virq_count || virq > irq_virq_count - count) {
-		if (virq > irq_virq_count)
-			return;
-		count = irq_virq_count - virq;
-	}
-
-	raw_spin_lock_irqsave(&irq_big_lock, flags);
-	for (i = virq; i < (virq + count); i++) {
-		struct irq_host *host;
-
-		host = irq_map[i].host;
-		irq_map[i].hwirq = host->inval_irq;
-		smp_wmb();
-		irq_map[i].host = NULL;
-	}
-	raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-}
-
 int arch_early_irq_init(void)
 {
 	return 0;
 }
 
-#ifdef CONFIG_VIRQ_DEBUG
-static int virq_debug_show(struct seq_file *m, void *private)
-{
-	unsigned long flags;
-	struct irq_desc *desc;
-	const char *p;
-	static const char none[] = "none";
-	void *data;
-	int i;
-
-	seq_printf(m, "%-5s  %-7s  %-15s  %-18s  %s\n", "virq", "hwirq",
-		      "chip name", "chip data", "host name");
-
-	for (i = 1; i < nr_irqs; i++) {
-		desc = irq_to_desc(i);
-		if (!desc)
-			continue;
-
-		raw_spin_lock_irqsave(&desc->lock, flags);
-
-		if (desc->action && desc->action->handler) {
-			struct irq_chip *chip;
-
-			seq_printf(m, "%5d  ", i);
-			seq_printf(m, "0x%05lx  ", irq_map[i].hwirq);
-
-			chip = irq_desc_get_chip(desc);
-			if (chip && chip->name)
-				p = chip->name;
-			else
-				p = none;
-			seq_printf(m, "%-15s  ", p);
-
-			data = irq_desc_get_chip_data(desc);
-			seq_printf(m, "0x%16p  ", data);
-
-			if (irq_map[i].host && irq_map[i].host->of_node)
-				p = irq_map[i].host->of_node->full_name;
-			else
-				p = none;
-			seq_printf(m, "%s\n", p);
-		}
-
-		raw_spin_unlock_irqrestore(&desc->lock, flags);
-	}
-
-	return 0;
-}
-
-static int virq_debug_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, virq_debug_show, inode->i_private);
-}
-
-static const struct file_operations virq_debug_fops = {
-	.open = virq_debug_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static int __init irq_debugfs_init(void)
-{
-	if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root,
-				 NULL, &virq_debug_fops) == NULL)
-		return -ENOMEM;
-
-	return 0;
-}
-__initcall(irq_debugfs_init);
-#endif /* CONFIG_VIRQ_DEBUG */
-
 #ifdef CONFIG_PPC64
 static int __init setup_noirqdistrib(char *str)
 {
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 479752901ec..0f199709796 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -29,7 +29,6 @@
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/ppc-pci.h>
-#include <asm/firmware.h>
 
 unsigned long isa_io_base;	/* NULL if no ISA bus */
 EXPORT_SYMBOL(isa_io_base);
@@ -42,8 +41,8 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
 #define ISA_SPACE_MASK 0x1
 #define ISA_SPACE_IO 0x1
 
-static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node,
-						unsigned long phb_io_base_phys)
+static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
+				      unsigned long phb_io_base_phys)
 {
 	/* We should get some saner parsing here and remove these structs */
 	struct pci_address {
@@ -171,8 +170,8 @@ void __init isa_bridge_find_early(struct pci_controller *hose)
  * isa_bridge_find_late - Find and map the ISA IO space upon discovery of
  *                        a new ISA bridge
  */
-static void __devinit isa_bridge_find_late(struct pci_dev *pdev,
-					   struct device_node *devnode)
+static void isa_bridge_find_late(struct pci_dev *pdev,
+				 struct device_node *devnode)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 
@@ -216,8 +215,8 @@ static void isa_bridge_remove(void)
 /**
  * isa_bridge_notify - Get notified of PCI devices addition/removal
  */
-static int __devinit isa_bridge_notify(struct notifier_block *nb,
-				       unsigned long action, void *data)
+static int isa_bridge_notify(struct notifier_block *nb, unsigned long action,
+			     void *data)
 {
 	struct device *dev = data;
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -261,8 +260,6 @@ static struct notifier_block isa_bridge_notifier = {
  */
 static int __init isa_bridge_init(void)
 {
-	if (firmware_has_feature(FW_FEATURE_ISERIES))
-		return 0;
 	bus_register_notifier(&pci_bus_type, &isa_bridge_notifier);
 	return 0;
 }
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 76a6e40a6f7..8504657379f 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/kgdb.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
@@ -24,6 +23,8 @@
 #include <asm/current.h>
 #include <asm/processor.h>
 #include <asm/machdep.h>
+#include <asm/debug.h>
+#include <linux/slab.h>
 
 /*
  * This table contains the mapping between PowerPC hardware trap types, and
@@ -100,6 +101,21 @@ static int computeSignal(unsigned int tt)
 	return SIGHUP;		/* default for things we don't know about */
 }
 
+/**
+ *
+ *	kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ *	@exception: Exception vector number
+ *	@regs: Current &struct pt_regs.
+ *
+ *	On some architectures we need to skip a breakpoint exception when
+ *	it occurs after a breakpoint has been removed.
+ *
+ */
+int kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+	return kgdb_isremovedbreak(regs->nip);
+}
+
 static int kgdb_call_nmi_hook(struct pt_regs *regs)
 {
 	kgdb_nmicallback(raw_smp_processor_id(), regs);
@@ -134,9 +150,12 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
 	return 1;
 }
 
+static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
 static int kgdb_singlestep(struct pt_regs *regs)
 {
 	struct thread_info *thread_info, *exception_thread_info;
+	struct thread_info *backup_current_thread_info =
+		&__get_cpu_var(kgdb_thread_info);
 
 	if (user_mode(regs))
 		return 0;
@@ -154,13 +173,17 @@ static int kgdb_singlestep(struct pt_regs *regs)
 	thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
 	exception_thread_info = current_thread_info();
 
-	if (thread_info != exception_thread_info)
+	if (thread_info != exception_thread_info) {
+		/* Save the original current_thread_info. */
+		memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info);
 		memcpy(exception_thread_info, thread_info, sizeof *thread_info);
+	}
 
 	kgdb_handle_exception(0, SIGTRAP, 0, regs);
 
 	if (thread_info != exception_thread_info)
-		memcpy(thread_info, exception_thread_info, sizeof *thread_info);
+		/* Restore current_thread_info lastly. */
+		memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
 
 	return 1;
 }
@@ -175,7 +198,7 @@ static int kgdb_iabr_match(struct pt_regs *regs)
 	return 1;
 }
 
-static int kgdb_dabr_match(struct pt_regs *regs)
+static int kgdb_break_match(struct pt_regs *regs)
 {
 	if (user_mode(regs))
 		return 0;
@@ -409,7 +432,6 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
 #else
 			linux_regs->msr |= MSR_SE;
 #endif
-			kgdb_single_step = 1;
 			atomic_set(&kgdb_cpu_doing_single_step,
 				   raw_smp_processor_id());
 		}
@@ -436,7 +458,7 @@ static void *old__debugger;
 static void *old__debugger_bpt;
 static void *old__debugger_sstep;
 static void *old__debugger_iabr_match;
-static void *old__debugger_dabr_match;
+static void *old__debugger_break_match;
 static void *old__debugger_fault_handler;
 
 int kgdb_arch_init(void)
@@ -446,7 +468,7 @@ int kgdb_arch_init(void)
 	old__debugger_bpt = __debugger_bpt;
 	old__debugger_sstep = __debugger_sstep;
 	old__debugger_iabr_match = __debugger_iabr_match;
-	old__debugger_dabr_match = __debugger_dabr_match;
+	old__debugger_break_match = __debugger_break_match;
 	old__debugger_fault_handler = __debugger_fault_handler;
 
 	__debugger_ipi = kgdb_call_nmi_hook;
@@ -454,7 +476,7 @@ int kgdb_arch_init(void)
 	__debugger_bpt = kgdb_handle_breakpoint;
 	__debugger_sstep = kgdb_singlestep;
 	__debugger_iabr_match = kgdb_iabr_match;
-	__debugger_dabr_match = kgdb_dabr_match;
+	__debugger_break_match = kgdb_break_match;
 	__debugger_fault_handler = kgdb_not_implemented;
 
 	return 0;
@@ -467,6 +489,6 @@ void kgdb_arch_exit(void)
 	__debugger_bpt = old__debugger_bpt;
 	__debugger_sstep = old__debugger_sstep;
 	__debugger_iabr_match = old__debugger_iabr_match;
-	__debugger_dabr_match = old__debugger_dabr_match;
+	__debugger_break_match = old__debugger_break_match;
 	__debugger_fault_handler = old__debugger_fault_handler;
 }
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bc47352deb1..2f72af82513 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -32,16 +32,10 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/slab.h>
+#include <asm/code-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/sstep.h>
 #include <asm/uaccess.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-#define MSR_SINGLESTEP	(MSR_DE)
-#else
-#define MSR_SINGLESTEP	(MSR_SE)
-#endif
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -105,19 +99,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
 
 static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
-	/* We turn off async exceptions to ensure that the single step will
-	 * be for the instruction we have the kprobe on, if we dont its
-	 * possible we'd get the single step reported for an exception handler
-	 * like Decrementer or External Interrupt */
-	regs->msr &= ~MSR_EE;
-	regs->msr |= MSR_SINGLESTEP;
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-	regs->msr &= ~MSR_CE;
-	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
-#ifdef CONFIG_PPC_47x
-	isync();
-#endif
-#endif
+	enable_single_step(regs);
 
 	/*
 	 * On powerpc we should single step on the original
@@ -311,7 +293,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 {
 	struct kretprobe_instance *ri = NULL;
 	struct hlist_head *head, empty_rp;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *tmp;
 	unsigned long flags, orig_ret_address = 0;
 	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
 
@@ -331,7 +313,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	 *       real return address, and all the rest will point to
 	 *       kretprobe_trampoline
 	 */
-	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
 		if (ri->task != current)
 			/* another task is sharing our hash bucket */
 			continue;
@@ -358,7 +340,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
-	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
 		kfree(ri);
 	}
@@ -448,7 +430,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 	case KPROBE_HIT_SSDONE:
 		/*
 		 * We increment the nmissed count for accounting,
-		 * we can also use npre/npostfault count for accouting
+		 * we can also use npre/npostfault count for accounting
 		 * these specific fault cases.
 		 */
 		kprobes_inc_nmissed_count(cur);
@@ -510,12 +492,10 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	return ret;
 }
 
-#ifdef CONFIG_PPC64
 unsigned long arch_deref_entry_point(void *entry)
 {
-	return ((func_descr_t *)entry)->entry;
+	return ppc_global_function_entry(entry);
 }
-#endif
 
 int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
@@ -527,8 +507,12 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	/* setup return addr to the jprobe handler routine */
 	regs->nip = arch_deref_entry_point(jp->entry);
 #ifdef CONFIG_PPC64
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+	regs->gpr[12] = (unsigned long)jp->entry;
+#else
 	regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc);
 #endif
+#endif
 
 	return 1;
 }
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 2985338d0e1..33aa4ddf597 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
  *
  * Authors:
  *     Alexander Graf <agraf@suse.de>
@@ -29,6 +30,8 @@
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include <asm/epapr_hcalls.h>
 
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -41,41 +44,37 @@
 #define KVM_INST_B		0x48000000
 #define KVM_INST_B_MASK		0x03ffffff
 #define KVM_INST_B_MAX		0x01ffffff
+#define KVM_INST_LI		0x38000000
 
 #define KVM_MASK_RT		0x03e00000
 #define KVM_RT_30		0x03c00000
 #define KVM_MASK_RB		0x0000f800
 #define KVM_INST_MFMSR		0x7c0000a6
-#define KVM_INST_MFSPR_SPRG0	0x7c1042a6
-#define KVM_INST_MFSPR_SPRG1	0x7c1142a6
-#define KVM_INST_MFSPR_SPRG2	0x7c1242a6
-#define KVM_INST_MFSPR_SPRG3	0x7c1342a6
-#define KVM_INST_MFSPR_SRR0	0x7c1a02a6
-#define KVM_INST_MFSPR_SRR1	0x7c1b02a6
-#define KVM_INST_MFSPR_DAR	0x7c1302a6
-#define KVM_INST_MFSPR_DSISR	0x7c1202a6
-
-#define KVM_INST_MTSPR_SPRG0	0x7c1043a6
-#define KVM_INST_MTSPR_SPRG1	0x7c1143a6
-#define KVM_INST_MTSPR_SPRG2	0x7c1243a6
-#define KVM_INST_MTSPR_SPRG3	0x7c1343a6
-#define KVM_INST_MTSPR_SRR0	0x7c1a03a6
-#define KVM_INST_MTSPR_SRR1	0x7c1b03a6
-#define KVM_INST_MTSPR_DAR	0x7c1303a6
-#define KVM_INST_MTSPR_DSISR	0x7c1203a6
+
+#define SPR_FROM		0
+#define SPR_TO			0x100
+
+#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \
+				    (((sprn) & 0x1f) << 16) | \
+				    (((sprn) & 0x3e0) << 6) | \
+				    (moveto))
+
+#define KVM_INST_MFSPR(sprn)	KVM_INST_SPR(sprn, SPR_FROM)
+#define KVM_INST_MTSPR(sprn)	KVM_INST_SPR(sprn, SPR_TO)
 
 #define KVM_INST_TLBSYNC	0x7c00046c
 #define KVM_INST_MTMSRD_L0	0x7c000164
 #define KVM_INST_MTMSRD_L1	0x7c010164
 #define KVM_INST_MTMSR		0x7c000124
 
+#define KVM_INST_WRTEE		0x7c000106
 #define KVM_INST_WRTEEI_0	0x7c000146
 #define KVM_INST_WRTEEI_1	0x7c008146
 
 #define KVM_INST_MTSRIN		0x7c0001e4
 
 static bool kvm_patching_worked = true;
-static char kvm_tmp[1024 * 1024];
+char kvm_tmp[1024 * 1024];
 static int kvm_tmp_index;
 
 static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
@@ -270,26 +269,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 
 #ifdef CONFIG_BOOKE
 
-extern u32 kvm_emulate_wrteei_branch_offs;
-extern u32 kvm_emulate_wrteei_ee_offs;
-extern u32 kvm_emulate_wrteei_len;
-extern u32 kvm_emulate_wrteei[];
+extern u32 kvm_emulate_wrtee_branch_offs;
+extern u32 kvm_emulate_wrtee_reg_offs;
+extern u32 kvm_emulate_wrtee_orig_ins_offs;
+extern u32 kvm_emulate_wrtee_len;
+extern u32 kvm_emulate_wrtee[];
 
-static void kvm_patch_ins_wrteei(u32 *inst)
+static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
 {
 	u32 *p;
 	int distance_start;
 	int distance_end;
 	ulong next_inst;
 
-	p = kvm_alloc(kvm_emulate_wrteei_len * 4);
+	p = kvm_alloc(kvm_emulate_wrtee_len * 4);
 	if (!p)
 		return;
 
 	/* Find out where we are and put everything there */
 	distance_start = (ulong)p - (ulong)inst;
 	next_inst = ((ulong)inst + 4);
-	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs];
 
 	/* Make sure we only write valid b instructions */
 	if (distance_start > KVM_INST_B_MAX) {
@@ -298,10 +298,65 @@ static void kvm_patch_ins_wrteei(u32 *inst)
 	}
 
 	/* Modify the chunk to fit the invocation */
-	memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
-	p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
-	p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
-	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
+	memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4);
+	p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK;
+
+	if (imm_one) {
+		p[kvm_emulate_wrtee_reg_offs] =
+			KVM_INST_LI | __PPC_RT(R30) | MSR_EE;
+	} else {
+		/* Make clobbered registers work too */
+		switch (get_rt(rt)) {
+		case 30:
+			kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+					 magic_var(scratch2), KVM_RT_30);
+			break;
+		case 31:
+			kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+					 magic_var(scratch1), KVM_RT_30);
+			break;
+		default:
+			p[kvm_emulate_wrtee_reg_offs] |= rt;
+			break;
+		}
+	}
+
+	p[kvm_emulate_wrtee_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+extern u32 kvm_emulate_wrteei_0_branch_offs;
+extern u32 kvm_emulate_wrteei_0_len;
+extern u32 kvm_emulate_wrteei_0[];
+
+static void kvm_patch_ins_wrteei_0(u32 *inst)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrteei_0_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4);
+	p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4);
 
 	/* Patch the invocation */
 	kvm_patch_ins_b(inst, distance_start);
@@ -358,13 +413,13 @@ static void kvm_map_magic_page(void *data)
 {
 	u32 *features = data;
 
-	ulong in[8];
+	ulong in[8] = {0};
 	ulong out[8];
 
 	in[0] = KVM_MAGIC_PAGE;
-	in[1] = KVM_MAGIC_PAGE;
+	in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX;
 
-	kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+	epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
 
 	*features = out[0];
 }
@@ -380,56 +435,191 @@ static void kvm_check_ins(u32 *inst, u32 features)
 	case KVM_INST_MFMSR:
 		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
 		break;
-	case KVM_INST_MFSPR_SPRG0:
+	case KVM_INST_MFSPR(SPRN_SPRG0):
 		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
 		break;
-	case KVM_INST_MFSPR_SPRG1:
+	case KVM_INST_MFSPR(SPRN_SPRG1):
 		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
 		break;
-	case KVM_INST_MFSPR_SPRG2:
+	case KVM_INST_MFSPR(SPRN_SPRG2):
 		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
 		break;
-	case KVM_INST_MFSPR_SPRG3:
+	case KVM_INST_MFSPR(SPRN_SPRG3):
 		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
 		break;
-	case KVM_INST_MFSPR_SRR0:
+	case KVM_INST_MFSPR(SPRN_SRR0):
 		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
 		break;
-	case KVM_INST_MFSPR_SRR1:
+	case KVM_INST_MFSPR(SPRN_SRR1):
 		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
 		break;
-	case KVM_INST_MFSPR_DAR:
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_DEAR):
+#else
+	case KVM_INST_MFSPR(SPRN_DAR):
+#endif
 		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
 		break;
-	case KVM_INST_MFSPR_DSISR:
+	case KVM_INST_MFSPR(SPRN_DSISR):
 		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
 		break;
 
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	case KVM_INST_MFSPR(SPRN_MAS0):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS1):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS2):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS3):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt);
+		break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+	case KVM_INST_MFSPR(SPRN_SPRG4):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG4R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG5):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG5R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG6):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG6R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG7):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG7R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt);
+		break;
+
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_ESR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt);
+		break;
+#endif
+
+	case KVM_INST_MFSPR(SPRN_PIR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt);
+		break;
+
+
 	/* Stores */
-	case KVM_INST_MTSPR_SPRG0:
+	case KVM_INST_MTSPR(SPRN_SPRG0):
 		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
 		break;
-	case KVM_INST_MTSPR_SPRG1:
+	case KVM_INST_MTSPR(SPRN_SPRG1):
 		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
 		break;
-	case KVM_INST_MTSPR_SPRG2:
+	case KVM_INST_MTSPR(SPRN_SPRG2):
 		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
 		break;
-	case KVM_INST_MTSPR_SPRG3:
+	case KVM_INST_MTSPR(SPRN_SPRG3):
 		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
 		break;
-	case KVM_INST_MTSPR_SRR0:
+	case KVM_INST_MTSPR(SPRN_SRR0):
 		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
 		break;
-	case KVM_INST_MTSPR_SRR1:
+	case KVM_INST_MTSPR(SPRN_SRR1):
 		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
 		break;
-	case KVM_INST_MTSPR_DAR:
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MTSPR(SPRN_DEAR):
+#else
+	case KVM_INST_MTSPR(SPRN_DAR):
+#endif
 		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
 		break;
-	case KVM_INST_MTSPR_DSISR:
+	case KVM_INST_MTSPR(SPRN_DSISR):
 		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
 		break;
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	case KVM_INST_MTSPR(SPRN_MAS0):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS1):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS2):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(mas2), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS3):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt);
+		break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+	case KVM_INST_MTSPR(SPRN_SPRG4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG5):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt);
+		break;
+
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MTSPR(SPRN_ESR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(esr), inst_rt);
+		break;
+#endif
 
 	/* Nops */
 	case KVM_INST_TLBSYNC:
@@ -444,6 +634,11 @@ static void kvm_check_ins(u32 *inst, u32 features)
 	case KVM_INST_MTMSRD_L0:
 		kvm_patch_ins_mtmsr(inst, inst_rt);
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEE:
+		kvm_patch_ins_wrtee(inst, inst_rt, 0);
+		break;
+#endif
 	}
 
 	switch (inst_no_rt & ~KVM_MASK_RB) {
@@ -461,13 +656,19 @@ static void kvm_check_ins(u32 *inst, u32 features)
 	switch (_inst) {
 #ifdef CONFIG_BOOKE
 	case KVM_INST_WRTEEI_0:
+		kvm_patch_ins_wrteei_0(inst);
+		break;
+
 	case KVM_INST_WRTEEI_1:
-		kvm_patch_ins_wrteei(inst);
+		kvm_patch_ins_wrtee(inst, 0, 1);
 		break;
 #endif
 	}
 }
 
+extern u32 kvm_template_start[];
+extern u32 kvm_template_end[];
+
 static void kvm_use_magic_page(void)
 {
 	u32 *p;
@@ -488,87 +689,32 @@ static void kvm_use_magic_page(void)
 	start = (void*)_stext;
 	end = (void*)_etext;
 
-	for (p = start; p < end; p++)
+	/*
+	 * Being interrupted in the middle of patching would
+	 * be bad for SPRG4-7, which KVM can't keep in sync
+	 * with emulated accesses because reads don't trap.
+	 */
+	local_irq_disable();
+
+	for (p = start; p < end; p++) {
+		/* Avoid patching the template code */
+		if (p >= kvm_template_start && p < kvm_template_end) {
+			p = kvm_template_end - 1;
+			continue;
+		}
 		kvm_check_ins(p, features);
+	}
+
+	local_irq_enable();
 
 	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
 			 kvm_patching_worked ? "worked" : "failed");
 }
 
-unsigned long kvm_hypercall(unsigned long *in,
-			    unsigned long *out,
-			    unsigned long nr)
-{
-	unsigned long register r0 asm("r0");
-	unsigned long register r3 asm("r3") = in[0];
-	unsigned long register r4 asm("r4") = in[1];
-	unsigned long register r5 asm("r5") = in[2];
-	unsigned long register r6 asm("r6") = in[3];
-	unsigned long register r7 asm("r7") = in[4];
-	unsigned long register r8 asm("r8") = in[5];
-	unsigned long register r9 asm("r9") = in[6];
-	unsigned long register r10 asm("r10") = in[7];
-	unsigned long register r11 asm("r11") = nr;
-	unsigned long register r12 asm("r12");
-
-	asm volatile("bl	kvm_hypercall_start"
-		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
-		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
-		       "=r"(r12)
-		     : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
-		       "r"(r9), "r"(r10), "r"(r11)
-		     : "memory", "cc", "xer", "ctr", "lr");
-
-	out[0] = r4;
-	out[1] = r5;
-	out[2] = r6;
-	out[3] = r7;
-	out[4] = r8;
-	out[5] = r9;
-	out[6] = r10;
-	out[7] = r11;
-
-	return r3;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
-static int kvm_para_setup(void)
-{
-	extern u32 kvm_hypercall_start;
-	struct device_node *hyper_node;
-	u32 *insts;
-	int len, i;
-
-	hyper_node = of_find_node_by_path("/hypervisor");
-	if (!hyper_node)
-		return -1;
-
-	insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
-	if (len % 4)
-		return -1;
-	if (len > (4 * 4))
-		return -1;
-
-	for (i = 0; i < (len / 4); i++)
-		kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
-
-	return 0;
-}
-
 static __init void kvm_free_tmp(void)
 {
-	unsigned long start, end;
-
-	start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
-	end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
-
-	/* Free the tmp space we don't need */
-	for (; start < end; start += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		totalram_pages++;
-	}
+	free_reserved_area(&kvm_tmp[kvm_tmp_index],
+			   &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
 }
 
 static int __init kvm_guest_init(void)
@@ -576,7 +722,7 @@ static int __init kvm_guest_init(void)
 	if (!kvm_para_available())
 		goto free_tmp;
 
-	if (kvm_para_setup())
+	if (!epapr_paravirt_enabled)
 		goto free_tmp;
 
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index f2b1b2523e6..e100ff324a8 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -13,6 +13,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
  * Copyright SUSE Linux Products GmbH 2010
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
  *
  * Authors: Alexander Graf <agraf@suse.de>
  */
@@ -23,16 +24,6 @@
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
 
-/* Hypercall entry point. Will be patched with device tree instructions. */
-
-.global kvm_hypercall_start
-kvm_hypercall_start:
-	li	r3, -1
-	nop
-	nop
-	nop
-	blr
-
 #define KVM_MAGIC_PAGE		(-4096)
 
 #ifdef CONFIG_64BIT
@@ -65,6 +56,9 @@ kvm_hypercall_start:
 	   shared->critical == r1 and r2 is always != r1 */		\
 	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
 
+.global kvm_template_start
+kvm_template_start:
+
 .global kvm_emulate_mtmsrd
 kvm_emulate_mtmsrd:
 
@@ -128,7 +122,7 @@ kvm_emulate_mtmsrd_len:
 	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
 
 
-#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
 #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
 
 .global kvm_emulate_mtmsr
@@ -167,6 +161,9 @@ maybe_stay_in_guest:
 kvm_emulate_mtmsr_reg2:
 	ori	r30, r0, 0
 
+	/* Put MSR into magic page because we don't call mtmsr */
+	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
 	/* Check if we have to fetch an interrupt */
 	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
 	cmpwi	r31, 0
@@ -174,15 +171,10 @@ kvm_emulate_mtmsr_reg2:
 
 	/* Check if we may trigger an interrupt */
 	andi.	r31, r30, MSR_EE
-	beq	no_mtmsr
-
-	b	do_mtmsr
+	bne	do_mtmsr
 
 no_mtmsr:
 
-	/* Put MSR into magic page because we don't call mtmsr */
-	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
-
 	SCRATCH_RESTORE
 
 	/* Go back to caller */
@@ -210,24 +202,80 @@ kvm_emulate_mtmsr_orig_ins_offs:
 kvm_emulate_mtmsr_len:
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
 
+/* also used for wrteei 1 */
+.global kvm_emulate_wrtee
+kvm_emulate_wrtee:
+
+	SCRATCH_SAVE
 
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 
-.global kvm_emulate_wrteei
-kvm_emulate_wrteei:
+	/* Insert new MSR[EE] */
+kvm_emulate_wrtee_reg:
+	ori	r30, r0, 0
+	rlwimi	r31, r30, 0, MSR_EE
+
+	/*
+	 * If MSR[EE] is now set, check for a pending interrupt.
+	 * We could skip this if MSR[EE] was already on, but that
+	 * should be rare, so don't bother.
+	 */
+	andi.	r30, r30, MSR_EE
+
+	/* Put MSR into magic page because we don't call wrtee */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 
+	beq	no_wrtee
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r30, 0
+	bne	do_wrtee
+
+no_wrtee:
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrtee_branch:
+	b	.
+
+do_wrtee:
+	SCRATCH_RESTORE
+
+	/* Just fire off the wrtee if it's critical */
+kvm_emulate_wrtee_orig_ins:
+	wrtee	r0
+
+	b	kvm_emulate_wrtee_branch
+
+kvm_emulate_wrtee_end:
+
+.global kvm_emulate_wrtee_branch_offs
+kvm_emulate_wrtee_branch_offs:
+	.long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_reg_offs
+kvm_emulate_wrtee_reg_offs:
+	.long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_orig_ins_offs
+kvm_emulate_wrtee_orig_ins_offs:
+	.long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_len
+kvm_emulate_wrtee_len:
+	.long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrteei_0
+kvm_emulate_wrteei_0:
 	SCRATCH_SAVE
 
 	/* Fetch old MSR in r31 */
 	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 
 	/* Remove MSR_EE from old MSR */
-	li	r30, 0
-	ori	r30, r30, MSR_EE
-	andc	r31, r31, r30
-
-	/* OR new MSR_EE onto the old MSR */
-kvm_emulate_wrteei_ee:
-	ori	r31, r31, 0
+	rlwinm	r31, r31, 0, ~MSR_EE
 
 	/* Write new MSR value back */
 	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
@@ -235,22 +283,17 @@ kvm_emulate_wrteei_ee:
 	SCRATCH_RESTORE
 
 	/* Go back to caller */
-kvm_emulate_wrteei_branch:
+kvm_emulate_wrteei_0_branch:
 	b	.
-kvm_emulate_wrteei_end:
-
-.global kvm_emulate_wrteei_branch_offs
-kvm_emulate_wrteei_branch_offs:
-	.long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
+kvm_emulate_wrteei_0_end:
 
-.global kvm_emulate_wrteei_ee_offs
-kvm_emulate_wrteei_ee_offs:
-	.long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
-
-.global kvm_emulate_wrteei_len
-kvm_emulate_wrteei_len:
-	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
+.global kvm_emulate_wrteei_0_branch_offs
+kvm_emulate_wrteei_0_branch_offs:
+	.long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4
 
+.global kvm_emulate_wrteei_0_len
+kvm_emulate_wrteei_0_len:
+	.long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
 
 .global kvm_emulate_mtsrin
 kvm_emulate_mtsrin:
@@ -300,3 +343,6 @@ kvm_emulate_mtsrin_orig_ins_offs:
 .global kvm_emulate_mtsrin_len
 kvm_emulate_mtsrin_len:
 	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
+
+.global kvm_template_end
+kvm_template_end:
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index bedd12e1cfb..936258881c9 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -35,7 +35,7 @@ static struct legacy_serial_info {
 	phys_addr_t			taddr;
 } legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS];
 
-static struct __initdata of_device_id legacy_serial_parents[] = {
+static struct of_device_id legacy_serial_parents[] __initdata = {
 	{.type = "soc",},
 	{.type = "tsi-bridge",},
 	{.type = "opb", },
@@ -48,6 +48,9 @@ static struct __initdata of_device_id legacy_serial_parents[] = {
 static unsigned int legacy_serial_count;
 static int legacy_serial_console = -1;
 
+static const upf_t legacy_port_flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
+	UPF_SHARE_IRQ | UPF_FIXED_PORT;
+
 static unsigned int tsi_serial_in(struct uart_port *p, int offset)
 {
 	unsigned int tmp;
@@ -71,8 +74,9 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
 				  phys_addr_t taddr, unsigned long irq,
 				  upf_t flags, int irq_check_parent)
 {
-	const __be32 *clk, *spd;
+	const __be32 *clk, *spd, *rs;
 	u32 clock = BASE_BAUD * 16;
+	u32 shift = 0;
 	int index;
 
 	/* get clock freq. if present */
@@ -83,6 +87,11 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
 	/* get default speed if present */
 	spd = of_get_property(np, "current-speed", NULL);
 
+	/* get register shift if present */
+	rs = of_get_property(np, "reg-shift", NULL);
+	if (rs && *rs)
+		shift = be32_to_cpup(rs);
+
 	/* If we have a location index, then try to use it */
 	if (want_index >= 0 && want_index < MAX_LEGACY_SERIAL_PORTS)
 		index = want_index;
@@ -99,7 +108,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
 		legacy_serial_count = index + 1;
 
 	/* Check if there is a port who already claimed our slot */
-	if (legacy_serial_infos[index].np != 0) {
+	if (legacy_serial_infos[index].np != NULL) {
 		/* if we still have some room, move it, else override */
 		if (legacy_serial_count < MAX_LEGACY_SERIAL_PORTS) {
 			printk(KERN_DEBUG "Moved legacy port %d -> %d\n",
@@ -126,6 +135,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
 	legacy_serial_ports[index].uartclk = clock;
 	legacy_serial_ports[index].irq = irq;
 	legacy_serial_ports[index].flags = flags;
+	legacy_serial_ports[index].regshift = shift;
 	legacy_serial_infos[index].taddr = taddr;
 	legacy_serial_infos[index].np = of_node_get(np);
 	legacy_serial_infos[index].clock = clock;
@@ -152,9 +162,7 @@ static int __init add_legacy_soc_port(struct device_node *np,
 				      struct device_node *soc_dev)
 {
 	u64 addr;
-	const u32 *addrp;
-	upf_t flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_SHARE_IRQ
-		| UPF_FIXED_PORT;
+	const __be32 *addrp;
 	struct device_node *tsi = of_get_parent(np);
 
 	/* We only support ports that have a clock frequency properly
@@ -163,9 +171,8 @@ static int __init add_legacy_soc_port(struct device_node *np,
 	if (of_get_property(np, "clock-frequency", NULL) == NULL)
 		return -1;
 
-	/* if reg-shift or offset, don't try to use it */
-	if ((of_get_property(np, "reg-shift", NULL) != NULL) ||
-		(of_get_property(np, "reg-offset", NULL) != NULL))
+	/* if reg-offset don't try to use it */
+	if ((of_get_property(np, "reg-offset", NULL) != NULL))
 		return -1;
 
 	/* if rtas uses this device, don't try to use it as well */
@@ -185,9 +192,11 @@ static int __init add_legacy_soc_port(struct device_node *np,
 	 * IO port value. It will be fixed up later along with the irq
 	 */
 	if (tsi && !strcmp(tsi->type, "tsi-bridge"))
-		return add_legacy_port(np, -1, UPIO_TSI, addr, addr, NO_IRQ, flags, 0);
+		return add_legacy_port(np, -1, UPIO_TSI, addr, addr,
+				       NO_IRQ, legacy_port_flags, 0);
 	else
-		return add_legacy_port(np, -1, UPIO_MEM, addr, addr, NO_IRQ, flags, 0);
+		return add_legacy_port(np, -1, UPIO_MEM, addr, addr,
+				       NO_IRQ, legacy_port_flags, 0);
 }
 
 static int __init add_legacy_isa_port(struct device_node *np,
@@ -221,14 +230,19 @@ static int __init add_legacy_isa_port(struct device_node *np,
 	/* Translate ISA address. If it fails, we still register the port
 	 * with no translated address so that it can be picked up as an IO
 	 * port later by the serial driver
+	 *
+	 * Note: Don't even try on P8 lpc, we know it's not directly mapped
 	 */
-	taddr = of_translate_address(np, reg);
-	if (taddr == OF_BAD_ADDR)
+	if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc")) {
+		taddr = of_translate_address(np, reg);
+		if (taddr == OF_BAD_ADDR)
+			taddr = 0;
+	} else
 		taddr = 0;
 
 	/* Add port, irq will be dealt with later */
-	return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]), taddr,
-			       NO_IRQ, UPF_BOOT_AUTOCONF, 0);
+	return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]),
+			       taddr, NO_IRQ, legacy_port_flags, 0);
 
 }
 
@@ -237,7 +251,7 @@ static int __init add_legacy_pci_port(struct device_node *np,
 				      struct device_node *pci_dev)
 {
 	u64 addr, base;
-	const u32 *addrp;
+	const __be32 *addrp;
 	unsigned int flags;
 	int iotype, index = -1, lindex = 0;
 
@@ -270,7 +284,7 @@ static int __init add_legacy_pci_port(struct device_node *np,
 	if (iotype == UPIO_MEM)
 		base = addr;
 	else
-		base = addrp[2];
+		base = of_read_number(&addrp[2], 1);
 
 	/* Try to guess an index... If we have subdevices of the pci dev,
 	 * we get to their "reg" property
@@ -301,25 +315,40 @@ static int __init add_legacy_pci_port(struct device_node *np,
 	 * IO port value. It will be fixed up later along with the irq
 	 */
 	return add_legacy_port(np, index, iotype, base, addr, NO_IRQ,
-			       UPF_BOOT_AUTOCONF, np != pci_dev);
+			       legacy_port_flags, np != pci_dev);
 }
 #endif
 
 static void __init setup_legacy_serial_console(int console)
 {
-	struct legacy_serial_info *info =
-		&legacy_serial_infos[console];
+	struct legacy_serial_info *info = &legacy_serial_infos[console];
+	struct plat_serial8250_port *port = &legacy_serial_ports[console];
 	void __iomem *addr;
+	unsigned int stride;
 
-	if (info->taddr == 0)
-		return;
-	addr = ioremap(info->taddr, 0x1000);
-	if (addr == NULL)
-		return;
+	stride = 1 << port->regshift;
+
+	/* Check if a translated MMIO address has been found */
+	if (info->taddr) {
+		addr = ioremap(info->taddr, 0x1000);
+		if (addr == NULL)
+			return;
+		udbg_uart_init_mmio(addr, stride);
+	} else {
+		/* Check if it's PIO and we support untranslated PIO */
+		if (port->iotype == UPIO_PORT && isa_io_special)
+			udbg_uart_init_pio(port->iobase, stride);
+		else
+			return;
+	}
+
+	/* Try to query the current speed */
 	if (info->speed == 0)
-		info->speed = udbg_probe_uart_speed(addr, info->clock);
+		info->speed = udbg_probe_uart_speed(info->clock);
+
+	/* Set it up */
 	DBG("default console speed = %d\n", info->speed);
-	udbg_init_uart(addr, info->speed, info->clock);
+	udbg_uart_setup(info->speed, info->clock);
 }
 
 /*
@@ -367,10 +396,13 @@ void __init find_legacy_serial_ports(void)
 	/* Next, fill our array with ISA ports */
 	for_each_node_by_type(np, "serial") {
 		struct device_node *isa = of_get_parent(np);
-		if (isa && !strcmp(isa->name, "isa")) {
-			index = add_legacy_isa_port(np, isa);
-			if (index >= 0 && np == stdout)
-				legacy_serial_console = index;
+		if (isa && (!strcmp(isa->name, "isa") ||
+			    !strcmp(isa->name, "lpc"))) {
+			if (of_device_is_available(np)) {
+				index = add_legacy_isa_port(np, isa);
+				if (index >= 0 && np == stdout)
+					legacy_serial_console = index;
+			}
 		}
 		of_node_put(isa);
 	}
@@ -387,7 +419,7 @@ void __init find_legacy_serial_ports(void)
 			of_node_put(parent);
 			continue;
 		}
-		/* Check for known pciclass, and also check wether we have
+		/* Check for known pciclass, and also check whether we have
 		 * a device with child nodes for ports or not
 		 */
 		if (of_device_is_compatible(np, "pciclass,0700") ||
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
deleted file mode 100644
index 578f35f1872..00000000000
--- a/arch/powerpc/kernel/lparcfg.c
+++ /dev/null
@@ -1,813 +0,0 @@
-/*
- * PowerPC64 LPAR Configuration Information Driver
- *
- * Dave Engebretsen engebret@us.ibm.com
- *    Copyright (c) 2003 Dave Engebretsen
- * Will Schmidt willschm@us.ibm.com
- *    SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation.
- *    seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation.
- * Nathan Lynch nathanl@austin.ibm.com
- *    Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation.
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- *
- * This driver creates a proc file at /proc/ppc64/lparcfg which contains
- * keyword - value pairs that specify the configuration of the partition.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/lppaca.h>
-#include <asm/hvcall.h>
-#include <asm/firmware.h>
-#include <asm/rtas.h>
-#include <asm/system.h>
-#include <asm/time.h>
-#include <asm/prom.h>
-#include <asm/vdso_datapage.h>
-#include <asm/vio.h>
-#include <asm/mmu.h>
-
-#define MODULE_VERS "1.9"
-#define MODULE_NAME "lparcfg"
-
-/* #define LPARCFG_DEBUG */
-
-static struct proc_dir_entry *proc_ppc64_lparcfg;
-
-/*
- * Track sum of all purrs across all processors. This is used to further
- * calculate usage values by different applications
- */
-static unsigned long get_purr(void)
-{
-	unsigned long sum_purr = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		if (firmware_has_feature(FW_FEATURE_ISERIES))
-			sum_purr += lppaca_of(cpu).emulated_time_base;
-		else {
-			struct cpu_usage *cu;
-
-			cu = &per_cpu(cpu_usage_array, cpu);
-			sum_purr += cu->current_tb;
-		}
-	}
-	return sum_purr;
-}
-
-#ifdef CONFIG_PPC_ISERIES
-
-/*
- * Methods used to fetch LPAR data when running on an iSeries platform.
- */
-static int iseries_lparcfg_data(struct seq_file *m, void *v)
-{
-	unsigned long pool_id;
-	int shared, entitled_capacity, max_entitled_capacity;
-	int processors, max_processors;
-	unsigned long purr = get_purr();
-
-	shared = (int)(local_paca->lppaca_ptr->shared_proc);
-
-	seq_printf(m, "system_active_processors=%d\n",
-		   (int)HvLpConfig_getSystemPhysicalProcessors());
-
-	seq_printf(m, "system_potential_processors=%d\n",
-		   (int)HvLpConfig_getSystemPhysicalProcessors());
-
-	processors = (int)HvLpConfig_getPhysicalProcessors();
-	seq_printf(m, "partition_active_processors=%d\n", processors);
-
-	max_processors = (int)HvLpConfig_getMaxPhysicalProcessors();
-	seq_printf(m, "partition_potential_processors=%d\n", max_processors);
-
-	if (shared) {
-		entitled_capacity = HvLpConfig_getSharedProcUnits();
-		max_entitled_capacity = HvLpConfig_getMaxSharedProcUnits();
-	} else {
-		entitled_capacity = processors * 100;
-		max_entitled_capacity = max_processors * 100;
-	}
-	seq_printf(m, "partition_entitled_capacity=%d\n", entitled_capacity);
-
-	seq_printf(m, "partition_max_entitled_capacity=%d\n",
-		   max_entitled_capacity);
-
-	if (shared) {
-		pool_id = HvLpConfig_getSharedPoolIndex();
-		seq_printf(m, "pool=%d\n", (int)pool_id);
-		seq_printf(m, "pool_capacity=%d\n",
-			   (int)(HvLpConfig_getNumProcsInSharedPool(pool_id) *
-				 100));
-		seq_printf(m, "purr=%ld\n", purr);
-	}
-
-	seq_printf(m, "shared_processor_mode=%d\n", shared);
-
-	return 0;
-}
-
-#else				/* CONFIG_PPC_ISERIES */
-
-static int iseries_lparcfg_data(struct seq_file *m, void *v)
-{
-	return 0;
-}
-
-#endif				/* CONFIG_PPC_ISERIES */
-
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Methods used to fetch LPAR data when running on a pSeries platform.
- */
-
-struct hvcall_ppp_data {
-	u64	entitlement;
-	u64	unallocated_entitlement;
-	u16	group_num;
-	u16	pool_num;
-	u8	capped;
-	u8	weight;
-	u8	unallocated_weight;
-	u16	active_procs_in_pool;
-	u16	active_system_procs;
-	u16	phys_platform_procs;
-	u32	max_proc_cap_avail;
-	u32	entitled_proc_cap_avail;
-};
-
-/*
- * H_GET_PPP hcall returns info in 4 parms.
- *  entitled_capacity,unallocated_capacity,
- *  aggregation, resource_capability).
- *
- *  R4 = Entitled Processor Capacity Percentage.
- *  R5 = Unallocated Processor Capacity Percentage.
- *  R6 (AABBCCDDEEFFGGHH).
- *      XXXX - reserved (0)
- *          XXXX - reserved (0)
- *              XXXX - Group Number
- *                  XXXX - Pool Number.
- *  R7 (IIJJKKLLMMNNOOPP).
- *      XX - reserved. (0)
- *        XX - bit 0-6 reserved (0).   bit 7 is Capped indicator.
- *          XX - variable processor Capacity Weight
- *            XX - Unallocated Variable Processor Capacity Weight.
- *              XXXX - Active processors in Physical Processor Pool.
- *                  XXXX  - Processors active on platform.
- *  R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1
- *	XXXX - Physical platform procs allocated to virtualization.
- *	    XXXXXX - Max procs capacity % available to the partitions pool.
- *	          XXXXXX - Entitled procs capacity % available to the
- *			   partitions pool.
- */
-static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
-{
-	unsigned long rc;
-	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
-
-	rc = plpar_hcall9(H_GET_PPP, retbuf);
-
-	ppp_data->entitlement = retbuf[0];
-	ppp_data->unallocated_entitlement = retbuf[1];
-
-	ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
-	ppp_data->pool_num = retbuf[2] & 0xffff;
-
-	ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
-	ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
-	ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
-	ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
-	ppp_data->active_system_procs = retbuf[3] & 0xffff;
-
-	ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8;
-	ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff;
-	ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff;
-
-	return rc;
-}
-
-static unsigned h_pic(unsigned long *pool_idle_time,
-		      unsigned long *num_procs)
-{
-	unsigned long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall(H_PIC, retbuf);
-
-	*pool_idle_time = retbuf[0];
-	*num_procs = retbuf[1];
-
-	return rc;
-}
-
-/*
- * parse_ppp_data
- * Parse out the data returned from h_get_ppp and h_pic
- */
-static void parse_ppp_data(struct seq_file *m)
-{
-	struct hvcall_ppp_data ppp_data;
-	struct device_node *root;
-	const int *perf_level;
-	int rc;
-
-	rc = h_get_ppp(&ppp_data);
-	if (rc)
-		return;
-
-	seq_printf(m, "partition_entitled_capacity=%lld\n",
-	           ppp_data.entitlement);
-	seq_printf(m, "group=%d\n", ppp_data.group_num);
-	seq_printf(m, "system_active_processors=%d\n",
-	           ppp_data.active_system_procs);
-
-	/* pool related entries are appropriate for shared configs */
-	if (lppaca_of(0).shared_proc) {
-		unsigned long pool_idle_time, pool_procs;
-
-		seq_printf(m, "pool=%d\n", ppp_data.pool_num);
-
-		/* report pool_capacity in percentage */
-		seq_printf(m, "pool_capacity=%d\n",
-			   ppp_data.active_procs_in_pool * 100);
-
-		h_pic(&pool_idle_time, &pool_procs);
-		seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
-		seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
-	}
-
-	seq_printf(m, "unallocated_capacity_weight=%d\n",
-		   ppp_data.unallocated_weight);
-	seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
-	seq_printf(m, "capped=%d\n", ppp_data.capped);
-	seq_printf(m, "unallocated_capacity=%lld\n",
-		   ppp_data.unallocated_entitlement);
-
-	/* The last bits of information returned from h_get_ppp are only
-	 * valid if the ibm,partition-performance-parameters-level
-	 * property is >= 1.
-	 */
-	root = of_find_node_by_path("/");
-	if (root) {
-		perf_level = of_get_property(root,
-				"ibm,partition-performance-parameters-level",
-					     NULL);
-		if (perf_level && (*perf_level >= 1)) {
-			seq_printf(m,
-			    "physical_procs_allocated_to_virtualization=%d\n",
-				   ppp_data.phys_platform_procs);
-			seq_printf(m, "max_proc_capacity_available=%d\n",
-				   ppp_data.max_proc_cap_avail);
-			seq_printf(m, "entitled_proc_capacity_available=%d\n",
-				   ppp_data.entitled_proc_cap_avail);
-		}
-
-		of_node_put(root);
-	}
-}
-
-/**
- * parse_mpp_data
- * Parse out data returned from h_get_mpp
- */
-static void parse_mpp_data(struct seq_file *m)
-{
-	struct hvcall_mpp_data mpp_data;
-	int rc;
-
-	rc = h_get_mpp(&mpp_data);
-	if (rc)
-		return;
-
-	seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
-
-	if (mpp_data.mapped_mem != -1)
-		seq_printf(m, "mapped_entitled_memory=%ld\n",
-		           mpp_data.mapped_mem);
-
-	seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
-	seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
-
-	seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
-	seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
-	           mpp_data.unallocated_mem_weight);
-	seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
-	           mpp_data.unallocated_entitlement);
-
-	if (mpp_data.pool_size != -1)
-		seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
-		           mpp_data.pool_size);
-
-	seq_printf(m, "entitled_memory_loan_request=%ld\n",
-	           mpp_data.loan_request);
-
-	seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
-}
-
-/**
- * parse_mpp_x_data
- * Parse out data returned from h_get_mpp_x
- */
-static void parse_mpp_x_data(struct seq_file *m)
-{
-	struct hvcall_mpp_x_data mpp_x_data;
-
-	if (!firmware_has_feature(FW_FEATURE_XCMO))
-		return;
-	if (h_get_mpp_x(&mpp_x_data))
-		return;
-
-	seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes);
-
-	if (mpp_x_data.pool_coalesced_bytes)
-		seq_printf(m, "pool_coalesced_bytes=%ld\n",
-			   mpp_x_data.pool_coalesced_bytes);
-	if (mpp_x_data.pool_purr_cycles)
-		seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles);
-	if (mpp_x_data.pool_spurr_cycles)
-		seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles);
-}
-
-#define SPLPAR_CHARACTERISTICS_TOKEN 20
-#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
-
-/*
- * parse_system_parameter_string()
- * Retrieve the potential_processors, max_entitled_capacity and friends
- * through the get-system-parameter rtas call.  Replace keyword strings as
- * necessary.
- */
-static void parse_system_parameter_string(struct seq_file *m)
-{
-	int call_status;
-
-	unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
-	if (!local_buffer) {
-		printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
-		       __FILE__, __func__, __LINE__);
-		return;
-	}
-
-	spin_lock(&rtas_data_buf_lock);
-	memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
-	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
-				NULL,
-				SPLPAR_CHARACTERISTICS_TOKEN,
-				__pa(rtas_data_buf),
-				RTAS_DATA_BUF_SIZE);
-	memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
-	spin_unlock(&rtas_data_buf_lock);
-
-	if (call_status != 0) {
-		printk(KERN_INFO
-		       "%s %s Error calling get-system-parameter (0x%x)\n",
-		       __FILE__, __func__, call_status);
-	} else {
-		int splpar_strlen;
-		int idx, w_idx;
-		char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
-		if (!workbuffer) {
-			printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
-			       __FILE__, __func__, __LINE__);
-			kfree(local_buffer);
-			return;
-		}
-#ifdef LPARCFG_DEBUG
-		printk(KERN_INFO "success calling get-system-parameter\n");
-#endif
-		splpar_strlen = local_buffer[0] * 256 + local_buffer[1];
-		local_buffer += 2;	/* step over strlen value */
-
-		w_idx = 0;
-		idx = 0;
-		while ((*local_buffer) && (idx < splpar_strlen)) {
-			workbuffer[w_idx++] = local_buffer[idx++];
-			if ((local_buffer[idx] == ',')
-			    || (local_buffer[idx] == '\0')) {
-				workbuffer[w_idx] = '\0';
-				if (w_idx) {
-					/* avoid the empty string */
-					seq_printf(m, "%s\n", workbuffer);
-				}
-				memset(workbuffer, 0, SPLPAR_MAXLENGTH);
-				idx++;	/* skip the comma */
-				w_idx = 0;
-			} else if (local_buffer[idx] == '=') {
-				/* code here to replace workbuffer contents
-				   with different keyword strings */
-				if (0 == strcmp(workbuffer, "MaxEntCap")) {
-					strcpy(workbuffer,
-					       "partition_max_entitled_capacity");
-					w_idx = strlen(workbuffer);
-				}
-				if (0 == strcmp(workbuffer, "MaxPlatProcs")) {
-					strcpy(workbuffer,
-					       "system_potential_processors");
-					w_idx = strlen(workbuffer);
-				}
-			}
-		}
-		kfree(workbuffer);
-		local_buffer -= 2;	/* back up over strlen value */
-	}
-	kfree(local_buffer);
-}
-
-/* Return the number of processors in the system.
- * This function reads through the device tree and counts
- * the virtual processors, this does not include threads.
- */
-static int lparcfg_count_active_processors(void)
-{
-	struct device_node *cpus_dn = NULL;
-	int count = 0;
-
-	while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) {
-#ifdef LPARCFG_DEBUG
-		printk(KERN_ERR "cpus_dn %p\n", cpus_dn);
-#endif
-		count++;
-	}
-	return count;
-}
-
-static void pseries_cmo_data(struct seq_file *m)
-{
-	int cpu;
-	unsigned long cmo_faults = 0;
-	unsigned long cmo_fault_time = 0;
-
-	seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO));
-
-	if (!firmware_has_feature(FW_FEATURE_CMO))
-		return;
-
-	for_each_possible_cpu(cpu) {
-		cmo_faults += lppaca_of(cpu).cmo_faults;
-		cmo_fault_time += lppaca_of(cpu).cmo_fault_time;
-	}
-
-	seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
-	seq_printf(m, "cmo_fault_time_usec=%lu\n",
-		   cmo_fault_time / tb_ticks_per_usec);
-	seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp());
-	seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp());
-	seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size());
-}
-
-static void splpar_dispatch_data(struct seq_file *m)
-{
-	int cpu;
-	unsigned long dispatches = 0;
-	unsigned long dispatch_dispersions = 0;
-
-	for_each_possible_cpu(cpu) {
-		dispatches += lppaca_of(cpu).yield_count;
-		dispatch_dispersions += lppaca_of(cpu).dispersion_count;
-	}
-
-	seq_printf(m, "dispatches=%lu\n", dispatches);
-	seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions);
-}
-
-static void parse_em_data(struct seq_file *m)
-{
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	if (plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS)
-		seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]);
-}
-
-static int pseries_lparcfg_data(struct seq_file *m, void *v)
-{
-	int partition_potential_processors;
-	int partition_active_processors;
-	struct device_node *rtas_node;
-	const int *lrdrp = NULL;
-
-	rtas_node = of_find_node_by_path("/rtas");
-	if (rtas_node)
-		lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL);
-
-	if (lrdrp == NULL) {
-		partition_potential_processors = vdso_data->processorCount;
-	} else {
-		partition_potential_processors = *(lrdrp + 4);
-	}
-	of_node_put(rtas_node);
-
-	partition_active_processors = lparcfg_count_active_processors();
-
-	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-		/* this call handles the ibm,get-system-parameter contents */
-		parse_system_parameter_string(m);
-		parse_ppp_data(m);
-		parse_mpp_data(m);
-		parse_mpp_x_data(m);
-		pseries_cmo_data(m);
-		splpar_dispatch_data(m);
-
-		seq_printf(m, "purr=%ld\n", get_purr());
-	} else {		/* non SPLPAR case */
-
-		seq_printf(m, "system_active_processors=%d\n",
-			   partition_potential_processors);
-
-		seq_printf(m, "system_potential_processors=%d\n",
-			   partition_potential_processors);
-
-		seq_printf(m, "partition_max_entitled_capacity=%d\n",
-			   partition_potential_processors * 100);
-
-		seq_printf(m, "partition_entitled_capacity=%d\n",
-			   partition_active_processors * 100);
-	}
-
-	seq_printf(m, "partition_active_processors=%d\n",
-		   partition_active_processors);
-
-	seq_printf(m, "partition_potential_processors=%d\n",
-		   partition_potential_processors);
-
-	seq_printf(m, "shared_processor_mode=%d\n", lppaca_of(0).shared_proc);
-
-	seq_printf(m, "slb_size=%d\n", mmu_slb_size);
-
-	parse_em_data(m);
-
-	return 0;
-}
-
-static ssize_t update_ppp(u64 *entitlement, u8 *weight)
-{
-	struct hvcall_ppp_data ppp_data;
-	u8 new_weight;
-	u64 new_entitled;
-	ssize_t retval;
-
-	/* Get our current parameters */
-	retval = h_get_ppp(&ppp_data);
-	if (retval)
-		return retval;
-
-	if (entitlement) {
-		new_weight = ppp_data.weight;
-		new_entitled = *entitlement;
-	} else if (weight) {
-		new_weight = *weight;
-		new_entitled = ppp_data.entitlement;
-	} else
-		return -EINVAL;
-
-	pr_debug("%s: current_entitled = %llu, current_weight = %u\n",
-		 __func__, ppp_data.entitlement, ppp_data.weight);
-
-	pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
-		 __func__, new_entitled, new_weight);
-
-	retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
-	return retval;
-}
-
-/**
- * update_mpp
- *
- * Update the memory entitlement and weight for the partition.  Caller must
- * specify either a new entitlement or weight, not both, to be updated
- * since the h_set_mpp call takes both entitlement and weight as parameters.
- */
-static ssize_t update_mpp(u64 *entitlement, u8 *weight)
-{
-	struct hvcall_mpp_data mpp_data;
-	u64 new_entitled;
-	u8 new_weight;
-	ssize_t rc;
-
-	if (entitlement) {
-		/* Check with vio to ensure the new memory entitlement
-		 * can be handled.
-		 */
-		rc = vio_cmo_entitlement_update(*entitlement);
-		if (rc)
-			return rc;
-	}
-
-	rc = h_get_mpp(&mpp_data);
-	if (rc)
-		return rc;
-
-	if (entitlement) {
-		new_weight = mpp_data.mem_weight;
-		new_entitled = *entitlement;
-	} else if (weight) {
-		new_weight = *weight;
-		new_entitled = mpp_data.entitled_mem;
-	} else
-		return -EINVAL;
-
-	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
-	         __func__, mpp_data.entitled_mem, mpp_data.mem_weight);
-
-	pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
-		 __func__, new_entitled, new_weight);
-
-	rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
-	return rc;
-}
-
-/*
- * Interface for changing system parameters (variable capacity weight
- * and entitled capacity).  Format of input is "param_name=value";
- * anything after value is ignored.  Valid parameters at this time are
- * "partition_entitled_capacity" and "capacity_weight".  We use
- * H_SET_PPP to alter parameters.
- *
- * This function should be invoked only on systems with
- * FW_FEATURE_SPLPAR.
- */
-static ssize_t lparcfg_write(struct file *file, const char __user * buf,
-			     size_t count, loff_t * off)
-{
-	int kbuf_sz = 64;
-	char kbuf[kbuf_sz];
-	char *tmp;
-	u64 new_entitled, *new_entitled_ptr = &new_entitled;
-	u8 new_weight, *new_weight_ptr = &new_weight;
-	ssize_t retval;
-
-	if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
-			firmware_has_feature(FW_FEATURE_ISERIES))
-		return -EINVAL;
-
-	if (count > kbuf_sz)
-		return -EINVAL;
-
-	if (copy_from_user(kbuf, buf, count))
-		return -EFAULT;
-
-	kbuf[count - 1] = '\0';
-	tmp = strchr(kbuf, '=');
-	if (!tmp)
-		return -EINVAL;
-
-	*tmp++ = '\0';
-
-	if (!strcmp(kbuf, "partition_entitled_capacity")) {
-		char *endp;
-		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
-		if (endp == tmp)
-			return -EINVAL;
-
-		retval = update_ppp(new_entitled_ptr, NULL);
-	} else if (!strcmp(kbuf, "capacity_weight")) {
-		char *endp;
-		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
-		if (endp == tmp)
-			return -EINVAL;
-
-		retval = update_ppp(NULL, new_weight_ptr);
-	} else if (!strcmp(kbuf, "entitled_memory")) {
-		char *endp;
-		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
-		if (endp == tmp)
-			return -EINVAL;
-
-		retval = update_mpp(new_entitled_ptr, NULL);
-	} else if (!strcmp(kbuf, "entitled_memory_weight")) {
-		char *endp;
-		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
-		if (endp == tmp)
-			return -EINVAL;
-
-		retval = update_mpp(NULL, new_weight_ptr);
-	} else
-		return -EINVAL;
-
-	if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
-		retval = count;
-	} else if (retval == H_BUSY) {
-		retval = -EBUSY;
-	} else if (retval == H_HARDWARE) {
-		retval = -EIO;
-	} else if (retval == H_PARAMETER) {
-		retval = -EINVAL;
-	}
-
-	return retval;
-}
-
-#else				/* CONFIG_PPC_PSERIES */
-
-static int pseries_lparcfg_data(struct seq_file *m, void *v)
-{
-	return 0;
-}
-
-static ssize_t lparcfg_write(struct file *file, const char __user * buf,
-			     size_t count, loff_t * off)
-{
-	return -EINVAL;
-}
-
-#endif				/* CONFIG_PPC_PSERIES */
-
-static int lparcfg_data(struct seq_file *m, void *v)
-{
-	struct device_node *rootdn;
-	const char *model = "";
-	const char *system_id = "";
-	const char *tmp;
-	const unsigned int *lp_index_ptr;
-	unsigned int lp_index = 0;
-
-	seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS);
-
-	rootdn = of_find_node_by_path("/");
-	if (rootdn) {
-		tmp = of_get_property(rootdn, "model", NULL);
-		if (tmp) {
-			model = tmp;
-			/* Skip "IBM," - see platforms/iseries/dt.c */
-			if (firmware_has_feature(FW_FEATURE_ISERIES))
-				model += 4;
-		}
-		tmp = of_get_property(rootdn, "system-id", NULL);
-		if (tmp) {
-			system_id = tmp;
-			/* Skip "IBM," - see platforms/iseries/dt.c */
-			if (firmware_has_feature(FW_FEATURE_ISERIES))
-				system_id += 4;
-		}
-		lp_index_ptr = of_get_property(rootdn, "ibm,partition-no",
-					NULL);
-		if (lp_index_ptr)
-			lp_index = *lp_index_ptr;
-		of_node_put(rootdn);
-	}
-	seq_printf(m, "serial_number=%s\n", system_id);
-	seq_printf(m, "system_type=%s\n", model);
-	seq_printf(m, "partition_id=%d\n", (int)lp_index);
-
-	if (firmware_has_feature(FW_FEATURE_ISERIES))
-		return iseries_lparcfg_data(m, v);
-	return pseries_lparcfg_data(m, v);
-}
-
-static int lparcfg_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, lparcfg_data, NULL);
-}
-
-static const struct file_operations lparcfg_fops = {
-	.owner		= THIS_MODULE,
-	.read		= seq_read,
-	.write		= lparcfg_write,
-	.open		= lparcfg_open,
-	.release	= single_release,
-	.llseek		= seq_lseek,
-};
-
-static int __init lparcfg_init(void)
-{
-	struct proc_dir_entry *ent;
-	umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
-
-	/* Allow writing if we have FW_FEATURE_SPLPAR */
-	if (firmware_has_feature(FW_FEATURE_SPLPAR) &&
-			!firmware_has_feature(FW_FEATURE_ISERIES))
-		mode |= S_IWUSR;
-
-	ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops);
-	if (!ent) {
-		printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
-		return -EIO;
-	}
-
-	proc_ppc64_lparcfg = ent;
-	return 0;
-}
-
-static void __exit lparcfg_cleanup(void)
-{
-	if (proc_ppc64_lparcfg)
-		remove_proc_entry("lparcfg", proc_ppc64_lparcfg->parent);
-}
-
-module_init(lparcfg_init);
-module_exit(lparcfg_cleanup);
-MODULE_DESCRIPTION("Interface for LPAR configuration data");
-MODULE_AUTHOR("Dave Engebretsen");
-MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index c957b1202bd..015ae55c186 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -18,19 +18,17 @@
 #include <linux/ftrace.h>
 
 #include <asm/machdep.h>
+#include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/sections.h>
 
 void machine_kexec_mask_interrupts(void) {
 	unsigned int i;
+	struct irq_desc *desc;
 
-	for_each_irq(i) {
-		struct irq_desc *desc = irq_to_desc(i);
+	for_each_irq_desc(i, desc) {
 		struct irq_chip *chip;
 
-		if (!desc)
-			continue;
-
 		chip = irq_desc_get_chip(desc);
 		if (!chip)
 			continue;
@@ -78,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void)
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 	VMCOREINFO_SYMBOL(contig_page_data);
 #endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+	VMCOREINFO_SYMBOL(vmemmap_list);
+	VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+	VMCOREINFO_SYMBOL(mmu_psize_defs);
+	VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+	VMCOREINFO_OFFSET(vmemmap_backing, list);
+	VMCOREINFO_OFFSET(vmemmap_backing, phys);
+	VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+	VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+	VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
 }
 
 /*
@@ -139,7 +148,7 @@ void __init reserve_crashkernel(void)
 		 * a small SLB (128MB) since the crash kernel needs to place
 		 * itself and some stacks to be in the first segment.
 		 */
-		crashk_res.start = min(0x80000000ULL, (ppc64_rma_size / 2));
+		crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
 #else
 		crashk_res.start = KDUMP_KERNELBASE;
 #endif
@@ -168,7 +177,7 @@ void __init reserve_crashkernel(void)
 	if (memory_limit && memory_limit <= crashk_res.end) {
 		memory_limit = crashk_res.end + 1;
 		printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
-		       (unsigned long long)memory_limit);
+		       memory_limit);
 	}
 
 	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
@@ -187,7 +196,9 @@ int overlaps_crashkernel(unsigned long start, unsigned long size)
 
 /* Values we need to export to the second kernel via the device tree. */
 static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
 static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
 
 static struct property kernel_end_prop = {
 	.name = "linux,kernel-end",
@@ -198,7 +209,7 @@ static struct property kernel_end_prop = {
 static struct property crashk_base_prop = {
 	.name = "linux,crashkernel-base",
 	.length = sizeof(phys_addr_t),
-	.value = &crashk_res.start,
+	.value = &crashk_base
 };
 
 static struct property crashk_size_prop = {
@@ -207,6 +218,14 @@ static struct property crashk_size_prop = {
 	.value = &crashk_size,
 };
 
+static struct property memory_limit_prop = {
+	.name = "linux,memory-limit",
+	.length = sizeof(unsigned long long),
+	.value = &mem_limit,
+};
+
+#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
+
 static void __init export_crashk_values(struct device_node *node)
 {
 	struct property *prop;
@@ -215,17 +234,25 @@ static void __init export_crashk_values(struct device_node *node)
 	 * be sure what's in them, so remove them. */
 	prop = of_find_property(node, "linux,crashkernel-base", NULL);
 	if (prop)
-		prom_remove_property(node, prop);
+		of_remove_property(node, prop);
 
 	prop = of_find_property(node, "linux,crashkernel-size", NULL);
 	if (prop)
-		prom_remove_property(node, prop);
+		of_remove_property(node, prop);
 
 	if (crashk_res.start != 0) {
-		prom_add_property(node, &crashk_base_prop);
-		crashk_size = resource_size(&crashk_res);
-		prom_add_property(node, &crashk_size_prop);
+		crashk_base = cpu_to_be_ulong(crashk_res.start),
+		of_add_property(node, &crashk_base_prop);
+		crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
+		of_add_property(node, &crashk_size_prop);
 	}
+
+	/*
+	 * memory_limit is required by the kexec-tools to limit the
+	 * crash regions to the actual memory used.
+	 */
+	mem_limit = cpu_to_be_ulong(memory_limit);
+	of_update_property(node, &memory_limit_prop);
 }
 
 static int __init kexec_setup(void)
@@ -240,11 +267,11 @@ static int __init kexec_setup(void)
 	/* remove any stale properties so ours can be found */
 	prop = of_find_property(node, kernel_end_prop.name, NULL);
 	if (prop)
-		prom_remove_property(node, prop);
+		of_remove_property(node, prop);
 
 	/* information needed by userspace when using default_machine_kexec */
-	kernel_end = __pa(_end);
-	prom_add_property(node, &kernel_end_prop);
+	kernel_end = cpu_to_be_ulong(__pa(_end));
+	of_add_property(node, &kernel_end_prop);
 
 	export_crashk_values(node);
 
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index d7f609086a9..879b3aacac3 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/cpu.h>
+#include <linux/hardirq.h>
 
 #include <asm/page.h>
 #include <asm/current.h>
@@ -162,6 +163,8 @@ static int kexec_all_irq_disabled = 0;
 static void kexec_smp_down(void *arg)
 {
 	local_irq_disable();
+	hard_irq_disable();
+
 	mb(); /* make sure our irqs are disabled before we say they are */
 	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
 	while(kexec_all_irq_disabled == 0)
@@ -234,7 +237,7 @@ static void wake_offline_cpus(void)
 		if (!cpu_online(cpu)) {
 			printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
 			       cpu);
-			cpu_up(cpu);
+			WARN_ON(cpu_up(cpu));
 		}
 	}
 }
@@ -244,6 +247,8 @@ static void kexec_prepare_cpus(void)
 	wake_offline_cpus();
 	smp_call_function(kexec_smp_down, NULL, /* wait */0);
 	local_irq_disable();
+	hard_irq_disable();
+
 	mb(); /* make sure IRQs are disabled before we say they are */
 	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
 
@@ -281,6 +286,7 @@ static void kexec_prepare_cpus(void)
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(0, 0);
 	local_irq_disable();
+	hard_irq_disable();
 }
 
 #endif /* SMP */
@@ -306,7 +312,7 @@ static union thread_union kexec_stack __init_task_data =
  */
 struct paca_struct kexec_paca;
 
-/* Our assembly helper, in kexec_stub.S */
+/* Our assembly helper, in misc_64.S */
 extern void kexec_sequence(void *newstack, unsigned long start,
 			   void *image, void *control,
 			   void (*clear_all)(void)) __noreturn;
@@ -330,10 +336,13 @@ void default_machine_kexec(struct kimage *image)
 	pr_debug("kexec: Starting switchover sequence.\n");
 
 	/* switch to a staticly allocated stack.  Based on irq stack code.
+	 * We setup preempt_count to avoid using VMX in memcpy.
 	 * XXX: the task struct will likely be invalid once we do the copy!
 	 */
 	kexec_stack.thread_info.task = current_thread_info()->task;
 	kexec_stack.thread_info.flags = 0;
+	kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET;
+	kexec_stack.thread_info.cpu = current_thread_info()->cpu;
 
 	/* We need a static PACA, too; copy this CPU's PACA over and switch to
 	 * it.  Also poison per_cpu_offset to catch anyone using non-static
@@ -360,6 +369,7 @@ void default_machine_kexec(struct kimage *image)
 
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
+static unsigned long htab_size;
 
 static struct property htab_base_prop = {
 	.name = "linux,htab-base",
@@ -370,7 +380,7 @@ static struct property htab_base_prop = {
 static struct property htab_size_prop = {
 	.name = "linux,htab-size",
 	.length = sizeof(unsigned long),
-	.value = &htab_size_bytes,
+	.value = &htab_size,
 };
 
 static int __init export_htab_values(void)
@@ -389,14 +399,15 @@ static int __init export_htab_values(void)
 	/* remove any stale propertys so ours can be found */
 	prop = of_find_property(node, htab_base_prop.name, NULL);
 	if (prop)
-		prom_remove_property(node, prop);
+		of_remove_property(node, prop);
 	prop = of_find_property(node, htab_size_prop.name, NULL);
 	if (prop)
-		prom_remove_property(node, prop);
+		of_remove_property(node, prop);
 
-	htab_base = __pa(htab_address);
-	prom_add_property(node, &htab_base_prop);
-	prom_add_property(node, &htab_size_prop);
+	htab_base = cpu_to_be64(__pa(htab_address));
+	of_add_property(node, &htab_base_prop);
+	htab_size = cpu_to_be64(htab_size_bytes);
+	of_add_property(node, &htab_size_prop);
 
 	of_node_put(node);
 	return 0;
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
new file mode 100644
index 00000000000..a7fd4cb78b7
--- /dev/null
+++ b/arch/powerpc/kernel/mce.c
@@ -0,0 +1,352 @@
+/*
+ * Machine check exception handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/irq_work.h>
+#include <asm/mce.h>
+
+static DEFINE_PER_CPU(int, mce_nest_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
+
+/* Queue for delayed MCE events. */
+static DEFINE_PER_CPU(int, mce_queue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+
+static void machine_check_process_queued_event(struct irq_work *work);
+struct irq_work mce_event_process_work = {
+        .func = machine_check_process_queued_event,
+};
+
+static void mce_set_error_info(struct machine_check_event *mce,
+			       struct mce_error_info *mce_err)
+{
+	mce->error_type = mce_err->error_type;
+	switch (mce_err->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
+		break;
+	case MCE_ERROR_TYPE_UNKNOWN:
+	default:
+		break;
+	}
+}
+
+/*
+ * Decode and save high level MCE information into per cpu buffer which
+ * is an array of machine_check_event structure.
+ */
+void save_mce_event(struct pt_regs *regs, long handled,
+		    struct mce_error_info *mce_err,
+		    uint64_t nip, uint64_t addr)
+{
+	uint64_t srr1;
+	int index = __get_cpu_var(mce_nest_count)++;
+	struct machine_check_event *mce = &__get_cpu_var(mce_event[index]);
+
+	/*
+	 * Return if we don't have enough space to log mce event.
+	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
+	 * the check below will stop buffer overrun.
+	 */
+	if (index >= MAX_MC_EVT)
+		return;
+
+	/* Populate generic machine check info */
+	mce->version = MCE_V1;
+	mce->srr0 = nip;
+	mce->srr1 = regs->msr;
+	mce->gpr3 = regs->gpr[3];
+	mce->in_use = 1;
+
+	mce->initiator = MCE_INITIATOR_CPU;
+	if (handled)
+		mce->disposition = MCE_DISPOSITION_RECOVERED;
+	else
+		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
+	mce->severity = MCE_SEV_ERROR_SYNC;
+
+	srr1 = regs->msr;
+
+	/*
+	 * Populate the mce error_type and type-specific error_type.
+	 */
+	mce_set_error_info(mce, mce_err);
+
+	if (!addr)
+		return;
+
+	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
+		mce->u.tlb_error.effective_address_provided = true;
+		mce->u.tlb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
+		mce->u.slb_error.effective_address_provided = true;
+		mce->u.slb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
+		mce->u.erat_error.effective_address_provided = true;
+		mce->u.erat_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+		mce->u.ue_error.effective_address_provided = true;
+		mce->u.ue_error.effective_address = addr;
+	}
+	return;
+}
+
+/*
+ * get_mce_event:
+ *	mce	Pointer to machine_check_event structure to be filled.
+ *	release Flag to indicate whether to free the event slot or not.
+ *		0 <= do not release the mce event. Caller will invoke
+ *		     release_mce_event() once event has been consumed.
+ *		1 <= release the slot.
+ *
+ *	return	1 = success
+ *		0 = failure
+ *
+ * get_mce_event() will be called by platform specific machine check
+ * handle routine and in KVM.
+ * When we call get_mce_event(), we are still in interrupt context and
+ * preemption will not be scheduled until ret_from_expect() routine
+ * is called.
+ */
+int get_mce_event(struct machine_check_event *mce, bool release)
+{
+	int index = __get_cpu_var(mce_nest_count) - 1;
+	struct machine_check_event *mc_evt;
+	int ret = 0;
+
+	/* Sanity check */
+	if (index < 0)
+		return ret;
+
+	/* Check if we have MCE info to process. */
+	if (index < MAX_MC_EVT) {
+		mc_evt = &__get_cpu_var(mce_event[index]);
+		/* Copy the event structure and release the original */
+		if (mce)
+			*mce = *mc_evt;
+		if (release)
+			mc_evt->in_use = 0;
+		ret = 1;
+	}
+	/* Decrement the count to free the slot. */
+	if (release)
+		__get_cpu_var(mce_nest_count)--;
+
+	return ret;
+}
+
+void release_mce_event(void)
+{
+	get_mce_event(NULL, true);
+}
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_queue_event(void)
+{
+	int index;
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return;
+
+	index = __get_cpu_var(mce_queue_count)++;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__get_cpu_var(mce_queue_count)--;
+		return;
+	}
+	__get_cpu_var(mce_event_queue[index]) = evt;
+
+	/* Queue irq work to process this event later. */
+	irq_work_queue(&mce_event_process_work);
+}
+
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_check_process_queued_event(struct irq_work *work)
+{
+	int index;
+
+	/*
+	 * For now just print it to console.
+	 * TODO: log this error event to FSP or nvram.
+	 */
+	while (__get_cpu_var(mce_queue_count) > 0) {
+		index = __get_cpu_var(mce_queue_count) - 1;
+		machine_check_print_event_info(
+				&__get_cpu_var(mce_event_queue[index]));
+		__get_cpu_var(mce_queue_count)--;
+	}
+}
+
+void machine_check_print_event_info(struct machine_check_event *evt)
+{
+	const char *level, *sevstr, *subtype;
+	static const char *mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+	static const char *mc_slb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_erat_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_tlb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+
+	/* Print things out */
+	if (evt->version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt->version);
+		return;
+	}
+	switch (evt->severity) {
+	case MCE_SEV_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case MCE_SEV_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case MCE_SEV_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case MCE_SEV_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
+	       "Recovered" : "[Not recovered");
+	printk("%s  Initiator: %s\n", level,
+	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		subtype = evt->u.ue_error.ue_error_type <
+			ARRAY_SIZE(mc_ue_types) ?
+			mc_ue_types[evt->u.ue_error.ue_error_type]
+			: "Unknown";
+		printk("%s  Error type: UE [%s]\n", level, subtype);
+		if (evt->u.ue_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.ue_error.effective_address);
+		if (evt->u.ue_error.physical_address_provided)
+			printk("%s      Physial address: %016llx\n",
+			       level, evt->u.ue_error.physical_address);
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		subtype = evt->u.slb_error.slb_error_type <
+			ARRAY_SIZE(mc_slb_types) ?
+			mc_slb_types[evt->u.slb_error.slb_error_type]
+			: "Unknown";
+		printk("%s  Error type: SLB [%s]\n", level, subtype);
+		if (evt->u.slb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.slb_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		subtype = evt->u.erat_error.erat_error_type <
+			ARRAY_SIZE(mc_erat_types) ?
+			mc_erat_types[evt->u.erat_error.erat_error_type]
+			: "Unknown";
+		printk("%s  Error type: ERAT [%s]\n", level, subtype);
+		if (evt->u.erat_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.erat_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		subtype = evt->u.tlb_error.tlb_error_type <
+			ARRAY_SIZE(mc_tlb_types) ?
+			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
+			: "Unknown";
+		printk("%s  Error type: TLB [%s]\n", level, subtype);
+		if (evt->u.tlb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.tlb_error.effective_address);
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		printk("%s  Error type: Unknown\n", level);
+		break;
+	}
+}
+
+uint64_t get_mce_fault_addr(struct machine_check_event *evt)
+{
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		if (evt->u.ue_error.effective_address_provided)
+			return evt->u.ue_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		if (evt->u.slb_error.effective_address_provided)
+			return evt->u.slb_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		if (evt->u.erat_error.effective_address_provided)
+			return evt->u.erat_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		if (evt->u.tlb_error.effective_address_provided)
+			return evt->u.tlb_error.effective_address;
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(get_mce_fault_addr);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
new file mode 100644
index 00000000000..aa9aff3d6ad
--- /dev/null
+++ b/arch/powerpc/kernel/mce_power.c
@@ -0,0 +1,313 @@
+/*
+ * Machine check exception handling CPU-side for power7 and power8
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce_power: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/mce.h>
+#include <asm/machdep.h>
+
+/* flush SLBs and reload */
+static void flush_and_reload_slb(void)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* Invalidate all SLBs */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+	/*
+	 * If machine check is hit when in guest or in transition, we will
+	 * only flush the SLBs and continue.
+	 */
+	if (get_paca()->kvm_hstate.in_guest)
+		return;
+#endif
+
+	/* For host kernel, reload the SLBs from shadow SLB buffer. */
+	slb = get_slb_shadow();
+	if (!slb)
+		return;
+
+	n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
+
+	/* Load up the SLB entries from shadow SLB */
+	for (i = 0; i < n; i++) {
+		unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
+		unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+
+		rb = (rb & ~0xFFFul) | i;
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+
+static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+{
+	long handled = 1;
+
+	/*
+	 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
+	 * reset the error bits whenever we handle them so that at the end
+	 * we can check whether we handled all of them or not.
+	 * */
+	if (dsisr & slb_error_bits) {
+		flush_and_reload_slb();
+		/* reset error bits */
+		dsisr &= ~(slb_error_bits);
+	}
+	if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+		/* reset error bits */
+		dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+	}
+	/* Any other errors we don't understand? */
+	if (dsisr & 0xffffffffUL)
+		handled = 0;
+
+	return handled;
+}
+
+static long mce_handle_derror_p7(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+}
+
+static long mce_handle_common_ierror(uint64_t srr1)
+{
+	long handled = 0;
+
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case 0:
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		/* flush and reload SLBs for SLB errors. */
+		flush_and_reload_slb();
+		handled = 1;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+			handled = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return handled;
+}
+
+static long mce_handle_ierror_p7(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+
+static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_UE:
+	case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
+		break;
+	case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+		break;
+	}
+}
+
+static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+
+static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	if (dsisr & P7_DSISR_MC_UE) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+	} else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+
+static long mce_handle_ue_error(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	/*
+	 * On specific SCOM read via MMIO we may get a machine check
+	 * exception with SRR0 pointing inside opal. If that is the
+	 * case OPAL may have recovery address to re-read SCOM data in
+	 * different way and hence we can recover from this MC.
+	 */
+
+	if (ppc_md.mce_check_early_recovery) {
+		if (ppc_md.mce_check_early_recovery(regs))
+			handled = 1;
+	}
+	return handled;
+}
+
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
+{
+	uint64_t srr1, nip, addr;
+	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
+
+	srr1 = regs->msr;
+	nip = regs->nip;
+
+	/*
+	 * Handle memory errors depending whether this was a load/store or
+	 * ifetch exception. Also, populate the mce error_type and
+	 * type-specific error_type from either SRR1 or DSISR, depending
+	 * whether this was a load/store or ifetch exception
+	 */
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
+		handled = mce_handle_derror_p7(regs->dsisr);
+		mce_get_derror_p7(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
+		handled = mce_handle_ierror_p7(srr1);
+		mce_get_ierror_p7(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
+
+	/* Handle UE error. */
+	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+		handled = mce_handle_ue_error(regs);
+
+	save_mce_event(regs, handled, &mce_error_info, nip, addr);
+	return handled;
+}
+
+static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+
+static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	mce_get_derror_p7(mce_err, dsisr);
+	if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+
+static long mce_handle_ierror_p8(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+
+static long mce_handle_derror_p8(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
+}
+
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
+{
+	uint64_t srr1, nip, addr;
+	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
+
+	srr1 = regs->msr;
+	nip = regs->nip;
+
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
+		handled = mce_handle_derror_p8(regs->dsisr);
+		mce_get_derror_p8(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
+		handled = mce_handle_ierror_p8(srr1);
+		mce_get_ierror_p8(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
+
+	/* Handle UE error. */
+	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+		handled = mce_handle_ue_error(regs);
+
+	save_mce_event(regs, handled, &mce_error_info, nip, addr);
+	return handled;
+}
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index b69463ec201..7ce26d45777 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -5,7 +5,6 @@
  * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
  * and Paul Mackerras.
  *
- * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com)
  * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com)
  *
  * setjmp/longjmp code by Paul Mackerras.
@@ -55,13 +54,6 @@ _GLOBAL(add_reloc_offset)
 	.align	3
 2:	PPC_LONG 1b
 
-_GLOBAL(kernel_execve)
-	li	r0,__NR_execve
-	sc
-	bnslr
-	neg	r3,r3
-	blr
-
 _GLOBAL(setjmp)
 	mflr	r0
 	PPC_STL	r0,0(r3)
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 7cd07b42ca1..7c6bb4b17b4 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -36,26 +36,44 @@
 
 	.text
 
+/*
+ * We store the saved ksp_limit in the unused part
+ * of the STACK_FRAME_OVERHEAD
+ */
 _GLOBAL(call_do_softirq)
 	mflr	r0
 	stw	r0,4(r1)
+	lwz	r10,THREAD+KSP_LIMIT(r2)
+	addi	r11,r3,THREAD_INFO_GAP
 	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
 	mr	r1,r3
+	stw	r10,8(r1)
+	stw	r11,THREAD+KSP_LIMIT(r2)
 	bl	__do_softirq
+	lwz	r10,8(r1)
 	lwz	r1,0(r1)
 	lwz	r0,4(r1)
+	stw	r10,THREAD+KSP_LIMIT(r2)
 	mtlr	r0
 	blr
 
-_GLOBAL(call_handle_irq)
+/*
+ * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+ */
+_GLOBAL(call_do_irq)
 	mflr	r0
 	stw	r0,4(r1)
-	mtctr	r6
-	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
-	mr	r1,r5
-	bctrl
+	lwz	r10,THREAD+KSP_LIMIT(r2)
+	addi	r11,r4,THREAD_INFO_GAP
+	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
+	mr	r1,r4
+	stw	r10,8(r1)
+	stw	r11,THREAD+KSP_LIMIT(r2)
+	bl	__do_irq
+	lwz	r10,8(r1)
 	lwz	r1,0(r1)
 	lwz	r0,4(r1)
+	stw	r10,THREAD+KSP_LIMIT(r2)
 	mtlr	r0
 	blr
 
@@ -179,7 +197,7 @@ _GLOBAL(low_choose_750fx_pll)
 	mtspr	SPRN_HID1,r4
 
 	/* Store new HID1 image */
-	rlwinm	r6,r1,0,0,(31-THREAD_SHIFT)
+	CURRENT_THREAD_INFO(r6, r1)
 	lwz	r6,TI_CPU(r6)
 	slwi	r6,r6,2
 	addis	r6,r6,nap_save_hid1@ha
@@ -327,8 +345,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
  *
  * flush_icache_range(unsigned long start, unsigned long stop)
  */
-_KPROBE(__flush_icache_range)
+_KPROBE(flush_icache_range)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr				/* for 601, do nothing */
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	li	r5,L1_CACHE_BYTES-1
@@ -432,6 +451,7 @@ _GLOBAL(invalidate_dcache_range)
  */
 _GLOBAL(__flush_dcache_icache)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
@@ -473,6 +493,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
  */
 _GLOBAL(__flush_dcache_icache_phys)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr					/* for 601, do nothing */
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	mfmsr	r10
@@ -643,6 +664,20 @@ _GLOBAL(__lshrdi3)
 	blr
 
 /*
+ * 64-bit comparison: __cmpdi2(s64 a, s64 b)
+ * Returns 0 if a < b, 1 if a == b, 2 if a > b.
+ */
+_GLOBAL(__cmpdi2)
+	cmpw	r3,r5
+	li	r3,1
+	bne	1f
+	cmplw	r4,r6
+	beqlr
+1:	li	r3,0
+	bltlr
+	li	r3,2
+	blr
+/*
  * 64-bit comparison: __ucmpdi2(u64 a, u64 b)
  * Returns 0 if a < b, 1 if a == b, 2 if a > b.
  */
@@ -657,49 +692,27 @@ _GLOBAL(__ucmpdi2)
 	li	r3,2
 	blr
 
+_GLOBAL(__bswapdi2)
+	rotlwi  r9,r4,8
+	rotlwi  r10,r3,8
+	rlwimi  r9,r4,24,0,7
+	rlwimi  r10,r3,24,0,7
+	rlwimi  r9,r4,24,16,23
+	rlwimi  r10,r3,24,16,23
+	mr      r3,r9
+	mr      r4,r10
+	blr
+
 _GLOBAL(abs)
 	srawi	r4,r3,31
 	xor	r3,r3,r4
 	sub	r3,r3,r4
 	blr
 
-/*
- * Create a kernel thread
- *   kernel_thread(fn, arg, flags)
- */
-_GLOBAL(kernel_thread)
-	stwu	r1,-16(r1)
-	stw	r30,8(r1)
-	stw	r31,12(r1)
-	mr	r30,r3		/* function */
-	mr	r31,r4		/* argument */
-	ori	r3,r5,CLONE_VM	/* flags */
-	oris	r3,r3,CLONE_UNTRACED>>16
-	li	r4,0		/* new sp (unused) */
-	li	r0,__NR_clone
-	sc
-	bns+	1f		/* did system call indicate error? */
-	neg	r3,r3		/* if so, make return code negative */
-1:	cmpwi	0,r3,0		/* parent or child? */
-	bne	2f		/* return if parent */
-	li	r0,0		/* make top-level stack frame */
-	stwu	r0,-16(r1)
-	mtlr	r30		/* fn addr in lr */
-	mr	r3,r31		/* load arg and call fn */
-	PPC440EP_ERR42
-	blrl
-	li	r0,__NR_exit	/* exit if function returns */
-	li	r3,0
-	sc
-2:	lwz	r30,8(r1)
-	lwz	r31,12(r1)
-	addi	r1,r1,16
-	blr
-
 #ifdef CONFIG_SMP
 _GLOBAL(start_secondary_resume)
 	/* Reset stack */
-	rlwinm	r1,r1,0,0,(31-THREAD_SHIFT)	/* current_thread_info() */
+	CURRENT_THREAD_INFO(r1, r1)
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
 	li	r3,0
 	stw	r3,0(r1)		/* Zero the stack frame pointer	*/
@@ -738,8 +751,23 @@ relocate_new_kernel:
 	mr      r5, r31
 
 	li	r0, 0
-#elif defined(CONFIG_44x)  && !defined(CONFIG_PPC_47x)
+#elif defined(CONFIG_44x)
 
+	/* Save our parameters */
+	mr	r29, r3
+	mr	r30, r4
+	mr	r31, r5
+
+#ifdef CONFIG_PPC_47x
+	/* Check for 47x cores */
+	mfspr	r3,SPRN_PVR
+	srwi	r3,r3,16
+	cmplwi	cr0,r3,PVR_476@h
+	beq	setup_map_47x
+	cmplwi	cr0,r3,PVR_476_ISS@h
+	beq	setup_map_47x
+#endif /* CONFIG_PPC_47x */
+	
 /*
  * Code for setting up 1:1 mapping for PPC440x for KEXEC
  *
@@ -753,16 +781,15 @@ relocate_new_kernel:
  * 5) Invalidate the tmp mapping.
  *
  * - Based on the kexec support code for FSL BookE
- * - Doesn't support 47x yet.
  *
  */
-	/* Save our parameters */
-	mr	r29, r3
-	mr	r30, r4
-	mr	r31, r5
 
-	/* Load our MSR_IS and TID to MMUCR for TLB search */
-	mfspr	r3,SPRN_PID
+	/* 
+	 * Load the PID with kernel PID (0).
+	 * Also load our MSR_IS and TID to MMUCR for TLB search.
+	 */
+	li	r3, 0
+	mtspr	SPRN_PID, r3
 	mfmsr	r4
 	andi.	r4,r4,MSR_IS@l
 	beq	wmmucr
@@ -900,6 +927,179 @@ next_tlb:
 	li	r3, 0
 	tlbwe	r3, r24, PPC44x_TLB_PAGEID
 	sync
+	b	ppc44x_map_done
+
+#ifdef CONFIG_PPC_47x
+
+	/* 1:1 mapping for 47x */
+
+setup_map_47x:
+
+	/*
+	 * Load the kernel pid (0) to PID and also to MMUCR[TID].
+	 * Also set the MSR IS->MMUCR STS
+	 */
+	li	r3, 0
+	mtspr	SPRN_PID, r3			/* Set PID */
+	mfmsr	r4				/* Get MSR */
+	andi.	r4, r4, MSR_IS@l		/* TS=1? */
+	beq	1f				/* If not, leave STS=0 */
+	oris	r3, r3, PPC47x_MMUCR_STS@h	/* Set STS=1 */
+1:	mtspr	SPRN_MMUCR, r3			/* Put MMUCR */
+	sync
+
+	/* Find the entry we are running from */
+	bl	2f
+2:	mflr	r23
+	tlbsx	r23, 0, r23
+	tlbre	r24, r23, 0			/* TLB Word 0 */
+	tlbre	r25, r23, 1			/* TLB Word 1 */
+	tlbre	r26, r23, 2			/* TLB Word 2 */
+
+
+	/*
+	 * Invalidates all the tlb entries by writing to 256 RPNs(r4)
+	 * of 4k page size in all  4 ways (0-3 in r3).
+	 * This would invalidate the entire UTLB including the one we are
+	 * running from. However the shadow TLB entries would help us 
+	 * to continue the execution, until we flush them (rfi/isync).
+	 */
+	addis	r3, 0, 0x8000			/* specify the way */
+	addi	r4, 0, 0			/* TLB Word0 = (EPN=0, VALID = 0) */
+	addi	r5, 0, 0
+	b	clear_utlb_entry
+
+	/* Align the loop to speed things up. from head_44x.S */
+	.align	6
+
+clear_utlb_entry:
+
+	tlbwe	r4, r3, 0
+	tlbwe	r5, r3, 1
+	tlbwe	r5, r3, 2
+	addis	r3, r3, 0x2000			/* Increment the way */
+	cmpwi	r3, 0
+	bne	clear_utlb_entry
+	addis	r3, 0, 0x8000
+	addis	r4, r4, 0x100			/* Increment the EPN */
+	cmpwi	r4, 0
+	bne	clear_utlb_entry
+
+	/* Create the entries in the other address space */
+	mfmsr	r5
+	rlwinm	r7, r5, 27, 31, 31		/* Get the TS (Bit 26) from MSR */
+	xori	r7, r7, 1			/* r7 = !TS */
+
+	insrwi	r24, r7, 1, 21			/* Change the TS in the saved TLB word 0 */
+
+	/* 
+	 * write out the TLB entries for the tmp mapping
+	 * Use way '0' so that we could easily invalidate it later.
+	 */
+	lis	r3, 0x8000			/* Way '0' */ 
+
+	tlbwe	r24, r3, 0
+	tlbwe	r25, r3, 1
+	tlbwe	r26, r3, 2
+
+	/* Update the msr to the new TS */
+	insrwi	r5, r7, 1, 26
+
+	bl	1f
+1:	mflr	r6
+	addi	r6, r6, (2f-1b)
+
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r5
+	rfi
+
+	/* 
+	 * Now we are in the tmp address space.
+	 * Create a 1:1 mapping for 0-2GiB in the original TS.
+	 */
+2:
+	li	r3, 0
+	li	r4, 0				/* TLB Word 0 */
+	li	r5, 0				/* TLB Word 1 */
+	li	r6, 0
+	ori	r6, r6, PPC47x_TLB2_S_RWX	/* TLB word 2 */
+
+	li	r8, 0				/* PageIndex */
+
+	xori	r7, r7, 1			/* revert back to original TS */
+
+write_utlb:
+	rotlwi	r5, r8, 28			/* RPN = PageIndex * 256M */
+						/* ERPN = 0 as we don't use memory above 2G */
+
+	mr	r4, r5				/* EPN = RPN */
+	ori	r4, r4, (PPC47x_TLB0_VALID | PPC47x_TLB0_256M)
+	insrwi	r4, r7, 1, 21			/* Insert the TS to Word 0 */
+
+	tlbwe	r4, r3, 0			/* Write out the entries */
+	tlbwe	r5, r3, 1
+	tlbwe	r6, r3, 2
+	addi	r8, r8, 1
+	cmpwi	r8, 8				/* Have we completed ? */
+	bne	write_utlb
+
+	/* make sure we complete the TLB write up */
+	isync
+
+	/* 
+	 * Prepare to jump to the 1:1 mapping.
+	 * 1) Extract page size of the tmp mapping
+	 *    DSIZ = TLB_Word0[22:27]
+	 * 2) Calculate the physical address of the address
+	 *    to jump to.
+	 */
+	rlwinm	r10, r24, 0, 22, 27
+
+	cmpwi	r10, PPC47x_TLB0_4K
+	bne	0f
+	li	r10, 0x1000			/* r10 = 4k */
+	bl	1f
+
+0:
+	/* Defaults to 256M */
+	lis	r10, 0x1000
+	
+	bl	1f
+1:	mflr	r4
+	addi	r4, r4, (2f-1b)			/* virtual address  of 2f */
+
+	subi	r11, r10, 1			/* offsetmask = Pagesize - 1 */
+	not	r10, r11			/* Pagemask = ~(offsetmask) */
+
+	and	r5, r25, r10			/* Physical page */
+	and	r6, r4, r11			/* offset within the current page */
+
+	or	r5, r5, r6			/* Physical address for 2f */
+
+	/* Switch the TS in MSR to the original one */
+	mfmsr	r8
+	insrwi	r8, r7, 1, 26
+
+	mtspr	SPRN_SRR1, r8
+	mtspr	SPRN_SRR0, r5
+	rfi
+
+2:
+	/* Invalidate the tmp mapping */
+	lis	r3, 0x8000			/* Way '0' */
+
+	clrrwi	r24, r24, 12			/* Clear the valid bit */
+	tlbwe	r24, r3, 0
+	tlbwe	r25, r3, 1
+	tlbwe	r26, r3, 2
+
+	/* Make sure we complete the TLB write and flush the shadow TLB */
+	isync
+
+#endif
+
+ppc44x_map_done:
+
 
 	/* Restore the parameters */
 	mr	r3, r29
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 616921ef143..4e314b90c75 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -34,20 +34,18 @@ _GLOBAL(call_do_softirq)
 	std	r0,16(r1)
 	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
 	mr	r1,r3
-	bl	.__do_softirq
+	bl	__do_softirq
 	ld	r1,0(r1)
 	ld	r0,16(r1)
 	mtlr	r0
 	blr
 
-_GLOBAL(call_handle_irq)
-	ld	r8,0(r6)
+_GLOBAL(call_do_irq)
 	mflr	r0
 	std	r0,16(r1)
-	mtctr	r8
-	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
-	mr	r1,r5
-	bctrl
+	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
+	mr	r1,r4
+	bl	__do_irq
 	ld	r1,0(r1)
 	ld	r0,16(r1)
 	mtlr	r0
@@ -67,8 +65,11 @@ PPC64_CACHES:
  *   flush all bytes from start through stop-1 inclusive
  */
 
-_KPROBE(__flush_icache_range)
-
+_KPROBE(flush_icache_range)
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 /*
  * Flush the data cache to memory 
  * 
@@ -211,6 +212,11 @@ _GLOBAL(__flush_dcache_icache)
  * Different systems have different cache line sizes
  */
 
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+
 /* Flush the dcache */
  	ld	r7,PPC64_CACHES@toc(r2)
 	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
@@ -234,8 +240,53 @@ _GLOBAL(__flush_dcache_icache)
 	isync
 	blr
 
+_GLOBAL(__bswapdi2)
+	srdi	r8,r3,32
+	rlwinm	r7,r3,8,0xffffffff
+	rlwimi	r7,r3,24,0,7
+	rlwinm	r9,r8,8,0xffffffff
+	rlwimi	r7,r3,24,16,23
+	rlwimi	r9,r8,24,0,7
+	rlwimi	r9,r8,24,16,23
+	sldi	r7,r7,32
+	or	r3,r7,r9
+	blr
+
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+_GLOBAL(rmci_on)
+	sync
+	isync
+	li	r3,0x100
+	rldicl	r3,r3,32,0
+	mfspr	r5,SPRN_HID4
+	or	r5,r5,r3
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	sync
+	blr
+
+_GLOBAL(rmci_off)
+	sync
+	isync
+	li	r3,0x100
+	rldicl	r3,r3,32,0
+	mfspr	r5,SPRN_HID4
+	andc	r5,r5,r3
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	sync
+	blr
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
 
 #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
+
 /*
  * Do an IO access in real mode
  */
@@ -301,11 +352,6 @@ _GLOBAL(real_writeb)
 
 #ifdef CONFIG_PPC_PASEMI
 
-/* No support in all binutils for these yet, so use defines */
-#define LBZCIX(RT,RA,RB)  .long (0x7c0006aa|(RT<<21)|(RA<<16)|(RB << 11))
-#define STBCIX(RS,RA,RB)  .long (0x7c0007aa|(RS<<21)|(RA<<16)|(RB << 11))
-
-
 _GLOBAL(real_205_readb)
 	mfmsr	r7
 	ori	r0,r7,MSR_DR
@@ -314,7 +360,7 @@ _GLOBAL(real_205_readb)
 	mtmsrd	r0
 	sync
 	isync
-	LBZCIX(r3,0,r3)
+	LBZCIX(R3,R0,R3)
 	isync
 	mtmsrd	r7
 	sync
@@ -329,7 +375,7 @@ _GLOBAL(real_205_writeb)
 	mtmsrd	r0
 	sync
 	isync
-	STBCIX(r3,0,r4)
+	STBCIX(R3,R0,R4)
 	isync
 	mtmsrd	r7
 	sync
@@ -410,53 +456,6 @@ _GLOBAL(scom970_write)
 	blr
 #endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */
 
-
-/*
- * Create a kernel thread
- *   kernel_thread(fn, arg, flags)
- */
-_GLOBAL(kernel_thread)
-	std	r29,-24(r1)
-	std	r30,-16(r1)
-	stdu	r1,-STACK_FRAME_OVERHEAD(r1)
-	mr	r29,r3
-	mr	r30,r4
-	ori	r3,r5,CLONE_VM	/* flags */
-	oris	r3,r3,(CLONE_UNTRACED>>16)
-	li	r4,0		/* new sp (unused) */
-	li	r0,__NR_clone
-	sc
-	bns+	1f		/* did system call indicate error? */
-	neg	r3,r3		/* if so, make return code negative */
-1:	cmpdi	0,r3,0		/* parent or child? */
-	bne	2f		/* return if parent */
-	li	r0,0
-	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
-	ld	r2,8(r29)
-	ld	r29,0(r29)
-	mtlr	r29              /* fn addr in lr */
-	mr	r3,r30	        /* load arg and call fn */
-	blrl
-	li	r0,__NR_exit	/* exit after child exits */
-        li	r3,0
-	sc
-2:	addi	r1,r1,STACK_FRAME_OVERHEAD
-	ld	r29,-24(r1)
-	ld	r30,-16(r1)
-	blr
-
-/*
- * disable_kernel_fp()
- * Disable the FPU.
- */
-_GLOBAL(disable_kernel_fp)
-	mfmsr	r3
-	rldicl	r0,r3,(63-MSR_FP_LG),1
-	rldicl	r3,r0,(MSR_FP_LG+1),0
-	mtmsrd	r3			/* disable use of fpu now */
-	isync
-	blr
-
 /* kexec_wait(phys_cpu)
  *
  * wait for the flag to change, indicating this kernel is going away but
@@ -507,7 +506,7 @@ _GLOBAL(kexec_smp_wait)
 	stb	r4,PACAKEXECSTATE(r13)
 	SYNC
 
-	b	.kexec_wait
+	b	kexec_wait
 
 /*
  * switch to real mode (turn mmu off)
@@ -577,7 +576,7 @@ _GLOBAL(kexec_sequence)
 
 	/* copy dest pages, flush whole dest image */
 	mr	r3,r29
-	bl	.kexec_copy_flush	/* (image) */
+	bl	kexec_copy_flush	/* (image) */
 
 	/* turn off mmu */
 	bl	real_mode
@@ -587,7 +586,7 @@ _GLOBAL(kexec_sequence)
 	mr	r4,r30		/* start, aka phys mem offset */
 	li	r5,0x100
 	li	r6,0
-	bl	.copy_and_flush	/* (dest, src, copy limit, start offset) */
+	bl	copy_and_flush	/* (dest, src, copy limit, start offset) */
 1:	/* assume normal blr return */
 
 	/* release other cpus to the new kernel secondary start at 0x60 */
@@ -596,8 +595,12 @@ _GLOBAL(kexec_sequence)
 	stw	r6,kexec_flag-1b(5)
 
 	/* clear out hardware hash page table and tlb */
-	ld	r5,0(r27)		/* deref function descriptor */
-	mtctr	r5
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	ld	r12,0(r27)		/* deref function descriptor */
+#else
+	mr	r12,r27
+#endif
+	mtctr	r12
 	bctrl				/* ppc_md.hpte_clear_all(void); */
 
 /*
@@ -631,3 +634,31 @@ _GLOBAL(kexec_sequence)
 	li	r5,0
 	blr	/* image->start(physid, image->start, 0); */
 #endif /* CONFIG_KEXEC */
+
+#ifdef CONFIG_MODULES
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+
+#ifdef CONFIG_MODVERSIONS
+.weak __crc_TOC.
+.section "___kcrctab+TOC.","a"
+.globl __kcrctab_TOC.
+__kcrctab_TOC.:
+	.llong	__crc_TOC.
+#endif
+
+/*
+ * Export a fake .TOC. since both modpost and depmod will complain otherwise.
+ * Both modpost and depmod strip the leading . so we do the same here.
+ */
+.section "__ksymtab_strings","a"
+__kstrtab_TOC.:
+	.asciz "TOC."
+
+.section "___ksymtab+TOC.","a"
+/* This symbol name is important: it's used by modpost to find exported syms */
+.globl __ksymtab_TOC.
+__ksymtab_TOC.:
+	.llong 0 /* .value */
+	.llong __kstrtab_TOC.
+#endif /* ELFv2 */
+#endif /* MODULES */
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 2d275707f41..9547381b631 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -25,8 +25,7 @@
 #include <asm/uaccess.h>
 #include <asm/firmware.h>
 #include <linux/sort.h>
-
-#include "setup.h"
+#include <asm/setup.h>
 
 LIST_HEAD(module_bug_list);
 
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 0b6d79617d7..6cff040bf45 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -26,8 +26,7 @@
 #include <linux/cache.h>
 #include <linux/bug.h>
 #include <linux/sort.h>
-
-#include "setup.h"
+#include <asm/setup.h>
 
 #if 0
 #define DEBUGP printk
@@ -176,8 +175,8 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr,
 
 static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val)
 {
-	if (entry->jump[0] == 0x3d600000 + ((val + 0x8000) >> 16)
-	    && entry->jump[1] == 0x396b0000 + (val & 0xffff))
+	if (entry->jump[0] == 0x3d800000 + ((val + 0x8000) >> 16)
+	    && entry->jump[1] == 0x398c0000 + (val & 0xffff))
 		return 1;
 	return 0;
 }
@@ -204,10 +203,9 @@ static uint32_t do_plt_call(void *location,
 		entry++;
 	}
 
-	/* Stolen from Paul Mackerras as well... */
-	entry->jump[0] = 0x3d600000+((val+0x8000)>>16);	/* lis r11,sym@ha */
-	entry->jump[1] = 0x396b0000 + (val&0xffff);	/* addi r11,r11,sym@l*/
-	entry->jump[2] = 0x7d6903a6;			/* mtctr r11 */
+	entry->jump[0] = 0x3d800000+((val+0x8000)>>16); /* lis r12,sym@ha */
+	entry->jump[1] = 0x398c0000 + (val&0xffff);     /* addi r12,r12,sym@l*/
+	entry->jump[2] = 0x7d8903a6;                    /* mtctr r12 */
 	entry->jump[3] = 0x4e800420;			/* bctr */
 
 	DEBUGP("Initialized plt for 0x%x at %p\n", val, entry);
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 9f44a775a10..d807ee626af 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -22,12 +22,12 @@
 #include <linux/vmalloc.h>
 #include <linux/ftrace.h>
 #include <linux/bug.h>
+#include <linux/uaccess.h>
 #include <asm/module.h>
 #include <asm/firmware.h>
 #include <asm/code-patching.h>
 #include <linux/sort.h>
-
-#include "setup.h"
+#include <asm/setup.h>
 
 /* FIXME: We don't do .init separately.  To do this, we'd need to have
    a separate r2 value in the init and core section, and stub between
@@ -42,35 +42,170 @@
 #define DEBUGP(fmt , ...)
 #endif
 
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define R2_STACK_OFFSET 24
+
+/* An address is simply the address of the function. */
+typedef unsigned long func_desc_t;
+
+static func_desc_t func_desc(unsigned long addr)
+{
+	return addr;
+}
+static unsigned long func_addr(unsigned long addr)
+{
+	return addr;
+}
+static unsigned long stub_func_addr(func_desc_t func)
+{
+	return func;
+}
+
+/* PowerPC64 specific values for the Elf64_Sym st_other field.  */
+#define STO_PPC64_LOCAL_BIT	5
+#define STO_PPC64_LOCAL_MASK	(7 << STO_PPC64_LOCAL_BIT)
+#define PPC64_LOCAL_ENTRY_OFFSET(other)					\
+ (((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) >> 2) << 2)
+
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+	/* sym->st_other indicates offset to local entry point
+	 * (otherwise it will assume r12 is the address of the start
+	 * of function and try to derive r2 from it). */
+	return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
+}
+#else
+#define R2_STACK_OFFSET 40
+
+/* An address is address of the OPD entry, which contains address of fn. */
+typedef struct ppc64_opd_entry func_desc_t;
+
+static func_desc_t func_desc(unsigned long addr)
+{
+	return *(struct ppc64_opd_entry *)addr;
+}
+static unsigned long func_addr(unsigned long addr)
+{
+	return func_desc(addr).funcaddr;
+}
+static unsigned long stub_func_addr(func_desc_t func)
+{
+	return func.funcaddr;
+}
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+	return 0;
+}
+#endif
+
 /* Like PPC32, we need little trampolines to do > 24-bit jumps (into
    the kernel itself).  But on PPC64, these need to be used for every
    jump, actually, to reset r2 (TOC+0x8000). */
 struct ppc64_stub_entry
 {
-	/* 28 byte jump instruction sequence (7 instructions) */
-	unsigned char jump[28];
-	unsigned char unused[4];
+	/* 28 byte jump instruction sequence (7 instructions). We only
+	 * need 6 instructions on ABIv2 but we always allocate 7 so
+	 * so we don't have to modify the trampoline load instruction. */
+	u32 jump[7];
+	u32 unused;
 	/* Data for the above code */
-	struct ppc64_opd_entry opd;
+	func_desc_t funcdata;
 };
 
-/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external)
-   function which may be more than 24-bits away.  We could simply
-   patch the new r2 value and function pointer into the stub, but it's
-   significantly shorter to put these values at the end of the stub
-   code, and patch the stub address (32-bits relative to the TOC ptr,
-   r2) into the stub. */
-static struct ppc64_stub_entry ppc64_stub =
-{ .jump = {
-	0x3d, 0x82, 0x00, 0x00, /* addis   r12,r2, <high> */
-	0x39, 0x8c, 0x00, 0x00, /* addi    r12,r12, <low> */
+/*
+ * PPC64 uses 24 bit jumps, but we need to jump into other modules or
+ * the kernel which may be further.  So we jump to a stub.
+ *
+ * For ELFv1 we need to use this to set up the new r2 value (aka TOC
+ * pointer).  For ELFv2 it's the callee's responsibility to set up the
+ * new r2, but for both we need to save the old r2.
+ *
+ * We could simply patch the new r2 value and function pointer into
+ * the stub, but it's significantly shorter to put these values at the
+ * end of the stub code, and patch the stub address (32-bits relative
+ * to the TOC ptr, r2) into the stub.
+ */
+
+static u32 ppc64_stub_insns[] = {
+	0x3d620000,			/* addis   r11,r2, <high> */
+	0x396b0000,			/* addi    r11,r11, <low> */
 	/* Save current r2 value in magic place on the stack. */
-	0xf8, 0x41, 0x00, 0x28, /* std     r2,40(r1) */
-	0xe9, 0x6c, 0x00, 0x20, /* ld      r11,32(r12) */
-	0xe8, 0x4c, 0x00, 0x28, /* ld      r2,40(r12) */
-	0x7d, 0x69, 0x03, 0xa6, /* mtctr   r11 */
-	0x4e, 0x80, 0x04, 0x20  /* bctr */
-} };
+	0xf8410000|R2_STACK_OFFSET,	/* std     r2,R2_STACK_OFFSET(r1) */
+	0xe98b0020,			/* ld      r12,32(r11) */
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	/* Set up new r2 from function descriptor */
+	0xe84b0028,			/* ld      r2,40(r11) */
+#endif
+	0x7d8903a6,			/* mtctr   r12 */
+	0x4e800420			/* bctr */
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static u32 ppc64_stub_mask[] = {
+	0xffff0000,
+	0xffff0000,
+	0xffffffff,
+	0xffffffff,
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	0xffffffff,
+#endif
+	0xffffffff,
+	0xffffffff
+};
+
+bool is_module_trampoline(u32 *p)
+{
+	unsigned int i;
+	u32 insns[ARRAY_SIZE(ppc64_stub_insns)];
+
+	BUILD_BUG_ON(sizeof(ppc64_stub_insns) != sizeof(ppc64_stub_mask));
+
+	if (probe_kernel_read(insns, p, sizeof(insns)))
+		return -EFAULT;
+
+	for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) {
+		u32 insna = insns[i];
+		u32 insnb = ppc64_stub_insns[i];
+		u32 mask = ppc64_stub_mask[i];
+
+		if ((insna & mask) != (insnb & mask))
+			return false;
+	}
+
+	return true;
+}
+
+int module_trampoline_target(struct module *mod, u32 *trampoline,
+			     unsigned long *target)
+{
+	u32 buf[2];
+	u16 upper, lower;
+	long offset;
+	void *toc_entry;
+
+	if (probe_kernel_read(buf, trampoline, sizeof(buf)))
+		return -EFAULT;
+
+	upper = buf[0] & 0xffff;
+	lower = buf[1] & 0xffff;
+
+	/* perform the addis/addi, both signed */
+	offset = ((short)upper << 16) + (short)lower;
+
+	/*
+	 * Now get the address this trampoline jumps to. This
+	 * is always 32 bytes into our trampoline stub.
+	 */
+	toc_entry = (void *)mod->arch.toc + offset + 32;
+
+	if (probe_kernel_read(target, toc_entry, sizeof(*target)))
+		return -EFAULT;
+
+	return 0;
+}
+
+#endif
 
 /* Count how many different 24-bit relocations (different symbol,
    different addend) */
@@ -173,17 +308,27 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
 	return relocs * sizeof(struct ppc64_stub_entry);
 }
 
+/* Still needed for ELFv2, for .TOC. */
 static void dedotify_versions(struct modversion_info *vers,
 			      unsigned long size)
 {
 	struct modversion_info *end;
 
 	for (end = (void *)vers + size; vers < end; vers++)
-		if (vers->name[0] == '.')
+		if (vers->name[0] == '.') {
 			memmove(vers->name, vers->name+1, strlen(vers->name));
+#ifdef ARCH_RELOCATES_KCRCTAB
+			/* The TOC symbol has no CRC computed. To avoid CRC
+			 * check failing, we must force it to the expected
+			 * value (see CRC check in module.c).
+			 */
+			if (!strcmp(vers->name, "TOC."))
+				vers->crc = -(unsigned long)reloc_start;
+#endif
+		}
 }
 
-/* Undefined symbols which refer to .funcname, hack to funcname */
+/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */
 static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
 {
 	unsigned int i;
@@ -197,6 +342,24 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
 	}
 }
 
+static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs,
+			       const char *strtab,
+			       unsigned int symindex)
+{
+	unsigned int i, numsyms;
+	Elf64_Sym *syms;
+
+	syms = (Elf64_Sym *)sechdrs[symindex].sh_addr;
+	numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym);
+
+	for (i = 1; i < numsyms; i++) {
+		if (syms[i].st_shndx == SHN_UNDEF
+		    && strcmp(strtab + syms[i].st_name, "TOC.") == 0)
+			return &syms[i];
+	}
+	return NULL;
+}
+
 int module_frob_arch_sections(Elf64_Ehdr *hdr,
 			      Elf64_Shdr *sechdrs,
 			      char *secstrings,
@@ -261,16 +424,12 @@ static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
 /* Patch stub to reference function and correct r2 value. */
 static inline int create_stub(Elf64_Shdr *sechdrs,
 			      struct ppc64_stub_entry *entry,
-			      struct ppc64_opd_entry *opd,
+			      unsigned long addr,
 			      struct module *me)
 {
-	Elf64_Half *loc1, *loc2;
 	long reladdr;
 
-	*entry = ppc64_stub;
-
-	loc1 = (Elf64_Half *)&entry->jump[2];
-	loc2 = (Elf64_Half *)&entry->jump[6];
+	memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns));
 
 	/* Stub uses address relative to r2. */
 	reladdr = (unsigned long)entry - my_r2(sechdrs, me);
@@ -281,35 +440,33 @@ static inline int create_stub(Elf64_Shdr *sechdrs,
 	}
 	DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr);
 
-	*loc1 = PPC_HA(reladdr);
-	*loc2 = PPC_LO(reladdr);
-	entry->opd.funcaddr = opd->funcaddr;
-	entry->opd.r2 = opd->r2;
+	entry->jump[0] |= PPC_HA(reladdr);
+	entry->jump[1] |= PPC_LO(reladdr);
+	entry->funcdata = func_desc(addr);
 	return 1;
 }
 
-/* Create stub to jump to function described in this OPD: we need the
+/* Create stub to jump to function described in this OPD/ptr: we need the
    stub to set up the TOC ptr (r2) for the function. */
 static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
-				   unsigned long opdaddr,
+				   unsigned long addr,
 				   struct module *me)
 {
 	struct ppc64_stub_entry *stubs;
-	struct ppc64_opd_entry *opd = (void *)opdaddr;
 	unsigned int i, num_stubs;
 
 	num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs);
 
 	/* Find this stub, or if that fails, the next avail. entry */
 	stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
-	for (i = 0; stubs[i].opd.funcaddr; i++) {
+	for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
 		BUG_ON(i >= num_stubs);
 
-		if (stubs[i].opd.funcaddr == opd->funcaddr)
+		if (stub_func_addr(stubs[i].funcdata) == func_addr(addr))
 			return (unsigned long)&stubs[i];
 	}
 
-	if (!create_stub(sechdrs, &stubs[i], opd, me))
+	if (!create_stub(sechdrs, &stubs[i], addr, me))
 		return 0;
 
 	return (unsigned long)&stubs[i];
@@ -324,7 +481,8 @@ static int restore_r2(u32 *instruction, struct module *me)
 		       me->name, *instruction);
 		return 0;
 	}
-	*instruction = 0xe8410028;	/* ld r2,40(r1) */
+	/* ld r2,R2_STACK_OFFSET(r1) */
+	*instruction = 0xe8410000 | R2_STACK_OFFSET;
 	return 1;
 }
 
@@ -342,6 +500,17 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 
 	DEBUGP("Applying ADD relocate section %u to %u\n", relsec,
 	       sechdrs[relsec].sh_info);
+
+	/* First time we're called, we can fix up .TOC. */
+	if (!me->arch.toc_fixed) {
+		sym = find_dot_toc(sechdrs, strtab, symindex);
+		/* It's theoretically possible that a module doesn't want a
+		 * .TOC. so don't fail it just for that. */
+		if (sym)
+			sym->st_value = my_r2(sechdrs, me);
+		me->arch.toc_fixed = true;
+	}
+
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) {
 		/* This is where to make the change */
 		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -386,6 +555,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 				| (value & 0xffff);
 			break;
 
+		case R_PPC64_TOC16_LO:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
 		case R_PPC64_TOC16_DS:
 			/* Subtract TOC pointer */
 			value -= my_r2(sechdrs, me);
@@ -399,6 +576,28 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 				| (value & 0xfffc);
 			break;
 
+		case R_PPC64_TOC16_LO_DS:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if ((value & 3) != 0) {
+				printk("%s: bad TOC16_LO_DS relocation (%lu)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xfffc)
+				| (value & 0xfffc);
+			break;
+
+		case R_PPC64_TOC16_HA:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			value = ((value + 0x8000) >> 16);
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
 		case R_PPC_REL24:
 			/* FIXME: Handle weak symbols here --RR */
 			if (sym->st_shndx == SHN_UNDEF) {
@@ -408,7 +607,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 					return -ENOENT;
 				if (!restore_r2((u32 *)location + 1, me))
 					return -ENOEXEC;
-			}
+			} else
+				value += local_entry_offset(sym);
 
 			/* Convert value to relative */
 			value -= (unsigned long)location;
@@ -429,6 +629,31 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 			*location = value - (unsigned long)location;
 			break;
 
+		case R_PPC64_TOCSAVE:
+			/*
+			 * Marker reloc indicates we don't have to save r2.
+			 * That would only save us one instruction, so ignore
+			 * it.
+			 */
+			break;
+
+		case R_PPC64_REL16_HA:
+			/* Subtract location pointer */
+			value -= (unsigned long)location;
+			value = ((value + 0x8000) >> 16);
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		case R_PPC64_REL16_LO:
+			/* Subtract location pointer */
+			value -= (unsigned long)location;
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
 		default:
 			printk("%s: Unknown ADD relocation: %lu\n",
 			       me->name,
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c
deleted file mode 100644
index fe21b515ca4..00000000000
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Performance counter support for MPC7450-family processors.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/string.h>
-#include <linux/perf_event.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-#define N_COUNTER	6	/* Number of hardware counters */
-#define MAX_ALT		3	/* Maximum number of event alternative codes */
-
-/*
- * Bits in event code for MPC7450 family
- */
-#define PM_THRMULT_MSKS	0x40000
-#define PM_THRESH_SH	12
-#define PM_THRESH_MSK	0x3f
-#define PM_PMC_SH	8
-#define PM_PMC_MSK	7
-#define PM_PMCSEL_MSK	0x7f
-
-/*
- * Classify events according to how specific their PMC requirements are.
- * Result is:
- *	0: can go on any PMC
- *	1: can go on PMCs 1-4
- *	2: can go on PMCs 1,2,4
- *	3: can go on PMCs 1 or 2
- *	4: can only go on one PMC
- *	-1: event code is invalid
- */
-#define N_CLASSES	5
-
-static int mpc7450_classify_event(u32 event)
-{
-	int pmc;
-
-	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
-	if (pmc) {
-		if (pmc > N_COUNTER)
-			return -1;
-		return 4;
-	}
-	event &= PM_PMCSEL_MSK;
-	if (event <= 1)
-		return 0;
-	if (event <= 7)
-		return 1;
-	if (event <= 13)
-		return 2;
-	if (event <= 22)
-		return 3;
-	return -1;
-}
-
-/*
- * Events using threshold and possible threshold scale:
- *	code	scale?	name
- *	11e	N	PM_INSTQ_EXCEED_CYC
- *	11f	N	PM_ALTV_IQ_EXCEED_CYC
- *	128	Y	PM_DTLB_SEARCH_EXCEED_CYC
- *	12b	Y	PM_LD_MISS_EXCEED_L1_CYC
- *	220	N	PM_CQ_EXCEED_CYC
- *	30c	N	PM_GPR_RB_EXCEED_CYC
- *	30d	?	PM_FPR_IQ_EXCEED_CYC ?
- *	311	Y	PM_ITLB_SEARCH_EXCEED
- *	410	N	PM_GPR_IQ_EXCEED_CYC
- */
-
-/*
- * Return use of threshold and threshold scale bits:
- * 0 = uses neither, 1 = uses threshold, 2 = uses both
- */
-static int mpc7450_threshold_use(u32 event)
-{
-	int pmc, sel;
-
-	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
-	sel = event & PM_PMCSEL_MSK;
-	switch (pmc) {
-	case 1:
-		if (sel == 0x1e || sel == 0x1f)
-			return 1;
-		if (sel == 0x28 || sel == 0x2b)
-			return 2;
-		break;
-	case 2:
-		if (sel == 0x20)
-			return 1;
-		break;
-	case 3:
-		if (sel == 0xc || sel == 0xd)
-			return 1;
-		if (sel == 0x11)
-			return 2;
-		break;
-	case 4:
-		if (sel == 0x10)
-			return 1;
-		break;
-	}
-	return 0;
-}
-
-/*
- * Layout of constraint bits:
- * 33222222222211111111110000000000
- * 10987654321098765432109876543210
- *  |<    ><  > < > < ><><><><><><>
- *  TS TV   G4   G3  G2P6P5P4P3P2P1
- *
- * P1 - P6
- *	0 - 11: Count of events needing PMC1 .. PMC6
- *
- * G2
- *	12 - 14: Count of events needing PMC1 or PMC2
- *
- * G3
- *	16 - 18: Count of events needing PMC1, PMC2 or PMC4
- *
- * G4
- *	20 - 23: Count of events needing PMC1, PMC2, PMC3 or PMC4
- *
- * TV
- *	24 - 29: Threshold value requested
- *
- * TS
- *	30: Threshold scale value requested
- */
-
-static u32 pmcbits[N_COUNTER][2] = {
-	{ 0x00844002, 0x00111001 },	/* PMC1 mask, value: P1,G2,G3,G4 */
-	{ 0x00844008, 0x00111004 },	/* PMC2: P2,G2,G3,G4 */
-	{ 0x00800020, 0x00100010 },	/* PMC3: P3,G4 */
-	{ 0x00840080, 0x00110040 },	/* PMC4: P4,G3,G4 */
-	{ 0x00000200, 0x00000100 },	/* PMC5: P5 */
-	{ 0x00000800, 0x00000400 }	/* PMC6: P6 */
-};
-
-static u32 classbits[N_CLASSES - 1][2] = {
-	{ 0x00000000, 0x00000000 },	/* class 0: no constraint */
-	{ 0x00800000, 0x00100000 },	/* class 1: G4 */
-	{ 0x00040000, 0x00010000 },	/* class 2: G3 */
-	{ 0x00004000, 0x00001000 },	/* class 3: G2 */
-};
-
-static int mpc7450_get_constraint(u64 event, unsigned long *maskp,
-				  unsigned long *valp)
-{
-	int pmc, class;
-	u32 mask, value;
-	int thresh, tuse;
-
-	class = mpc7450_classify_event(event);
-	if (class < 0)
-		return -1;
-	if (class == 4) {
-		pmc = ((unsigned int)event >> PM_PMC_SH) & PM_PMC_MSK;
-		mask  = pmcbits[pmc - 1][0];
-		value = pmcbits[pmc - 1][1];
-	} else {
-		mask  = classbits[class][0];
-		value = classbits[class][1];
-	}
-
-	tuse = mpc7450_threshold_use(event);
-	if (tuse) {
-		thresh = ((unsigned int)event >> PM_THRESH_SH) & PM_THRESH_MSK;
-		mask  |= 0x3f << 24;
-		value |= thresh << 24;
-		if (tuse == 2) {
-			mask |= 0x40000000;
-			if ((unsigned int)event & PM_THRMULT_MSKS)
-				value |= 0x40000000;
-		}
-	}
-
-	*maskp = mask;
-	*valp = value;
-	return 0;
-}
-
-static const unsigned int event_alternatives[][MAX_ALT] = {
-	{ 0x217, 0x317 },		/* PM_L1_DCACHE_MISS */
-	{ 0x418, 0x50f, 0x60f },	/* PM_SNOOP_RETRY */
-	{ 0x502, 0x602 },		/* PM_L2_HIT */
-	{ 0x503, 0x603 },		/* PM_L3_HIT */
-	{ 0x504, 0x604 },		/* PM_L2_ICACHE_MISS */
-	{ 0x505, 0x605 },		/* PM_L3_ICACHE_MISS */
-	{ 0x506, 0x606 },		/* PM_L2_DCACHE_MISS */
-	{ 0x507, 0x607 },		/* PM_L3_DCACHE_MISS */
-	{ 0x50a, 0x623 },		/* PM_LD_HIT_L3 */
-	{ 0x50b, 0x624 },		/* PM_ST_HIT_L3 */
-	{ 0x50d, 0x60d },		/* PM_L2_TOUCH_HIT */
-	{ 0x50e, 0x60e },		/* PM_L3_TOUCH_HIT */
-	{ 0x512, 0x612 },		/* PM_INT_LOCAL */
-	{ 0x513, 0x61d },		/* PM_L2_MISS */
-	{ 0x514, 0x61e },		/* PM_L3_MISS */
-};
-
-/*
- * Scan the alternatives table for a match and return the
- * index into the alternatives table if found, else -1.
- */
-static int find_alternative(u32 event)
-{
-	int i, j;
-
-	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
-		if (event < event_alternatives[i][0])
-			break;
-		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
-			if (event == event_alternatives[i][j])
-				return i;
-	}
-	return -1;
-}
-
-static int mpc7450_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
-	int i, j, nalt = 1;
-	u32 ae;
-
-	alt[0] = event;
-	nalt = 1;
-	i = find_alternative((u32)event);
-	if (i >= 0) {
-		for (j = 0; j < MAX_ALT; ++j) {
-			ae = event_alternatives[i][j];
-			if (ae && ae != (u32)event)
-				alt[nalt++] = ae;
-		}
-	}
-	return nalt;
-}
-
-/*
- * Bitmaps of which PMCs each class can use for classes 0 - 3.
- * Bit i is set if PMC i+1 is usable.
- */
-static const u8 classmap[N_CLASSES] = {
-	0x3f, 0x0f, 0x0b, 0x03, 0
-};
-
-/* Bit position and width of each PMCSEL field */
-static const int pmcsel_shift[N_COUNTER] = {
-	6,	0,	27,	22,	17,	11
-};
-static const u32 pmcsel_mask[N_COUNTER] = {
-	0x7f,	0x3f,	0x1f,	0x1f,	0x1f,	0x3f
-};
-
-/*
- * Compute MMCR0/1/2 values for a set of events.
- */
-static int mpc7450_compute_mmcr(u64 event[], int n_ev,
-				unsigned int hwc[], unsigned long mmcr[])
-{
-	u8 event_index[N_CLASSES][N_COUNTER];
-	int n_classevent[N_CLASSES];
-	int i, j, class, tuse;
-	u32 pmc_inuse = 0, pmc_avail;
-	u32 mmcr0 = 0, mmcr1 = 0, mmcr2 = 0;
-	u32 ev, pmc, thresh;
-
-	if (n_ev > N_COUNTER)
-		return -1;
-
-	/* First pass: count usage in each class */
-	for (i = 0; i < N_CLASSES; ++i)
-		n_classevent[i] = 0;
-	for (i = 0; i < n_ev; ++i) {
-		class = mpc7450_classify_event(event[i]);
-		if (class < 0)
-			return -1;
-		j = n_classevent[class]++;
-		event_index[class][j] = i;
-	}
-
-	/* Second pass: allocate PMCs from most specific event to least */
-	for (class = N_CLASSES - 1; class >= 0; --class) {
-		for (i = 0; i < n_classevent[class]; ++i) {
-			ev = event[event_index[class][i]];
-			if (class == 4) {
-				pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
-				if (pmc_inuse & (1 << (pmc - 1)))
-					return -1;
-			} else {
-				/* Find a suitable PMC */
-				pmc_avail = classmap[class] & ~pmc_inuse;
-				if (!pmc_avail)
-					return -1;
-				pmc = ffs(pmc_avail);
-			}
-			pmc_inuse |= 1 << (pmc - 1);
-
-			tuse = mpc7450_threshold_use(ev);
-			if (tuse) {
-				thresh = (ev >> PM_THRESH_SH) & PM_THRESH_MSK;
-				mmcr0 |= thresh << 16;
-				if (tuse == 2 && (ev & PM_THRMULT_MSKS))
-					mmcr2 = 0x80000000;
-			}
-			ev &= pmcsel_mask[pmc - 1];
-			ev <<= pmcsel_shift[pmc - 1];
-			if (pmc <= 2)
-				mmcr0 |= ev;
-			else
-				mmcr1 |= ev;
-			hwc[event_index[class][i]] = pmc - 1;
-		}
-	}
-
-	if (pmc_inuse & 1)
-		mmcr0 |= MMCR0_PMC1CE;
-	if (pmc_inuse & 0x3e)
-		mmcr0 |= MMCR0_PMCnCE;
-
-	/* Return MMCRx values */
-	mmcr[0] = mmcr0;
-	mmcr[1] = mmcr1;
-	mmcr[2] = mmcr2;
-	return 0;
-}
-
-/*
- * Disable counting by a PMC.
- * Note that the pmc argument is 0-based here, not 1-based.
- */
-static void mpc7450_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
-	if (pmc <= 1)
-		mmcr[0] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
-	else
-		mmcr[1] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
-}
-
-static int mpc7450_generic_events[] = {
-	[PERF_COUNT_HW_CPU_CYCLES]		= 1,
-	[PERF_COUNT_HW_INSTRUCTIONS]		= 2,
-	[PERF_COUNT_HW_CACHE_MISSES]		= 0x217, /* PM_L1_DCACHE_MISS */
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x122, /* PM_BR_CMPL */
-	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x41c, /* PM_BR_MPRED */
-};
-
-#define C(x)	PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
-	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0,		0x225	},
-		[C(OP_WRITE)] = {	0,		0x227	},
-		[C(OP_PREFETCH)] = {	0,		0	},
-	},
-	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0x129,		0x115	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	0x634,		0	},
-	},
-	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0,		0	},
-		[C(OP_WRITE)] = {	0,		0	},
-		[C(OP_PREFETCH)] = {	0,		0	},
-	},
-	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0,		0x312	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0,		0x223	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	0x122,		0x41c	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-	[C(NODE)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
-		[C(OP_READ)] = {	-1,		-1	},
-		[C(OP_WRITE)] = {	-1,		-1	},
-		[C(OP_PREFETCH)] = {	-1,		-1	},
-	},
-};
-
-struct power_pmu mpc7450_pmu = {
-	.name			= "MPC7450 family",
-	.n_counter		= N_COUNTER,
-	.max_alternatives	= MAX_ALT,
-	.add_fields		= 0x00111555ul,
-	.test_adder		= 0x00301000ul,
-	.compute_mmcr		= mpc7450_compute_mmcr,
-	.get_constraint		= mpc7450_get_constraint,
-	.get_alternatives	= mpc7450_get_alternatives,
-	.disable_pmc		= mpc7450_disable_pmc,
-	.n_generic		= ARRAY_SIZE(mpc7450_generic_events),
-	.generic_events		= mpc7450_generic_events,
-	.cache_events		= &mpc7450_cache_events,
-};
-
-static int __init init_mpc7450_pmu(void)
-{
-	if (!cur_cpu_spec->oprofile_cpu_type ||
-	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/7450"))
-		return -ENODEV;
-
-	return register_power_pmu(&mpc7450_pmu);
-}
-
-early_initcall(init_mpc7450_pmu);
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c