44 files changed, 7360 insertions, 5948 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f0e6f28427b..756b482f819 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -1,13 +1,27 @@
 config PPC_PSERIES
 	depends on PPC64 && PPC_BOOK3S
 	bool "IBM pSeries & new (POWER5-based) iSeries"
+	select HAVE_PCSPKR_PLATFORM
 	select MPIC
+	select OF_DYNAMIC
+	select PCI_MSI
+	select PPC_XICS
+	select PPC_ICP_NATIVE
+	select PPC_ICP_HV
+	select PPC_ICS_RTAS
 	select PPC_I8259
 	select PPC_RTAS
+	select PPC_RTAS_DAEMON
 	select RTAS_ERROR_LOGGING
 	select PPC_UDBG_16550
 	select PPC_NATIVE
-	select PPC_PCI_CHOICE if EMBEDDED
+	select PPC_PCI_CHOICE if EXPERT
+	select ZLIB_DEFLATE
+	select PPC_DOORBELL
+	select HAVE_CONTEXT_TRACKING
+	select HOTPLUG_CPU if SMP
+	select ARCH_RANDOM
+	select PPC_DOORBELL
 	default y
 
 config PPC_SPLPAR
@@ -20,23 +34,46 @@ config PPC_SPLPAR
 	  processors, that is, which share physical processors between
 	  two or more partitions.
 
-config EEH
-	bool "PCI Extended Error Handling (EEH)" if EMBEDDED
-	depends on PPC_PSERIES && PCI
-	default y if !EMBEDDED
-
 config PSERIES_MSI
        bool
-       depends on PCI_MSI && EEH
+       depends on PCI_MSI && PPC_PSERIES && EEH
        default y
 
+config PSERIES_ENERGY
+	tristate "pSeries energy management capabilities driver"
+	depends on PPC_PSERIES
+	default y
+	help
+	  Provides interface to platform energy management capabilities
+	  on supported PSERIES platforms.
+	  Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list
+	  and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint
+
 config SCANLOG
 	tristate "Scanlog dump interface"
 	depends on RTAS_PROC && PPC_PSERIES
 
+config IO_EVENT_IRQ
+	bool "IO Event Interrupt support"
+	depends on PPC_PSERIES
+	default y
+	help
+	  Select this option, if you want to enable support for IO Event
+	  interrupts. IO event interrupt is a mechanism provided by RTAS
+	  to return information about hardware error and non-error events
+	  which may need OS attention. RTAS returns events for multiple
+	  event types and scopes. Device drivers can register their handlers
+	  to receive events.
+
+	  This option will only enable the IO event platform code. You
+	  will still need to enable or compile the actual drivers
+	  that use this infrastructure to handle IO event interrupts.
+
+	  Say Y if you are unsure.
+
 config LPARCFG
 	bool "LPAR Configuration Data"
-	depends on PPC_PSERIES || PPC_ISERIES
+	depends on PPC_PSERIES
 	help
 	Provide system capacity information via human readable
 	<key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
@@ -44,6 +81,12 @@ config LPARCFG
 config PPC_PSERIES_DEBUG
 	depends on PPC_PSERIES && PPC_EARLY_DEBUG
 	bool "Enable extra debug logging in platforms/pseries"
+        help
+	  Say Y here if you want the pseries core to produce a bunch of
+	  debug messages to the system log. Select this if you are having a
+	  problem with the pseries core and want to see more of what is
+	  going on. This does not enable debugging in lpar.c, which must
+	  be manually done due to its verbosity.
 	default y
 
 config PPC_SMLPAR
@@ -59,7 +102,7 @@ config PPC_SMLPAR
 
 config CMM
 	tristate "Collaborative memory management"
-	depends on PPC_SMLPAR && !CRASH_DUMP
+	depends on PPC_SMLPAR
 	default y
 	help
 	  Select this option, if you want to enable the kernel interface
@@ -69,6 +112,18 @@ config CMM
 	  will be reused for other LPARs. The interface allows firmware to
 	  balance memory across many LPARs.
 
+config HV_PERF_CTRS
+       bool "Hypervisor supplied PMU events (24x7 & GPCI)"
+       default y
+       depends on PERF_EVENTS && PPC_PSERIES
+       help
+	  Enable access to hypervisor supplied counters in perf. Currently,
+	  this enables code that uses the hcall GetPerfCounterInfo and 24x7
+	  interfaces to retrieve counters. GPCI exists on Power 6 and later
+	  systems. 24x7 is available on Power 8 systems.
+
+          If unsure, select Y.
+
 config DTL
 	bool "Dispatch Trace Log"
 	depends on PPC_SPLPAR && DEBUG_FS
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 790c0b872d4..03480796af9 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -1,21 +1,16 @@
-ifeq ($(CONFIG_PPC64),y)
-EXTRA_CFLAGS		+= -mno-minimal-toc
-endif
-
-ifeq ($(CONFIG_PPC_PSERIES_DEBUG),y)
-EXTRA_CFLAGS		+= -DDEBUG
-endif
+ccflags-$(CONFIG_PPC64)			:= $(NO_MINIMAL_TOC)
+ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
 
 obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
-			   setup.o iommu.o ras.o rtasd.o \
-			   firmware.o power.o
+			   setup.o iommu.o event_sources.o ras.o \
+			   firmware.o power.o dlpar.o mobility.o rng.o
 obj-$(CONFIG_SMP)	+= smp.o
-obj-$(CONFIG_XICS)	+= xics.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
-obj-$(CONFIG_EEH)	+= eeh.o eeh_cache.o eeh_driver.o eeh_event.o eeh_sysfs.o
+obj-$(CONFIG_EEH)	+= eeh_pseries.o
 obj-$(CONFIG_KEXEC)	+= kexec.o
 obj-$(CONFIG_PCI)	+= pci.o pci_dlpar.o
 obj-$(CONFIG_PSERIES_MSI)	+= msi.o
+obj-$(CONFIG_PSERIES_ENERGY)	+= pseries_energy.o
 
 obj-$(CONFIG_HOTPLUG_CPU)	+= hotplug-cpu.o
 obj-$(CONFIG_MEMORY_HOTPLUG)	+= hotplug-memory.o
@@ -23,6 +18,11 @@ obj-$(CONFIG_MEMORY_HOTPLUG)	+= hotplug-memory.o
 obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
 obj-$(CONFIG_HVCS)		+= hvcserver.o
 obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
-obj-$(CONFIG_PHYP_DUMP)	+= phyp_dump.o
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_DTL)		+= dtl.o
+obj-$(CONFIG_IO_EVENT_IRQ)	+= io_event_irq.o
+obj-$(CONFIG_LPARCFG)		+= lparcfg.o
+
+ifeq ($(CONFIG_PPC_PSERIES),y)
+obj-$(CONFIG_SUSPEND)		+= suspend.o
+endif
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 6567439fe78..2d8bf15879f 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -24,7 +24,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
-#include <linux/init.h>
+#include <linux/gfp.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/oom.h>
@@ -32,30 +32,38 @@
 #include <linux/sched.h>
 #include <linux/stringify.h>
 #include <linux/swap.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/firmware.h>
 #include <asm/hvcall.h>
 #include <asm/mmu.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
-
-#include "plpar_wrappers.h"
+#include <linux/memory.h>
+#include <asm/plpar_wrappers.h>
 
 #define CMM_DRIVER_VERSION	"1.0.0"
 #define CMM_DEFAULT_DELAY	1
+#define CMM_HOTPLUG_DELAY	5
 #define CMM_DEBUG			0
 #define CMM_DISABLE		0
 #define CMM_OOM_KB		1024
 #define CMM_MIN_MEM_MB		256
 #define KB2PAGES(_p)		((_p)>>(PAGE_SHIFT-10))
 #define PAGES2KB(_p)		((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI	1
+#define CMM_MEM_ISOLATE_PRI	15
 
 static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
 static unsigned int oom_kb = CMM_OOM_KB;
 static unsigned int cmm_debug = CMM_DEBUG;
 static unsigned int cmm_disabled = CMM_DISABLE;
 static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
-static struct sys_device cmm_sysdev;
+static struct device cmm_dev;
 
 MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
 MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
@@ -65,6 +73,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION);
 module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
 		 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+		 "before loaning resumes. "
+		 "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
 module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
 		 "[Default=" __stringify(CMM_OOM_KB) "]");
@@ -92,6 +104,9 @@ static unsigned long oom_freed_pages;
 static struct cmm_page_array *cmm_page_list;
 static DEFINE_SPINLOCK(cmm_lock);
 
+static DEFINE_MUTEX(hotplug_mutex);
+static int hotplug_occurred; /* protected by the hotplug mutex */
+
 static struct task_struct *cmm_thread_ptr;
 
 /**
@@ -110,6 +125,17 @@ static long cmm_alloc_pages(long nr)
 	cmm_dbg("Begin request for %ld pages\n", nr);
 
 	while (nr) {
+		/* Exit if a hotplug operation is in progress or occurred */
+		if (mutex_trylock(&hotplug_mutex)) {
+			if (hotplug_occurred) {
+				mutex_unlock(&hotplug_mutex);
+				break;
+			}
+			mutex_unlock(&hotplug_mutex);
+		} else {
+			break;
+		}
+
 		addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
 				       __GFP_NORETRY | __GFP_NOMEMALLOC);
 		if (!addr)
@@ -119,8 +145,9 @@ static long cmm_alloc_pages(long nr)
 		if (!pa || pa->index >= CMM_NR_PAGES) {
 			/* Need a new page for the page list. */
 			spin_unlock(&cmm_lock);
-			npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
-								       __GFP_NORETRY | __GFP_NOMEMALLOC);
+			npa = (struct cmm_page_array *)__get_free_page(
+					GFP_NOIO | __GFP_NOWARN |
+					__GFP_NORETRY | __GFP_NOMEMALLOC);
 			if (!npa) {
 				pr_info("%s: Can not allocate new page list\n", __func__);
 				free_page(addr);
@@ -229,8 +256,9 @@ static void cmm_get_mpp(void)
 {
 	int rc;
 	struct hvcall_mpp_data mpp_data;
-	unsigned long active_pages_target;
-	signed long page_loan_request;
+	signed long active_pages_target, page_loan_request, target;
+	signed long total_pages = totalram_pages + loaned_pages;
+	signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
 
 	rc = h_get_mpp(&mpp_data);
 
@@ -238,17 +266,25 @@ static void cmm_get_mpp(void)
 		return;
 
 	page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
-	loaned_pages_target = page_loan_request + loaned_pages;
-	if (loaned_pages_target > oom_freed_pages)
-		loaned_pages_target -= oom_freed_pages;
+	target = page_loan_request + (signed long)loaned_pages;
+
+	if (target < 0 || total_pages < min_mem_pages)
+		target = 0;
+
+	if (target > oom_freed_pages)
+		target -= oom_freed_pages;
 	else
-		loaned_pages_target = 0;
+		target = 0;
+
+	active_pages_target = total_pages - target;
+
+	if (min_mem_pages > active_pages_target)
+		target = total_pages - min_mem_pages;
 
-	active_pages_target = totalram_pages + loaned_pages - loaned_pages_target;
+	if (target < 0)
+		target = 0;
 
-	if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE))
-		loaned_pages_target = totalram_pages + loaned_pages -
-			((min_mem_mb * 1024 * 1024) / PAGE_SIZE);
+	loaned_pages_target = target;
 
 	cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
 		page_loan_request, loaned_pages, loaned_pages_target,
@@ -273,9 +309,28 @@ static int cmm_thread(void *dummy)
 	while (1) {
 		timeleft = msleep_interruptible(delay * 1000);
 
-		if (kthread_should_stop() || timeleft) {
-			loaned_pages_target = loaned_pages;
+		if (kthread_should_stop() || timeleft)
 			break;
+
+		if (mutex_trylock(&hotplug_mutex)) {
+			if (hotplug_occurred) {
+				hotplug_occurred = 0;
+				mutex_unlock(&hotplug_mutex);
+				cmm_dbg("Hotplug operation has occurred, "
+						"loaning activity suspended "
+						"for %d seconds.\n",
+						hotplug_delay);
+				timeleft = msleep_interruptible(hotplug_delay *
+						1000);
+				if (kthread_should_stop() || timeleft)
+					break;
+				continue;
+			}
+			mutex_unlock(&hotplug_mutex);
+		} else {
+			cmm_dbg("Hotplug operation in progress, activity "
+					"suspended\n");
+			continue;
 		}
 
 		cmm_get_mpp();
@@ -290,25 +345,25 @@ static int cmm_thread(void *dummy)
 }
 
 #define CMM_SHOW(name, format, args...)			\
-	static ssize_t show_##name(struct sys_device *dev,	\
-				   struct sysdev_attribute *attr,	\
+	static ssize_t show_##name(struct device *dev,	\
+				   struct device_attribute *attr,	\
 				   char *buf)			\
 	{							\
 		return sprintf(buf, format, ##args);		\
 	}							\
-	static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
 CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
 CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
 
-static ssize_t show_oom_pages(struct sys_device *dev,
-			      struct sysdev_attribute *attr, char *buf)
+static ssize_t show_oom_pages(struct device *dev,
+			      struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
 }
 
-static ssize_t store_oom_pages(struct sys_device *dev,
-			       struct sysdev_attribute *attr,
+static ssize_t store_oom_pages(struct device *dev,
+			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
 	unsigned long val = simple_strtoul (buf, NULL, 10);
@@ -322,17 +377,18 @@ static ssize_t store_oom_pages(struct sys_device *dev,
 	return count;
 }
 
-static SYSDEV_ATTR(oom_freed_kb, S_IWUSR| S_IRUGO,
+static DEVICE_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO,
 		   show_oom_pages, store_oom_pages);
 
-static struct sysdev_attribute *cmm_attrs[] = {
-	&attr_loaned_kb,
-	&attr_loaned_target_kb,
-	&attr_oom_freed_kb,
+static struct device_attribute *cmm_attrs[] = {
+	&dev_attr_loaned_kb,
+	&dev_attr_loaned_target_kb,
+	&dev_attr_oom_freed_kb,
 };
 
-static struct sysdev_class cmm_sysdev_class = {
+static struct bus_type cmm_subsys = {
 	.name = "cmm",
+	.dev_name = "cmm",
 };
 
 /**
@@ -341,21 +397,21 @@ static struct sysdev_class cmm_sysdev_class = {
  * Return value:
  * 	0 on success / other on failure
  **/
-static int cmm_sysfs_register(struct sys_device *sysdev)
+static int cmm_sysfs_register(struct device *dev)
 {
 	int i, rc;
 
-	if ((rc = sysdev_class_register(&cmm_sysdev_class)))
+	if ((rc = subsys_system_register(&cmm_subsys, NULL)))
 		return rc;
 
-	sysdev->id = 0;
-	sysdev->cls = &cmm_sysdev_class;
+	dev->id = 0;
+	dev->bus = &cmm_subsys;
 
-	if ((rc = sysdev_register(sysdev)))
-		goto class_unregister;
+	if ((rc = device_register(dev)))
+		goto subsys_unregister;
 
 	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
-		if ((rc = sysdev_create_file(sysdev, cmm_attrs[i])))
+		if ((rc = device_create_file(dev, cmm_attrs[i])))
 			goto fail;
 	}
 
@@ -363,10 +419,10 @@ static int cmm_sysfs_register(struct sys_device *sysdev)
 
 fail:
 	while (--i >= 0)
-		sysdev_remove_file(sysdev, cmm_attrs[i]);
-	sysdev_unregister(sysdev);
-class_unregister:
-	sysdev_class_unregister(&cmm_sysdev_class);
+		device_remove_file(dev, cmm_attrs[i]);
+	device_unregister(dev);
+subsys_unregister:
+	bus_unregister(&cmm_subsys);
 	return rc;
 }
 
@@ -374,14 +430,14 @@ class_unregister:
  * cmm_unregister_sysfs - Unregister from sysfs
  *
  **/
-static void cmm_unregister_sysfs(struct sys_device *sysdev)
+static void cmm_unregister_sysfs(struct device *dev)
 {
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
-		sysdev_remove_file(sysdev, cmm_attrs[i]);
-	sysdev_unregister(sysdev);
-	sysdev_class_unregister(&cmm_sysdev_class);
+		device_remove_file(dev, cmm_attrs[i]);
+	device_unregister(dev);
+	bus_unregister(&cmm_subsys);
 }
 
 /**
@@ -405,6 +461,183 @@ static struct notifier_block cmm_reboot_nb = {
 };
 
 /**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ *      0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+	struct memory_isolate_notify *marg = arg;
+	struct cmm_page_array *pa;
+	unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+	unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+	unsigned long idx;
+
+	spin_lock(&cmm_lock);
+	pa = cmm_page_list;
+	while (pa) {
+		if ((unsigned long)pa >= start && (unsigned long)pa < end)
+			marg->pages_found++;
+		for (idx = 0; idx < pa->index; idx++)
+			if (pa->page[idx] >= start && pa->page[idx] < end)
+				marg->pages_found++;
+		pa = pa->next;
+	}
+	spin_unlock(&cmm_lock);
+	return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self:	notifier block struct
+ * @action:	action to take
+ * @arg:	struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ *	NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+				 unsigned long action, void *arg)
+{
+	int ret = 0;
+
+	if (action == MEM_ISOLATE_COUNT)
+		ret = cmm_count_pages(arg);
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+	.notifier_call = cmm_memory_isolate_cb,
+	.priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ *	0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+	struct memory_notify *marg = arg;
+	unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+	unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+	struct cmm_page_array *pa_curr, *pa_last, *npa;
+	unsigned long idx;
+	unsigned long freed = 0;
+
+	cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+			start_page, marg->nr_pages);
+	spin_lock(&cmm_lock);
+
+	/* Search the page list for pages in the range to be offlined */
+	pa_last = pa_curr = cmm_page_list;
+	while (pa_curr) {
+		for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+			if ((pa_curr->page[idx] < start_page) ||
+			    (pa_curr->page[idx] >= end_page))
+				continue;
+
+			plpar_page_set_active(__pa(pa_curr->page[idx]));
+			free_page(pa_curr->page[idx]);
+			freed++;
+			loaned_pages--;
+			totalram_pages++;
+			pa_curr->page[idx] = pa_last->page[--pa_last->index];
+			if (pa_last->index == 0) {
+				if (pa_curr == pa_last)
+					pa_curr = pa_last->next;
+				pa_last = pa_last->next;
+				free_page((unsigned long)cmm_page_list);
+				cmm_page_list = pa_last;
+				continue;
+			}
+		}
+		pa_curr = pa_curr->next;
+	}
+
+	/* Search for page list structures in the range to be offlined */
+	pa_last = NULL;
+	pa_curr = cmm_page_list;
+	while (pa_curr) {
+		if (((unsigned long)pa_curr >= start_page) &&
+				((unsigned long)pa_curr < end_page)) {
+			npa = (struct cmm_page_array *)__get_free_page(
+					GFP_NOIO | __GFP_NOWARN |
+					__GFP_NORETRY | __GFP_NOMEMALLOC);
+			if (!npa) {
+				spin_unlock(&cmm_lock);
+				cmm_dbg("Failed to allocate memory for list "
+						"management. Memory hotplug "
+						"failed.\n");
+				return ENOMEM;
+			}
+			memcpy(npa, pa_curr, PAGE_SIZE);
+			if (pa_curr == cmm_page_list)
+				cmm_page_list = npa;
+			if (pa_last)
+				pa_last->next = npa;
+			free_page((unsigned long) pa_curr);
+			freed++;
+			pa_curr = npa;
+		}
+
+		pa_last = pa_curr;
+		pa_curr = pa_curr->next;
+	}
+
+	spin_unlock(&cmm_lock);
+	cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+	return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self:	notifier block struct
+ * @action:	action to take
+ * @arg:	struct memory_notify data for handler
+ *
+ * Return value:
+ *	NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+			unsigned long action, void *arg)
+{
+	int ret = 0;
+
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		mutex_lock(&hotplug_mutex);
+		hotplug_occurred = 1;
+		ret = cmm_mem_going_offline(arg);
+		break;
+	case MEM_OFFLINE:
+	case MEM_CANCEL_OFFLINE:
+		mutex_unlock(&hotplug_mutex);
+		cmm_dbg("Memory offline operation complete.\n");
+		break;
+	case MEM_GOING_ONLINE:
+	case MEM_ONLINE:
+	case MEM_CANCEL_ONLINE:
+		break;
+	}
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_nb = {
+	.notifier_call = cmm_memory_cb,
+	.priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
  * cmm_init - Module initialization
  *
  * Return value:
@@ -423,22 +656,28 @@ static int cmm_init(void)
 	if ((rc = register_reboot_notifier(&cmm_reboot_nb)))
 		goto out_oom_notifier;
 
-	if ((rc = cmm_sysfs_register(&cmm_sysdev)))
+	if ((rc = cmm_sysfs_register(&cmm_dev)))
 		goto out_reboot_notifier;
 
+	if (register_memory_notifier(&cmm_mem_nb) ||
+	    register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+		goto out_unregister_notifier;
+
 	if (cmm_disabled)
 		return rc;
 
 	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
 	if (IS_ERR(cmm_thread_ptr)) {
 		rc = PTR_ERR(cmm_thread_ptr);
-		goto out_unregister_sysfs;
+		goto out_unregister_notifier;
 	}
 
 	return rc;
 
-out_unregister_sysfs:
-	cmm_unregister_sysfs(&cmm_sysdev);
+out_unregister_notifier:
+	unregister_memory_notifier(&cmm_mem_nb);
+	unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+	cmm_unregister_sysfs(&cmm_dev);
 out_reboot_notifier:
 	unregister_reboot_notifier(&cmm_reboot_nb);
 out_oom_notifier:
@@ -458,8 +697,10 @@ static void cmm_exit(void)
 		kthread_stop(cmm_thread_ptr);
 	unregister_oom_notifier(&cmm_oom_nb);
 	unregister_reboot_notifier(&cmm_reboot_nb);
+	unregister_memory_notifier(&cmm_mem_nb);
+	unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
 	cmm_free_pages(loaned_pages);
-	cmm_unregister_sysfs(&cmm_sysdev);
+	cmm_unregister_sysfs(&cmm_dev);
 }
 
 /**
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
new file mode 100644
index 00000000000..2d0b4d68a40
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -0,0 +1,542 @@
+/*
+ * Support for dynamic reconfiguration for PCI, Memory, and CPU
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms.
+ *
+ * Copyright (C) 2009 Nathan Fontenot
+ * Copyright (C) 2009 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include "offline_states.h"
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+
+struct cc_workarea {
+	u32	drc_index;
+	u32	zero;
+	u32	name_offset;
+	u32	prop_length;
+	u32	prop_offset;
+};
+
+void dlpar_free_cc_property(struct property *prop)
+{
+	kfree(prop->name);
+	kfree(prop->value);
+	kfree(prop);
+}
+
+static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
+{
+	struct property *prop;
+	char *name;
+	char *value;
+
+	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+	if (!prop)
+		return NULL;
+
+	name = (char *)ccwa + ccwa->name_offset;
+	prop->name = kstrdup(name, GFP_KERNEL);
+
+	prop->length = ccwa->prop_length;
+	value = (char *)ccwa + ccwa->prop_offset;
+	prop->value = kmemdup(value, prop->length, GFP_KERNEL);
+	if (!prop->value) {
+		dlpar_free_cc_property(prop);
+		return NULL;
+	}
+
+	return prop;
+}
+
+static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa,
+					       const char *path)
+{
+	struct device_node *dn;
+	char *name;
+
+	/* If parent node path is "/" advance path to NULL terminator to
+	 * prevent double leading slashs in full_name.
+	 */
+	if (!path[1])
+		path++;
+
+	dn = kzalloc(sizeof(*dn), GFP_KERNEL);
+	if (!dn)
+		return NULL;
+
+	name = (char *)ccwa + ccwa->name_offset;
+	dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name);
+	if (!dn->full_name) {
+		kfree(dn);
+		return NULL;
+	}
+
+	of_node_set_flag(dn, OF_DYNAMIC);
+	of_node_init(dn);
+
+	return dn;
+}
+
+static void dlpar_free_one_cc_node(struct device_node *dn)
+{
+	struct property *prop;
+
+	while (dn->properties) {
+		prop = dn->properties;
+		dn->properties = prop->next;
+		dlpar_free_cc_property(prop);
+	}
+
+	kfree(dn->full_name);
+	kfree(dn);
+}
+
+void dlpar_free_cc_nodes(struct device_node *dn)
+{
+	if (dn->child)
+		dlpar_free_cc_nodes(dn->child);
+
+	if (dn->sibling)
+		dlpar_free_cc_nodes(dn->sibling);
+
+	dlpar_free_one_cc_node(dn);
+}
+
+#define COMPLETE	0
+#define NEXT_SIBLING    1
+#define NEXT_CHILD      2
+#define NEXT_PROPERTY   3
+#define PREV_PARENT     4
+#define MORE_MEMORY     5
+#define CALL_AGAIN	-2
+#define ERR_CFG_USE     -9003
+
+struct device_node *dlpar_configure_connector(u32 drc_index,
+					      struct device_node *parent)
+{
+	struct device_node *dn;
+	struct device_node *first_dn = NULL;
+	struct device_node *last_dn = NULL;
+	struct property *property;
+	struct property *last_property = NULL;
+	struct cc_workarea *ccwa;
+	char *data_buf;
+	const char *parent_path = parent->full_name;
+	int cc_token;
+	int rc = -1;
+
+	cc_token = rtas_token("ibm,configure-connector");
+	if (cc_token == RTAS_UNKNOWN_SERVICE)
+		return NULL;
+
+	data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!data_buf)
+		return NULL;
+
+	ccwa = (struct cc_workarea *)&data_buf[0];
+	ccwa->drc_index = drc_index;
+	ccwa->zero = 0;
+
+	do {
+		/* Since we release the rtas_data_buf lock between configure
+		 * connector calls we want to re-populate the rtas_data_buffer
+		 * with the contents of the previous call.
+		 */
+		spin_lock(&rtas_data_buf_lock);
+
+		memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
+		rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
+		memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+		spin_unlock(&rtas_data_buf_lock);
+
+		switch (rc) {
+		case COMPLETE:
+			break;
+
+		case NEXT_SIBLING:
+			dn = dlpar_parse_cc_node(ccwa, parent_path);
+			if (!dn)
+				goto cc_error;
+
+			dn->parent = last_dn->parent;
+			last_dn->sibling = dn;
+			last_dn = dn;
+			break;
+
+		case NEXT_CHILD:
+			if (first_dn)
+				parent_path = last_dn->full_name;
+
+			dn = dlpar_parse_cc_node(ccwa, parent_path);
+			if (!dn)
+				goto cc_error;
+
+			if (!first_dn) {
+				dn->parent = parent;
+				first_dn = dn;
+			} else {
+				dn->parent = last_dn;
+				if (last_dn)
+					last_dn->child = dn;
+			}
+
+			last_dn = dn;
+			break;
+
+		case NEXT_PROPERTY:
+			property = dlpar_parse_cc_property(ccwa);
+			if (!property)
+				goto cc_error;
+
+			if (!last_dn->properties)
+				last_dn->properties = property;
+			else
+				last_property->next = property;
+
+			last_property = property;
+			break;
+
+		case PREV_PARENT:
+			last_dn = last_dn->parent;
+			parent_path = last_dn->parent->full_name;
+			break;
+
+		case CALL_AGAIN:
+			break;
+
+		case MORE_MEMORY:
+		case ERR_CFG_USE:
+		default:
+			printk(KERN_ERR "Unexpected Error (%d) "
+			       "returned from configure-connector\n", rc);
+			goto cc_error;
+		}
+	} while (rc);
+
+cc_error:
+	kfree(data_buf);
+
+	if (rc) {
+		if (first_dn)
+			dlpar_free_cc_nodes(first_dn);
+
+		return NULL;
+	}
+
+	return first_dn;
+}
+
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent;
+	char *last_slash;
+
+	last_slash = strrchr(path, '/');
+	if (last_slash == path) {
+		parent = of_find_node_by_path("/");
+	} else {
+		char *parent_path;
+		int parent_path_len = last_slash - path + 1;
+		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+		if (!parent_path)
+			return NULL;
+
+		strlcpy(parent_path, path, parent_path_len);
+		parent = of_find_node_by_path(parent_path);
+		kfree(parent_path);
+	}
+
+	return parent;
+}
+
+int dlpar_attach_node(struct device_node *dn)
+{
+	int rc;
+
+	dn->parent = derive_parent(dn->full_name);
+	if (!dn->parent)
+		return -ENOMEM;
+
+	rc = of_attach_node(dn);
+	if (rc) {
+		printk(KERN_ERR "Failed to add device node %s\n",
+		       dn->full_name);
+		return rc;
+	}
+
+	of_node_put(dn->parent);
+	return 0;
+}
+
+int dlpar_detach_node(struct device_node *dn)
+{
+	struct device_node *child;
+	int rc;
+
+	child = of_get_next_child(dn, NULL);
+	while (child) {
+		dlpar_detach_node(child);
+		child = of_get_next_child(dn, child);
+	}
+
+	rc = of_detach_node(dn);
+	if (rc)
+		return rc;
+
+	of_node_put(dn); /* Must decrement the refcount */
+	return 0;
+}
+
+#define DR_ENTITY_SENSE		9003
+#define DR_ENTITY_PRESENT	1
+#define DR_ENTITY_UNUSABLE	2
+#define ALLOCATION_STATE	9003
+#define ALLOC_UNUSABLE		0
+#define ALLOC_USABLE		1
+#define ISOLATION_STATE		9001
+#define ISOLATE			0
+#define UNISOLATE		1
+
+int dlpar_acquire_drc(u32 drc_index)
+{
+	int dr_status, rc;
+
+	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+		       DR_ENTITY_SENSE, drc_index);
+	if (rc || dr_status != DR_ENTITY_UNUSABLE)
+		return -1;
+
+	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
+	if (rc)
+		return rc;
+
+	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+	if (rc) {
+		rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+		return rc;
+	}
+
+	return 0;
+}
+
+int dlpar_release_drc(u32 drc_index)
+{
+	int dr_status, rc;
+
+	rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+		       DR_ENTITY_SENSE, drc_index);
+	if (rc || dr_status != DR_ENTITY_PRESENT)
+		return -1;
+
+	rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
+	if (rc)
+		return rc;
+
+	rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+	if (rc) {
+		rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+		return rc;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
+static int dlpar_online_cpu(struct device_node *dn)
+{
+	int rc = 0;
+	unsigned int cpu;
+	int len, nthreads, i;
+	const u32 *intserv;
+
+	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return -EINVAL;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+			BUG_ON(get_cpu_current_state(cpu)
+					!= CPU_STATE_OFFLINE);
+			cpu_maps_update_done();
+			rc = cpu_up(cpu);
+			if (rc)
+				goto out;
+			cpu_maps_update_begin();
+
+			break;
+		}
+		if (cpu == num_possible_cpus())
+			printk(KERN_WARNING "Could not find cpu to online "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	cpu_maps_update_done();
+
+out:
+	return rc;
+
+}
+
+static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+{
+	struct device_node *dn, *parent;
+	unsigned long drc_index;
+	int rc;
+
+	rc = strict_strtoul(buf, 0, &drc_index);
+	if (rc)
+		return -EINVAL;
+
+	parent = of_find_node_by_path("/cpus");
+	if (!parent)
+		return -ENODEV;
+
+	dn = dlpar_configure_connector(drc_index, parent);
+	if (!dn)
+		return -EINVAL;
+
+	of_node_put(parent);
+
+	rc = dlpar_acquire_drc(drc_index);
+	if (rc) {
+		dlpar_free_cc_nodes(dn);
+		return -EINVAL;
+	}
+
+	rc = dlpar_attach_node(dn);
+	if (rc) {
+		dlpar_release_drc(drc_index);
+		dlpar_free_cc_nodes(dn);
+		return rc;
+	}
+
+	rc = dlpar_online_cpu(dn);
+	if (rc)
+		return rc;
+
+	return count;
+}
+
+static int dlpar_offline_cpu(struct device_node *dn)
+{
+	int rc = 0;
+	unsigned int cpu;
+	int len, nthreads, i;
+	const u32 *intserv;
+
+	intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return -EINVAL;
+
+	nthreads = len / sizeof(u32);
+
+	cpu_maps_update_begin();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+
+			if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+				break;
+
+			if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
+				set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+				cpu_maps_update_done();
+				rc = cpu_down(cpu);
+				if (rc)
+					goto out;
+				cpu_maps_update_begin();
+				break;
+
+			}
+
+			/*
+			 * The cpu is in CPU_STATE_INACTIVE.
+			 * Upgrade it's state to CPU_STATE_OFFLINE.
+			 */
+			set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+			BUG_ON(plpar_hcall_norets(H_PROD, intserv[i])
+								!= H_SUCCESS);
+			__cpu_die(cpu);
+			break;
+		}
+		if (cpu == num_possible_cpus())
+			printk(KERN_WARNING "Could not find cpu to offline "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	cpu_maps_update_done();
+
+out:
+	return rc;
+
+}
+
+static ssize_t dlpar_cpu_release(const char *buf, size_t count)
+{
+	struct device_node *dn;
+	const u32 *drc_index;
+	int rc;
+
+	dn = of_find_node_by_path(buf);
+	if (!dn)
+		return -EINVAL;
+
+	drc_index = of_get_property(dn, "ibm,my-drc-index", NULL);
+	if (!drc_index) {
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	rc = dlpar_offline_cpu(dn);
+	if (rc) {
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	rc = dlpar_release_drc(*drc_index);
+	if (rc) {
+		of_node_put(dn);
+		return rc;
+	}
+
+	rc = dlpar_detach_node(dn);
+	if (rc) {
+		dlpar_acquire_drc(*drc_index);
+		return rc;
+	}
+
+	of_node_put(dn);
+
+	return count;
+}
+
+static int __init pseries_dlpar_init(void)
+{
+	ppc_md.cpu_probe = dlpar_cpu_probe;
+	ppc_md.cpu_release = dlpar_cpu_release;
+
+	return 0;
+}
+machine_device_initcall(pseries, pseries_dlpar_init);
+
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index ab69925d579..7d61498e45c 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -20,32 +20,15 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/debugfs.h>
+#include <linux/spinlock.h>
 #include <asm/smp.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/firmware.h>
-
-#include "plpar_wrappers.h"
-
-/*
- * Layout of entries in the hypervisor's DTL buffer. Although we don't
- * actually access the internals of an entry (we only need to know the size),
- * we might as well define it here for reference.
- */
-struct dtl_entry {
-	u8	dispatch_reason;
-	u8	preempt_reason;
-	u16	processor_id;
-	u32	enqueue_to_dispatch_time;
-	u32	ready_to_enqueue_time;
-	u32	waiting_to_ready_time;
-	u64	timebase;
-	u64	fault_addr;
-	u64	srr0;
-	u64	srr1;
-};
+#include <asm/lppaca.h>
+#include <asm/debug.h>
+#include <asm/plpar_wrappers.h>
 
 struct dtl {
 	struct dtl_entry	*buf;
@@ -53,8 +36,9 @@ struct dtl {
 	int			cpu;
 	int			buf_entries;
 	u64			last_idx;
+	spinlock_t		lock;
 };
-static DEFINE_PER_CPU(struct dtl, dtl);
+static DEFINE_PER_CPU(struct dtl, cpu_dtl);
 
 /*
  * Dispatch trace log event mask:
@@ -66,34 +50,106 @@ static u8 dtl_event_mask = 0x7;
 
 
 /*
- * Size of per-cpu log buffers. Default is just under 16 pages worth.
+ * Size of per-cpu log buffers. Firmware requires that the buffer does
+ * not cross a 4k boundary.
  */
-static int dtl_buf_entries = (16 * 85);
+static int dtl_buf_entries = N_DISPATCH_LOG;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+struct dtl_ring {
+	u64	write_index;
+	struct dtl_entry *write_ptr;
+	struct dtl_entry *buf;
+	struct dtl_entry *buf_end;
+	u8	saved_dtl_mask;
+};
 
+static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
 
-static int dtl_enable(struct dtl *dtl)
+static atomic_t dtl_count;
+
+/*
+ * The cpu accounting code controls the DTL ring buffer, and we get
+ * given entries as they are processed.
+ */
+static void consume_dtle(struct dtl_entry *dtle, u64 index)
 {
-	unsigned long addr;
-	int ret, hwcpu;
+	struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings);
+	struct dtl_entry *wp = dtlr->write_ptr;
+	struct lppaca *vpa = local_paca->lppaca_ptr;
 
-	/* only allow one reader */
-	if (dtl->buf)
-		return -EBUSY;
+	if (!wp)
+		return;
 
-	/* we need to store the original allocation size for use during read */
-	dtl->buf_entries = dtl_buf_entries;
+	*wp = *dtle;
+	barrier();
 
-	dtl->buf = kmalloc_node(dtl->buf_entries * sizeof(struct dtl_entry),
-			GFP_KERNEL, cpu_to_node(dtl->cpu));
-	if (!dtl->buf) {
-		printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
-				__func__, dtl->cpu);
-		return -ENOMEM;
-	}
+	/* check for hypervisor ring buffer overflow, ignore this entry if so */
+	if (index + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx))
+		return;
+
+	++wp;
+	if (wp == dtlr->buf_end)
+		wp = dtlr->buf;
+	dtlr->write_ptr = wp;
+
+	/* incrementing write_index makes the new entry visible */
+	smp_wmb();
+	++dtlr->write_index;
+}
+
+static int dtl_start(struct dtl *dtl)
+{
+	struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+	dtlr->buf = dtl->buf;
+	dtlr->buf_end = dtl->buf + dtl->buf_entries;
+	dtlr->write_index = 0;
+
+	/* setting write_ptr enables logging into our buffer */
+	smp_wmb();
+	dtlr->write_ptr = dtl->buf;
+
+	/* enable event logging */
+	dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
+	lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
+
+	dtl_consumer = consume_dtle;
+	atomic_inc(&dtl_count);
+	return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+	struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+	dtlr->write_ptr = NULL;
+	smp_wmb();
+
+	dtlr->buf = NULL;
+
+	/* restore dtl_enable_mask */
+	lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
+
+	if (atomic_dec_and_test(&dtl_count))
+		dtl_consumer = NULL;
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+	return per_cpu(dtl_rings, dtl->cpu).write_index;
+}
+
+#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+static int dtl_start(struct dtl *dtl)
+{
+	unsigned long addr;
+	int ret, hwcpu;
 
 	/* Register our dtl buffer with the hypervisor. The HV expects the
 	 * buffer size to be passed in the second word of the buffer */
-	((u32 *)dtl->buf)[1] = dtl->buf_entries * sizeof(struct dtl_entry);
+	((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES;
 
 	hwcpu = get_hard_smp_processor_id(dtl->cpu);
 	addr = __pa(dtl->buf);
@@ -101,34 +157,84 @@ static int dtl_enable(struct dtl *dtl)
 	if (ret) {
 		printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) "
 		       "failed with %d\n", __func__, dtl->cpu, hwcpu, ret);
-		kfree(dtl->buf);
 		return -EIO;
 	}
 
 	/* set our initial buffer indices */
-	dtl->last_idx = lppaca[dtl->cpu].dtl_idx = 0;
+	lppaca_of(dtl->cpu).dtl_idx = 0;
 
 	/* ensure that our updates to the lppaca fields have occurred before
 	 * we actually enable the logging */
 	smp_wmb();
 
 	/* enable event logging */
-	lppaca[dtl->cpu].dtl_enable_mask = dtl_event_mask;
+	lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask;
 
 	return 0;
 }
 
-static void dtl_disable(struct dtl *dtl)
+static void dtl_stop(struct dtl *dtl)
 {
 	int hwcpu = get_hard_smp_processor_id(dtl->cpu);
 
-	lppaca[dtl->cpu].dtl_enable_mask = 0x0;
+	lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
+
+	unregister_dtl(hwcpu);
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+	return lppaca_of(dtl->cpu).dtl_idx;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+static int dtl_enable(struct dtl *dtl)
+{
+	long int n_entries;
+	long int rc;
+	struct dtl_entry *buf = NULL;
 
-	unregister_dtl(hwcpu, __pa(dtl->buf));
+	if (!dtl_cache)
+		return -ENOMEM;
 
-	kfree(dtl->buf);
+	/* only allow one reader */
+	if (dtl->buf)
+		return -EBUSY;
+
+	n_entries = dtl_buf_entries;
+	buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu));
+	if (!buf) {
+		printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
+				__func__, dtl->cpu);
+		return -ENOMEM;
+	}
+
+	spin_lock(&dtl->lock);
+	rc = -EBUSY;
+	if (!dtl->buf) {
+		/* store the original allocation size for use during read */
+		dtl->buf_entries = n_entries;
+		dtl->buf = buf;
+		dtl->last_idx = 0;
+		rc = dtl_start(dtl);
+		if (rc)
+			dtl->buf = NULL;
+	}
+	spin_unlock(&dtl->lock);
+
+	if (rc)
+		kmem_cache_free(dtl_cache, buf);
+	return rc;
+}
+
+static void dtl_disable(struct dtl *dtl)
+{
+	spin_lock(&dtl->lock);
+	dtl_stop(dtl);
+	kmem_cache_free(dtl_cache, dtl->buf);
 	dtl->buf = NULL;
 	dtl->buf_entries = 0;
+	spin_unlock(&dtl->lock);
 }
 
 /* file interface */
@@ -156,8 +262,9 @@ static int dtl_file_release(struct inode *inode, struct file *filp)
 static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
 		loff_t *pos)
 {
-	int rc, cur_idx, last_idx, n_read, n_req, read_size;
+	long int rc, n_read, n_req, read_size;
 	struct dtl *dtl;
+	u64 cur_idx, last_idx, i;
 
 	if ((len % sizeof(struct dtl_entry)) != 0)
 		return -EINVAL;
@@ -170,46 +277,53 @@ static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
 	/* actual number of entries read */
 	n_read = 0;
 
-	cur_idx = lppaca[dtl->cpu].dtl_idx;
+	spin_lock(&dtl->lock);
+
+	cur_idx = dtl_current_index(dtl);
 	last_idx = dtl->last_idx;
 
-	if (cur_idx - last_idx > dtl->buf_entries) {
-		pr_debug("%s: hv buffer overflow for cpu %d, samples lost\n",
-				__func__, dtl->cpu);
-	}
+	if (last_idx + dtl->buf_entries <= cur_idx)
+		last_idx = cur_idx - dtl->buf_entries + 1;
+
+	if (last_idx + n_req > cur_idx)
+		n_req = cur_idx - last_idx;
+
+	if (n_req > 0)
+		dtl->last_idx = last_idx + n_req;
+
+	spin_unlock(&dtl->lock);
+
+	if (n_req <= 0)
+		return 0;
 
-	cur_idx  %= dtl->buf_entries;
-	last_idx %= dtl->buf_entries;
+	i = last_idx % dtl->buf_entries;
 
 	/* read the tail of the buffer if we've wrapped */
-	if (last_idx > cur_idx) {
-		read_size = min(n_req, dtl->buf_entries - last_idx);
+	if (i + n_req > dtl->buf_entries) {
+		read_size = dtl->buf_entries - i;
 
-		rc = copy_to_user(buf, &dtl->buf[last_idx],
+		rc = copy_to_user(buf, &dtl->buf[i],
 				read_size * sizeof(struct dtl_entry));
 		if (rc)
 			return -EFAULT;
 
-		last_idx = 0;
+		i = 0;
 		n_req -= read_size;
 		n_read += read_size;
 		buf += read_size * sizeof(struct dtl_entry);
 	}
 
 	/* .. and now the head */
-	read_size = min(n_req, cur_idx - last_idx);
-	rc = copy_to_user(buf, &dtl->buf[last_idx],
-			read_size * sizeof(struct dtl_entry));
+	rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
 	if (rc)
 		return -EFAULT;
 
-	n_read += read_size;
-	dtl->last_idx += n_read;
+	n_read += n_req;
 
 	return n_read * sizeof(struct dtl_entry);
 }
 
-static struct file_operations dtl_fops = {
+static const struct file_operations dtl_fops = {
 	.open		= dtl_file_open,
 	.release	= dtl_file_release,
 	.read		= dtl_file_read,
@@ -251,7 +365,7 @@ static int dtl_init(void)
 
 	event_mask_file = debugfs_create_x8("dtl_event_mask", 0600,
 				dtl_dir, &dtl_event_mask);
-	buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0600,
+	buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400,
 				dtl_dir, &dtl_buf_entries);
 
 	if (!event_mask_file || !buf_entries_file) {
@@ -261,7 +375,8 @@ static int dtl_init(void)
 
 	/* set up the per-cpu log structures */
 	for_each_possible_cpu(i) {
-		struct dtl *dtl = &per_cpu(dtl, i);
+		struct dtl *dtl = &per_cpu(cpu_dtl, i);
+		spin_lock_init(&dtl->lock);
 		dtl->cpu = i;
 
 		rc = dtl_setup_file(dtl);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
deleted file mode 100644
index 989d6462c15..00000000000
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ /dev/null
@@ -1,1282 +0,0 @@
-/*
- * eeh.c
- * Copyright IBM Corporation 2001, 2005, 2006
- * Copyright Dave Engebretsen & Todd Inglett 2001
- * Copyright Linas Vepstas 2005, 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
- */
-
-#undef DEBUG
-
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/pci.h>
-#include <linux/proc_fs.h>
-#include <linux/rbtree.h>
-#include <linux/seq_file.h>
-#include <linux/spinlock.h>
-#include <linux/of.h>
-
-#include <asm/atomic.h>
-#include <asm/eeh.h>
-#include <asm/eeh_event.h>
-#include <asm/io.h>
-#include <asm/machdep.h>
-#include <asm/ppc-pci.h>
-#include <asm/rtas.h>
-
-
-/** Overview:
- *  EEH, or "Extended Error Handling" is a PCI bridge technology for
- *  dealing with PCI bus errors that can't be dealt with within the
- *  usual PCI framework, except by check-stopping the CPU.  Systems
- *  that are designed for high-availability/reliability cannot afford
- *  to crash due to a "mere" PCI error, thus the need for EEH.
- *  An EEH-capable bridge operates by converting a detected error
- *  into a "slot freeze", taking the PCI adapter off-line, making
- *  the slot behave, from the OS'es point of view, as if the slot
- *  were "empty": all reads return 0xff's and all writes are silently
- *  ignored.  EEH slot isolation events can be triggered by parity
- *  errors on the address or data busses (e.g. during posted writes),
- *  which in turn might be caused by low voltage on the bus, dust,
- *  vibration, humidity, radioactivity or plain-old failed hardware.
- *
- *  Note, however, that one of the leading causes of EEH slot
- *  freeze events are buggy device drivers, buggy device microcode,
- *  or buggy device hardware.  This is because any attempt by the
- *  device to bus-master data to a memory address that is not
- *  assigned to the device will trigger a slot freeze.   (The idea
- *  is to prevent devices-gone-wild from corrupting system memory).
- *  Buggy hardware/drivers will have a miserable time co-existing
- *  with EEH.
- *
- *  Ideally, a PCI device driver, when suspecting that an isolation
- *  event has occured (e.g. by reading 0xff's), will then ask EEH
- *  whether this is the case, and then take appropriate steps to
- *  reset the PCI slot, the PCI device, and then resume operations.
- *  However, until that day,  the checking is done here, with the
- *  eeh_check_failure() routine embedded in the MMIO macros.  If
- *  the slot is found to be isolated, an "EEH Event" is synthesized
- *  and sent out for processing.
- */
-
-/* If a device driver keeps reading an MMIO register in an interrupt
- * handler after a slot isolation event, it might be broken.
- * This sets the threshold for how many read attempts we allow
- * before printing an error message.
- */
-#define EEH_MAX_FAILS	2100000
-
-/* Time to wait for a PCI slot to report status, in milliseconds */
-#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
-
-/* RTAS tokens */
-static int ibm_set_eeh_option;
-static int ibm_set_slot_reset;
-static int ibm_read_slot_reset_state;
-static int ibm_read_slot_reset_state2;
-static int ibm_slot_error_detail;
-static int ibm_get_config_addr_info;
-static int ibm_get_config_addr_info2;
-static int ibm_configure_bridge;
-
-int eeh_subsystem_enabled;
-EXPORT_SYMBOL(eeh_subsystem_enabled);
-
-/* Lock to avoid races due to multiple reports of an error */
-static DEFINE_SPINLOCK(confirm_error_lock);
-
-/* Buffer for reporting slot-error-detail rtas calls. Its here
- * in BSS, and not dynamically alloced, so that it ends up in
- * RMO where RTAS can access it.
- */
-static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
-static DEFINE_SPINLOCK(slot_errbuf_lock);
-static int eeh_error_buf_size;
-
-/* Buffer for reporting pci register dumps. Its here in BSS, and
- * not dynamically alloced, so that it ends up in RMO where RTAS
- * can access it.
- */
-#define EEH_PCI_REGS_LOG_LEN 4096
-static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
-
-/* System monitoring statistics */
-static unsigned long no_device;
-static unsigned long no_dn;
-static unsigned long no_cfg_addr;
-static unsigned long ignored_check;
-static unsigned long total_mmio_ffs;
-static unsigned long false_positives;
-static unsigned long slot_resets;
-
-#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
-
-/* --------------------------------------------------------------- */
-/* Below lies the EEH event infrastructure */
-
-static void rtas_slot_error_detail(struct pci_dn *pdn, int severity,
-                                   char *driver_log, size_t loglen)
-{
-	int config_addr;
-	unsigned long flags;
-	int rc;
-
-	/* Log the error with the rtas logger */
-	spin_lock_irqsave(&slot_errbuf_lock, flags);
-	memset(slot_errbuf, 0, eeh_error_buf_size);
-
-	/* Use PE configuration address, if present */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
-
-	rc = rtas_call(ibm_slot_error_detail,
-	               8, 1, NULL, config_addr,
-	               BUID_HI(pdn->phb->buid),
-	               BUID_LO(pdn->phb->buid),
-	               virt_to_phys(driver_log), loglen,
-	               virt_to_phys(slot_errbuf),
-	               eeh_error_buf_size,
-	               severity);
-
-	if (rc == 0)
-		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
-	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
-}
-
-/**
- * gather_pci_data - copy assorted PCI config space registers to buff
- * @pdn: device to report data for
- * @buf: point to buffer in which to log
- * @len: amount of room in buffer
- *
- * This routine captures assorted PCI configuration space data,
- * and puts them into a buffer for RTAS error logging.
- */
-static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len)
-{
-	struct pci_dev *dev = pdn->pcidev;
-	u32 cfg;
-	int cap, i;
-	int n = 0;
-
-	n += scnprintf(buf+n, len-n, "%s\n", pdn->node->full_name);
-	printk(KERN_WARNING "EEH: of node=%s\n", pdn->node->full_name);
-
-	rtas_read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
-	n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-	printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
-
-	rtas_read_config(pdn, PCI_COMMAND, 4, &cfg);
-	n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
-	printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg);
-
-	if (!dev) {
-		printk(KERN_WARNING "EEH: no PCI device for this of node\n");
-		return n;
-	}
-
-	/* Gather bridge-specific registers */
-	if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
-		rtas_read_config(pdn, PCI_SEC_STATUS, 2, &cfg);
-		n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
-		printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg);
-
-		rtas_read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg);
-		n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
-		printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg);
-	}
-
-	/* Dump out the PCI-X command and status regs */
-	cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
-	if (cap) {
-		rtas_read_config(pdn, cap, 4, &cfg);
-		n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
-		printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg);
-
-		rtas_read_config(pdn, cap+4, 4, &cfg);
-		n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
-		printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg);
-	}
-
-	/* If PCI-E capable, dump PCI-E cap 10, and the AER */
-	cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
-	if (cap) {
-		n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
-		printk(KERN_WARNING
-		       "EEH: PCI-E capabilities and status follow:\n");
-
-		for (i=0; i<=8; i++) {
-			rtas_read_config(pdn, cap+4*i, 4, &cfg);
-			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
-			printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg);
-		}
-
-		cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
-		if (cap) {
-			n += scnprintf(buf+n, len-n, "pci-e AER:\n");
-			printk(KERN_WARNING
-			       "EEH: PCI-E AER capability register set follows:\n");
-
-			for (i=0; i<14; i++) {
-				rtas_read_config(pdn, cap+4*i, 4, &cfg);
-				n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
-				printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg);
-			}
-		}
-	}
-
-	/* Gather status on devices under the bridge */
-	if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
-		struct device_node *dn;
-
-		for_each_child_of_node(pdn->node, dn) {
-			pdn = PCI_DN(dn);
-			if (pdn)
-				n += gather_pci_data(pdn, buf+n, len-n);
-		}
-	}
-
-	return n;
-}
-
-void eeh_slot_error_detail(struct pci_dn *pdn, int severity)
-{
-	size_t loglen = 0;
-	pci_regs_buf[0] = 0;
-
-	rtas_pci_enable(pdn, EEH_THAW_MMIO);
-	loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN);
-
-	rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen);
-}
-
-/**
- * read_slot_reset_state - Read the reset state of a device node's slot
- * @dn: device node to read
- * @rets: array to return results in
- */
-static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
-{
-	int token, outputs;
-	int config_addr;
-
-	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
-		token = ibm_read_slot_reset_state2;
-		outputs = 4;
-	} else {
-		token = ibm_read_slot_reset_state;
-		rets[2] = 0; /* fake PE Unavailable info */
-		outputs = 3;
-	}
-
-	/* Use PE configuration address, if present */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
-
-	return rtas_call(token, 3, outputs, rets, config_addr,
-			 BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid));
-}
-
-/**
- * eeh_wait_for_slot_status - returns error status of slot
- * @pdn pci device node
- * @max_wait_msecs maximum number to millisecs to wait
- *
- * Return negative value if a permanent error, else return
- * Partition Endpoint (PE) status value.
- *
- * If @max_wait_msecs is positive, then this routine will
- * sleep until a valid status can be obtained, or until
- * the max allowed wait time is exceeded, in which case
- * a -2 is returned.
- */
-int
-eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs)
-{
-	int rc;
-	int rets[3];
-	int mwait;
-
-	while (1) {
-		rc = read_slot_reset_state(pdn, rets);
-		if (rc) return rc;
-		if (rets[1] == 0) return -1;  /* EEH is not supported */
-
-		if (rets[0] != 5) return rets[0]; /* return actual status */
-
-		if (rets[2] == 0) return -1; /* permanently unavailable */
-
-		if (max_wait_msecs <= 0) break;
-
-		mwait = rets[2];
-		if (mwait <= 0) {
-			printk (KERN_WARNING
-			        "EEH: Firmware returned bad wait value=%d\n", mwait);
-			mwait = 1000;
-		} else if (mwait > 300*1000) {
-			printk (KERN_WARNING
-			        "EEH: Firmware is taking too long, time=%d\n", mwait);
-			mwait = 300*1000;
-		}
-		max_wait_msecs -= mwait;
-		msleep (mwait);
-	}
-
-	printk(KERN_WARNING "EEH: Timed out waiting for slot status\n");
-	return -2;
-}
-
-/**
- * eeh_token_to_phys - convert EEH address token to phys address
- * @token i/o token, should be address in the form 0xA....
- */
-static inline unsigned long eeh_token_to_phys(unsigned long token)
-{
-	pte_t *ptep;
-	unsigned long pa;
-
-	ptep = find_linux_pte(init_mm.pgd, token);
-	if (!ptep)
-		return token;
-	pa = pte_pfn(*ptep) << PAGE_SHIFT;
-
-	return pa | (token & (PAGE_SIZE-1));
-}
-
-/** 
- * Return the "partitionable endpoint" (pe) under which this device lies
- */
-struct device_node * find_device_pe(struct device_node *dn)
-{
-	while ((dn->parent) && PCI_DN(dn->parent) &&
-	      (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
-		dn = dn->parent;
-	}
-	return dn;
-}
-
-/** Mark all devices that are children of this device as failed.
- *  Mark the device driver too, so that it can see the failure
- *  immediately; this is critical, since some drivers poll
- *  status registers in interrupts ... If a driver is polling,
- *  and the slot is frozen, then the driver can deadlock in
- *  an interrupt context, which is bad.
- */
-
-static void __eeh_mark_slot(struct device_node *parent, int mode_flag)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (PCI_DN(dn)) {
-			/* Mark the pci device driver too */
-			struct pci_dev *dev = PCI_DN(dn)->pcidev;
-
-			PCI_DN(dn)->eeh_mode |= mode_flag;
-
-			if (dev && dev->driver)
-				dev->error_state = pci_channel_io_frozen;
-
-			__eeh_mark_slot(dn, mode_flag);
-		}
-	}
-}
-
-void eeh_mark_slot (struct device_node *dn, int mode_flag)
-{
-	struct pci_dev *dev;
-	dn = find_device_pe (dn);
-
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
-		dn = dn->parent;
-
-	PCI_DN(dn)->eeh_mode |= mode_flag;
-
-	/* Mark the pci device too */
-	dev = PCI_DN(dn)->pcidev;
-	if (dev)
-		dev->error_state = pci_channel_io_frozen;
-
-	__eeh_mark_slot(dn, mode_flag);
-}
-
-static void __eeh_clear_slot(struct device_node *parent, int mode_flag)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (PCI_DN(dn)) {
-			PCI_DN(dn)->eeh_mode &= ~mode_flag;
-			PCI_DN(dn)->eeh_check_count = 0;
-			__eeh_clear_slot(dn, mode_flag);
-		}
-	}
-}
-
-void eeh_clear_slot (struct device_node *dn, int mode_flag)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&confirm_error_lock, flags);
-	
-	dn = find_device_pe (dn);
-	
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
-		dn = dn->parent;
-
-	PCI_DN(dn)->eeh_mode &= ~mode_flag;
-	PCI_DN(dn)->eeh_check_count = 0;
-	__eeh_clear_slot(dn, mode_flag);
-	spin_unlock_irqrestore(&confirm_error_lock, flags);
-}
-
-/**
- * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
- * @dn device node
- * @dev pci device, if known
- *
- * Check for an EEH failure for the given device node.  Call this
- * routine if the result of a read was all 0xff's and you want to
- * find out if this is due to an EEH slot freeze.  This routine
- * will query firmware for the EEH status.
- *
- * Returns 0 if there has not been an EEH error; otherwise returns
- * a non-zero value and queues up a slot isolation event notification.
- *
- * It is safe to call this routine in an interrupt context.
- */
-int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
-{
-	int ret;
-	int rets[3];
-	unsigned long flags;
-	struct pci_dn *pdn;
-	int rc = 0;
-	const char *location;
-
-	total_mmio_ffs++;
-
-	if (!eeh_subsystem_enabled)
-		return 0;
-
-	if (!dn) {
-		no_dn++;
-		return 0;
-	}
-	dn = find_device_pe(dn);
-	pdn = PCI_DN(dn);
-
-	/* Access to IO BARs might get this far and still not want checking. */
-	if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
-	    pdn->eeh_mode & EEH_MODE_NOCHECK) {
-		ignored_check++;
-		pr_debug("EEH: Ignored check (%x) for %s %s\n",
-			 pdn->eeh_mode, pci_name (dev), dn->full_name);
-		return 0;
-	}
-
-	if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) {
-		no_cfg_addr++;
-		return 0;
-	}
-
-	/* If we already have a pending isolation event for this
-	 * slot, we know it's bad already, we don't need to check.
-	 * Do this checking under a lock; as multiple PCI devices
-	 * in one slot might report errors simultaneously, and we
-	 * only want one error recovery routine running.
-	 */
-	spin_lock_irqsave(&confirm_error_lock, flags);
-	rc = 1;
-	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
-		pdn->eeh_check_count ++;
-		if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) {
-			location = of_get_property(dn, "ibm,loc-code", NULL);
-			printk (KERN_ERR "EEH: %d reads ignored for recovering device at "
-				"location=%s driver=%s pci addr=%s\n",
-				pdn->eeh_check_count, location,
-				dev->driver->name, pci_name(dev));
-			printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n",
-				dev->driver->name);
-			dump_stack();
-		}
-		goto dn_unlock;
-	}
-
-	/*
-	 * Now test for an EEH failure.  This is VERY expensive.
-	 * Note that the eeh_config_addr may be a parent device
-	 * in the case of a device behind a bridge, or it may be
-	 * function zero of a multi-function device.
-	 * In any case they must share a common PHB.
-	 */
-	ret = read_slot_reset_state(pdn, rets);
-
-	/* If the call to firmware failed, punt */
-	if (ret != 0) {
-		printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
-		       ret, dn->full_name);
-		false_positives++;
-		pdn->eeh_false_positives ++;
-		rc = 0;
-		goto dn_unlock;
-	}
-
-	/* Note that config-io to empty slots may fail;
-	 * they are empty when they don't have children. */
-	if ((rets[0] == 5) && (rets[2] == 0) && (dn->child == NULL)) {
-		false_positives++;
-		pdn->eeh_false_positives ++;
-		rc = 0;
-		goto dn_unlock;
-	}
-
-	/* If EEH is not supported on this device, punt. */
-	if (rets[1] != 1) {
-		printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
-		       ret, dn->full_name);
-		false_positives++;
-		pdn->eeh_false_positives ++;
-		rc = 0;
-		goto dn_unlock;
-	}
-
-	/* If not the kind of error we know about, punt. */
-	if (rets[0] != 1 && rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
-		false_positives++;
-		pdn->eeh_false_positives ++;
-		rc = 0;
-		goto dn_unlock;
-	}
-
-	slot_resets++;
- 
-	/* Avoid repeated reports of this failure, including problems
-	 * with other functions on this device, and functions under
-	 * bridges. */
-	eeh_mark_slot (dn, EEH_MODE_ISOLATED);
-	spin_unlock_irqrestore(&confirm_error_lock, flags);
-
-	eeh_send_failure_event (dn, dev);
-
-	/* Most EEH events are due to device driver bugs.  Having
-	 * a stack trace will help the device-driver authors figure
-	 * out what happened.  So print that out. */
-	dump_stack();
-	return 1;
-
-dn_unlock:
-	spin_unlock_irqrestore(&confirm_error_lock, flags);
-	return rc;
-}
-
-EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
-
-/**
- * eeh_check_failure - check if all 1's data is due to EEH slot freeze
- * @token i/o token, should be address in the form 0xA....
- * @val value, should be all 1's (XXX why do we need this arg??)
- *
- * Check for an EEH failure at the given token address.  Call this
- * routine if the result of a read was all 0xff's and you want to
- * find out if this is due to an EEH slot freeze event.  This routine
- * will query firmware for the EEH status.
- *
- * Note this routine is safe to call in an interrupt context.
- */
-unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
-{
-	unsigned long addr;
-	struct pci_dev *dev;
-	struct device_node *dn;
-
-	/* Finding the phys addr + pci device; this is pretty quick. */
-	addr = eeh_token_to_phys((unsigned long __force) token);
-	dev = pci_get_device_by_addr(addr);
-	if (!dev) {
-		no_device++;
-		return val;
-	}
-
-	dn = pci_device_to_OF_node(dev);
-	eeh_dn_check_failure (dn, dev);
-
-	pci_dev_put(dev);
-	return val;
-}
-
-EXPORT_SYMBOL(eeh_check_failure);
-
-/* ------------------------------------------------------------- */
-/* The code below deals with error recovery */
-
-/**
- * rtas_pci_enable - enable MMIO or DMA transfers for this slot
- * @pdn pci device node
- */
-
-int
-rtas_pci_enable(struct pci_dn *pdn, int function)
-{
-	int config_addr;
-	int rc;
-
-	/* Use PE configuration address, if present */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
-
-	rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
-	               config_addr,
-	               BUID_HI(pdn->phb->buid),
-	               BUID_LO(pdn->phb->buid),
-		            function);
-
-	if (rc)
-		printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n",
-		        function, rc, pdn->node->full_name);
-
-	rc = eeh_wait_for_slot_status (pdn, PCI_BUS_RESET_WAIT_MSEC);
-	if ((rc == 4) && (function == EEH_THAW_MMIO))
-		return 0;
-
-	return rc;
-}
-
-/**
- * rtas_pci_slot_reset - raises/lowers the pci #RST line
- * @pdn pci device node
- * @state: 1/0 to raise/lower the #RST
- *
- * Clear the EEH-frozen condition on a slot.  This routine
- * asserts the PCI #RST line if the 'state' argument is '1',
- * and drops the #RST line if 'state is '0'.  This routine is
- * safe to call in an interrupt context.
- *
- */
-
-static void
-rtas_pci_slot_reset(struct pci_dn *pdn, int state)
-{
-	int config_addr;
-	int rc;
-
-	BUG_ON (pdn==NULL); 
-
-	if (!pdn->phb) {
-		printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",
-		        pdn->node->full_name);
-		return;
-	}
-
-	/* Use PE configuration address, if present */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
-
-	rc = rtas_call(ibm_set_slot_reset,4,1, NULL,
-	               config_addr,
-	               BUID_HI(pdn->phb->buid),
-	               BUID_LO(pdn->phb->buid),
-	               state);
-	if (rc)
-		printk (KERN_WARNING "EEH: Unable to reset the failed slot,"
-		        " (%d) #RST=%d dn=%s\n",
-		        rc, state, pdn->node->full_name);
-}
-
-/**
- * pcibios_set_pcie_slot_reset - Set PCI-E reset state
- * @dev:	pci device struct
- * @state:	reset state to enter
- *
- * Return value:
- * 	0 if success
- **/
-int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
-{
-	struct device_node *dn = pci_device_to_OF_node(dev);
-	struct pci_dn *pdn = PCI_DN(dn);
-
-	switch (state) {
-	case pcie_deassert_reset:
-		rtas_pci_slot_reset(pdn, 0);
-		break;
-	case pcie_hot_reset:
-		rtas_pci_slot_reset(pdn, 1);
-		break;
-	case pcie_warm_reset:
-		rtas_pci_slot_reset(pdn, 3);
-		break;
-	default:
-		return -EINVAL;
-	};
-
-	return 0;
-}
-
-/**
- * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
- * @pdn: pci device node to be reset.
- *
- *  Return 0 if success, else a non-zero value.
- */
-
-static void __rtas_set_slot_reset(struct pci_dn *pdn)
-{
-	rtas_pci_slot_reset (pdn, 1);
-
-	/* The PCI bus requires that the reset be held high for at least
-	 * a 100 milliseconds. We wait a bit longer 'just in case'.  */
-
-#define PCI_BUS_RST_HOLD_TIME_MSEC 250
-	msleep (PCI_BUS_RST_HOLD_TIME_MSEC);
-	
-	/* We might get hit with another EEH freeze as soon as the 
-	 * pci slot reset line is dropped. Make sure we don't miss
-	 * these, and clear the flag now. */
-	eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED);
-
-	rtas_pci_slot_reset (pdn, 0);
-
-	/* After a PCI slot has been reset, the PCI Express spec requires
-	 * a 1.5 second idle time for the bus to stabilize, before starting
-	 * up traffic. */
-#define PCI_BUS_SETTLE_TIME_MSEC 1800
-	msleep (PCI_BUS_SETTLE_TIME_MSEC);
-}
-
-int rtas_set_slot_reset(struct pci_dn *pdn)
-{
-	int i, rc;
-
-	/* Take three shots at resetting the bus */
-	for (i=0; i<3; i++) {
-		__rtas_set_slot_reset(pdn);
-
-		rc = eeh_wait_for_slot_status(pdn, PCI_BUS_RESET_WAIT_MSEC);
-		if (rc == 0)
-			return 0;
-
-		if (rc < 0) {
-			printk(KERN_ERR "EEH: unrecoverable slot failure %s\n",
-			       pdn->node->full_name);
-			return -1;
-		}
-		printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n",
-		       i+1, pdn->node->full_name, rc);
-	}
-
-	return -1;
-}
-
-/* ------------------------------------------------------- */
-/** Save and restore of PCI BARs
- *
- * Although firmware will set up BARs during boot, it doesn't
- * set up device BAR's after a device reset, although it will,
- * if requested, set up bridge configuration. Thus, we need to
- * configure the PCI devices ourselves.  
- */
-
-/**
- * __restore_bars - Restore the Base Address Registers
- * @pdn: pci device node
- *
- * Loads the PCI configuration space base address registers,
- * the expansion ROM base address, the latency timer, and etc.
- * from the saved values in the device node.
- */
-static inline void __restore_bars (struct pci_dn *pdn)
-{
-	int i;
-	u32 cmd;
-
-	if (NULL==pdn->phb) return;
-	for (i=4; i<10; i++) {
-		rtas_write_config(pdn, i*4, 4, pdn->config_space[i]);
-	}
-
-	/* 12 == Expansion ROM Address */
-	rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]);
-
-#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
-#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)])
-
-	rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1,
-	            SAVED_BYTE(PCI_CACHE_LINE_SIZE));
-
-	rtas_write_config (pdn, PCI_LATENCY_TIMER, 1,
-	            SAVED_BYTE(PCI_LATENCY_TIMER));
-
-	/* max latency, min grant, interrupt pin and line */
-	rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]);
-
-	/* Restore PERR & SERR bits, some devices require it,
-	   don't touch the other command bits */
-	rtas_read_config(pdn, PCI_COMMAND, 4, &cmd);
-	if (pdn->config_space[1] & PCI_COMMAND_PARITY)
-		cmd |= PCI_COMMAND_PARITY;
-	else
-		cmd &= ~PCI_COMMAND_PARITY;
-	if (pdn->config_space[1] & PCI_COMMAND_SERR)
-		cmd |= PCI_COMMAND_SERR;
-	else
-		cmd &= ~PCI_COMMAND_SERR;
-	rtas_write_config(pdn, PCI_COMMAND, 4, cmd);
-}
-
-/**
- * eeh_restore_bars - restore the PCI config space info
- *
- * This routine performs a recursive walk to the children
- * of this device as well.
- */
-void eeh_restore_bars(struct pci_dn *pdn)
-{
-	struct device_node *dn;
-	if (!pdn) 
-		return;
-	
-	if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code))
-		__restore_bars (pdn);
-
-	for_each_child_of_node(pdn->node, dn)
-		eeh_restore_bars (PCI_DN(dn));
-}
-
-/**
- * eeh_save_bars - save device bars
- *
- * Save the values of the device bars. Unlike the restore
- * routine, this routine is *not* recursive. This is because
- * PCI devices are added individuallly; but, for the restore,
- * an entire slot is reset at a time.
- */
-static void eeh_save_bars(struct pci_dn *pdn)
-{
-	int i;
-
-	if (!pdn )
-		return;
-	
-	for (i = 0; i < 16; i++)
-		rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]);
-}
-
-void
-rtas_configure_bridge(struct pci_dn *pdn)
-{
-	int config_addr;
-	int rc;
-
-	/* Use PE configuration address, if present */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
-
-	rc = rtas_call(ibm_configure_bridge,3,1, NULL,
-	               config_addr,
-	               BUID_HI(pdn->phb->buid),
-	               BUID_LO(pdn->phb->buid));
-	if (rc) {
-		printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
-		        rc, pdn->node->full_name);
-	}
-}
-
-/* ------------------------------------------------------------- */
-/* The code below deals with enabling EEH for devices during  the
- * early boot sequence.  EEH must be enabled before any PCI probing
- * can be done.
- */
-
-#define EEH_ENABLE 1
-
-struct eeh_early_enable_info {
-	unsigned int buid_hi;
-	unsigned int buid_lo;
-};
-
-static int get_pe_addr (int config_addr,
-                        struct eeh_early_enable_info *info)
-{
-	unsigned int rets[3];
-	int ret;
-
-	/* Use latest config-addr token on power6 */
-	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
-		/* Make sure we have a PE in hand */
-		ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets,
-			config_addr, info->buid_hi, info->buid_lo, 1);
-		if (ret || (rets[0]==0))
-			return 0;
-
-		ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets,
-			config_addr, info->buid_hi, info->buid_lo, 0);
-		if (ret)
-			return 0;
-		return rets[0];
-	}
-
-	/* Use older config-addr token on power5 */
-	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
-		ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets,
-			config_addr, info->buid_hi, info->buid_lo, 0);
-		if (ret)
-			return 0;
-		return rets[0];
-	}
-	return 0;
-}
-
-/* Enable eeh for the given device node. */
-static void *early_enable_eeh(struct device_node *dn, void *data)
-{
-	unsigned int rets[3];
-	struct eeh_early_enable_info *info = data;
-	int ret;
-	const u32 *class_code = of_get_property(dn, "class-code", NULL);
-	const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL);
-	const u32 *device_id = of_get_property(dn, "device-id", NULL);
-	const u32 *regs;
-	int enable;
-	struct pci_dn *pdn = PCI_DN(dn);
-
-	pdn->class_code = 0;
-	pdn->eeh_mode = 0;
-	pdn->eeh_check_count = 0;
-	pdn->eeh_freeze_count = 0;
-	pdn->eeh_false_positives = 0;
-
-	if (!of_device_is_available(dn))
-		return NULL;
-
-	/* Ignore bad nodes. */
-	if (!class_code || !vendor_id || !device_id)
-		return NULL;
-
-	/* There is nothing to check on PCI to ISA bridges */
-	if (dn->type && !strcmp(dn->type, "isa")) {
-		pdn->eeh_mode |= EEH_MODE_NOCHECK;
-		return NULL;
-	}
-	pdn->class_code = *class_code;
-
-	/* Ok... see if this device supports EEH.  Some do, some don't,
-	 * and the only way to find out is to check each and every one. */
-	regs = of_get_property(dn, "reg", NULL);
-	if (regs) {
-		/* First register entry is addr (00BBSS00)  */
-		/* Try to enable eeh */
-		ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
-		                regs[0], info->buid_hi, info->buid_lo,
-		                EEH_ENABLE);
-
-		enable = 0;
-		if (ret == 0) {
-			pdn->eeh_config_addr = regs[0];
-
-			/* If the newer, better, ibm,get-config-addr-info is supported, 
-			 * then use that instead. */
-			pdn->eeh_pe_config_addr = get_pe_addr(pdn->eeh_config_addr, info);
-
-			/* Some older systems (Power4) allow the
-			 * ibm,set-eeh-option call to succeed even on nodes
-			 * where EEH is not supported. Verify support
-			 * explicitly. */
-			ret = read_slot_reset_state(pdn, rets);
-			if ((ret == 0) && (rets[1] == 1))
-				enable = 1;
-		}
-
-		if (enable) {
-			eeh_subsystem_enabled = 1;
-			pdn->eeh_mode |= EEH_MODE_SUPPORTED;
-
-			pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n",
-				 dn->full_name, pdn->eeh_config_addr,
-				 pdn->eeh_pe_config_addr);
-		} else {
-
-			/* This device doesn't support EEH, but it may have an
-			 * EEH parent, in which case we mark it as supported. */
-			if (dn->parent && PCI_DN(dn->parent)
-			    && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
-				/* Parent supports EEH. */
-				pdn->eeh_mode |= EEH_MODE_SUPPORTED;
-				pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr;
-				return NULL;
-			}
-		}
-	} else {
-		printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",
-		       dn->full_name);
-	}
-
-	eeh_save_bars(pdn);
-	return NULL;
-}
-
-/*
- * Initialize EEH by trying to enable it for all of the adapters in the system.
- * As a side effect we can determine here if eeh is supported at all.
- * Note that we leave EEH on so failed config cycles won't cause a machine
- * check.  If a user turns off EEH for a particular adapter they are really
- * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
- * grant access to a slot if EEH isn't enabled, and so we always enable
- * EEH for all slots/all devices.
- *
- * The eeh-force-off option disables EEH checking globally, for all slots.
- * Even if force-off is set, the EEH hardware is still enabled, so that
- * newer systems can boot.
- */
-void __init eeh_init(void)
-{
-	struct device_node *phb, *np;
-	struct eeh_early_enable_info info;
-
-	spin_lock_init(&confirm_error_lock);
-	spin_lock_init(&slot_errbuf_lock);
-
-	np = of_find_node_by_path("/rtas");
-	if (np == NULL)
-		return;
-
-	ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
-	ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
-	ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
-	ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
-	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
-	ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
-	ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
-	ibm_configure_bridge = rtas_token ("ibm,configure-bridge");
-
-	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
-		return;
-
-	eeh_error_buf_size = rtas_token("rtas-error-log-max");
-	if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
-		eeh_error_buf_size = 1024;
-	}
-	if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
-		printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "
-		      "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
-		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
-	}
-
-	/* Enable EEH for all adapters.  Note that eeh requires buid's */
-	for (phb = of_find_node_by_name(NULL, "pci"); phb;
-	     phb = of_find_node_by_name(phb, "pci")) {
-		unsigned long buid;
-
-		buid = get_phb_buid(phb);
-		if (buid == 0 || PCI_DN(phb) == NULL)
-			continue;
-
-		info.buid_lo = BUID_LO(buid);
-		info.buid_hi = BUID_HI(buid);
-		traverse_pci_devices(phb, early_enable_eeh, &info);
-	}
-
-	if (eeh_subsystem_enabled)
-		printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");
-	else
-		printk(KERN_WARNING "EEH: No capable adapters found\n");
-}
-
-/**
- * eeh_add_device_early - enable EEH for the indicated device_node
- * @dn: device node for which to set up EEH
- *
- * This routine must be used to perform EEH initialization for PCI
- * devices that were added after system boot (e.g. hotplug, dlpar).
- * This routine must be called before any i/o is performed to the
- * adapter (inluding any config-space i/o).
- * Whether this actually enables EEH or not for this device depends
- * on the CEC architecture, type of the device, on earlier boot
- * command-line arguments & etc.
- */
-static void eeh_add_device_early(struct device_node *dn)
-{
-	struct pci_controller *phb;
-	struct eeh_early_enable_info info;
-
-	if (!dn || !PCI_DN(dn))
-		return;
-	phb = PCI_DN(dn)->phb;
-
-	/* USB Bus children of PCI devices will not have BUID's */
-	if (NULL == phb || 0 == phb->buid)
-		return;
-
-	info.buid_hi = BUID_HI(phb->buid);
-	info.buid_lo = BUID_LO(phb->buid);
-	early_enable_eeh(dn, &info);
-}
-
-void eeh_add_device_tree_early(struct device_node *dn)
-{
-	struct device_node *sib;
-
-	for_each_child_of_node(dn, sib)
-		eeh_add_device_tree_early(sib);
-	eeh_add_device_early(dn);
-}
-EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
-
-/**
- * eeh_add_device_late - perform EEH initialization for the indicated pci device
- * @dev: pci device for which to set up EEH
- *
- * This routine must be used to complete EEH initialization for PCI
- * devices that were added after system boot (e.g. hotplug, dlpar).
- */
-static void eeh_add_device_late(struct pci_dev *dev)
-{
-	struct device_node *dn;
-	struct pci_dn *pdn;
-
-	if (!dev || !eeh_subsystem_enabled)
-		return;
-
-	pr_debug("EEH: Adding device %s\n", pci_name(dev));
-
-	dn = pci_device_to_OF_node(dev);
-	pdn = PCI_DN(dn);
-	if (pdn->pcidev == dev) {
-		pr_debug("EEH: Already referenced !\n");
-		return;
-	}
-	WARN_ON(pdn->pcidev);
-
-	pci_dev_get (dev);
-	pdn->pcidev = dev;
-
-	pci_addr_cache_insert_device(dev);
-	eeh_sysfs_add_device(dev);
-}
-
-void eeh_add_device_tree_late(struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
- 		eeh_add_device_late(dev);
- 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
- 			struct pci_bus *subbus = dev->subordinate;
- 			if (subbus)
- 				eeh_add_device_tree_late(subbus);
- 		}
-	}
-}
-EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
-
-/**
- * eeh_remove_device - undo EEH setup for the indicated pci device
- * @dev: pci device to be removed
- *
- * This routine should be called when a device is removed from
- * a running system (e.g. by hotplug or dlpar).  It unregisters
- * the PCI device from the EEH subsystem.  I/O errors affecting
- * this device will no longer be detected after this call; thus,
- * i/o errors affecting this slot may leave this device unusable.
- */
-static void eeh_remove_device(struct pci_dev *dev)
-{
-	struct device_node *dn;
-	if (!dev || !eeh_subsystem_enabled)
-		return;
-
-	/* Unregister the device with the EEH/PCI address search system */
-	pr_debug("EEH: Removing device %s\n", pci_name(dev));
-
-	dn = pci_device_to_OF_node(dev);
-	if (PCI_DN(dn)->pcidev == NULL) {
-		pr_debug("EEH: Not referenced !\n");
-		return;
-	}
-	PCI_DN(dn)->pcidev = NULL;
-	pci_dev_put (dev);
-
-	pci_addr_cache_remove_device(dev);
-	eeh_sysfs_remove_device(dev);
-}
-
-void eeh_remove_bus_device(struct pci_dev *dev)
-{
-	struct pci_bus *bus = dev->subordinate;
-	struct pci_dev *child, *tmp;
-
-	eeh_remove_device(dev);
-
-	if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-		list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
-			 eeh_remove_bus_device(child);
-	}
-}
-EXPORT_SYMBOL_GPL(eeh_remove_bus_device);
-
-static int proc_eeh_show(struct seq_file *m, void *v)
-{
-	if (0 == eeh_subsystem_enabled) {
-		seq_printf(m, "EEH Subsystem is globally disabled\n");
-		seq_printf(m, "eeh_total_mmio_ffs=%ld\n", total_mmio_ffs);
-	} else {
-		seq_printf(m, "EEH Subsystem is enabled\n");
-		seq_printf(m,
-				"no device=%ld\n"
-				"no device node=%ld\n"
-				"no config address=%ld\n"
-				"check not wanted=%ld\n"
-				"eeh_total_mmio_ffs=%ld\n"
-				"eeh_false_positives=%ld\n"
-				"eeh_slot_resets=%ld\n",
-				no_device, no_dn, no_cfg_addr, 
-				ignored_check, total_mmio_ffs, 
-				false_positives,
-				slot_resets);
-	}
-
-	return 0;
-}
-
-static int proc_eeh_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_eeh_show, NULL);
-}
-
-static const struct file_operations proc_eeh_operations = {
-	.open      = proc_eeh_open,
-	.read      = seq_read,
-	.llseek    = seq_lseek,
-	.release   = single_release,
-};
-
-static int __init eeh_init_proc(void)
-{
-	if (machine_is(pseries))
-		proc_create("ppc64/eeh", 0, NULL, &proc_eeh_operations);
-	return 0;
-}
-__initcall(eeh_init_proc);
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
deleted file mode 100644
index ce37040af87..00000000000
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * eeh_cache.c
- * PCI address cache; allows the lookup of PCI devices based on I/O address
- *
- * Copyright IBM Corporation 2004
- * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/list.h>
-#include <linux/pci.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/pci-bridge.h>
-#include <asm/ppc-pci.h>
-
-
-/**
- * The pci address cache subsystem.  This subsystem places
- * PCI device address resources into a red-black tree, sorted
- * according to the address range, so that given only an i/o
- * address, the corresponding PCI device can be **quickly**
- * found. It is safe to perform an address lookup in an interrupt
- * context; this ability is an important feature.
- *
- * Currently, the only customer of this code is the EEH subsystem;
- * thus, this code has been somewhat tailored to suit EEH better.
- * In particular, the cache does *not* hold the addresses of devices
- * for which EEH is not enabled.
- *
- * (Implementation Note: The RB tree seems to be better/faster
- * than any hash algo I could think of for this problem, even
- * with the penalty of slow pointer chases for d-cache misses).
- */
-struct pci_io_addr_range
-{
-	struct rb_node rb_node;
-	unsigned long addr_lo;
-	unsigned long addr_hi;
-	struct pci_dev *pcidev;
-	unsigned int flags;
-};
-
-static struct pci_io_addr_cache
-{
-	struct rb_root rb_root;
-	spinlock_t piar_lock;
-} pci_io_addr_cache_root;
-
-static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr)
-{
-	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
-
-	while (n) {
-		struct pci_io_addr_range *piar;
-		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-
-		if (addr < piar->addr_lo) {
-			n = n->rb_left;
-		} else {
-			if (addr > piar->addr_hi) {
-				n = n->rb_right;
-			} else {
-				pci_dev_get(piar->pcidev);
-				return piar->pcidev;
-			}
-		}
-	}
-
-	return NULL;
-}
-
-/**
- * pci_get_device_by_addr - Get device, given only address
- * @addr: mmio (PIO) phys address or i/o port number
- *
- * Given an mmio phys address, or a port number, find a pci device
- * that implements this address.  Be sure to pci_dev_put the device
- * when finished.  I/O port numbers are assumed to be offset
- * from zero (that is, they do *not* have pci_io_addr added in).
- * It is safe to call this function within an interrupt.
- */
-struct pci_dev *pci_get_device_by_addr(unsigned long addr)
-{
-	struct pci_dev *dev;
-	unsigned long flags;
-
-	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	dev = __pci_get_device_by_addr(addr);
-	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-	return dev;
-}
-
-#ifdef DEBUG
-/*
- * Handy-dandy debug print routine, does nothing more
- * than print out the contents of our addr cache.
- */
-static void pci_addr_cache_print(struct pci_io_addr_cache *cache)
-{
-	struct rb_node *n;
-	int cnt = 0;
-
-	n = rb_first(&cache->rb_root);
-	while (n) {
-		struct pci_io_addr_range *piar;
-		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-		printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n",
-		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
-		       piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
-		cnt++;
-		n = rb_next(n);
-	}
-}
-#endif
-
-/* Insert address range into the rb tree. */
-static struct pci_io_addr_range *
-pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
-		      unsigned long ahi, unsigned int flags)
-{
-	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct pci_io_addr_range *piar;
-
-	/* Walk tree, find a place to insert into tree */
-	while (*p) {
-		parent = *p;
-		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
-		if (ahi < piar->addr_lo) {
-			p = &parent->rb_left;
-		} else if (alo > piar->addr_hi) {
-			p = &parent->rb_right;
-		} else {
-			if (dev != piar->pcidev ||
-			    alo != piar->addr_lo || ahi != piar->addr_hi) {
-				printk(KERN_WARNING "PIAR: overlapping address range\n");
-			}
-			return piar;
-		}
-	}
-	piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
-	if (!piar)
-		return NULL;
-
-	pci_dev_get(dev);
-	piar->addr_lo = alo;
-	piar->addr_hi = ahi;
-	piar->pcidev = dev;
-	piar->flags = flags;
-
-#ifdef DEBUG
-	printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n",
-	                  alo, ahi, pci_name (dev));
-#endif
-
-	rb_link_node(&piar->rb_node, parent, p);
-	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
-
-	return piar;
-}
-
-static void __pci_addr_cache_insert_device(struct pci_dev *dev)
-{
-	struct device_node *dn;
-	struct pci_dn *pdn;
-	int i;
-
-	dn = pci_device_to_OF_node(dev);
-	if (!dn) {
-		printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev));
-		return;
-	}
-
-	/* Skip any devices for which EEH is not enabled. */
-	pdn = PCI_DN(dn);
-	if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
-	    pdn->eeh_mode & EEH_MODE_NOCHECK) {
-#ifdef DEBUG
-		printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n",
-		       pci_name(dev), pdn->node->full_name);
-#endif
-		return;
-	}
-
-	/* Walk resources on this device, poke them into the tree */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		unsigned long start = pci_resource_start(dev,i);
-		unsigned long end = pci_resource_end(dev,i);
-		unsigned int flags = pci_resource_flags(dev,i);
-
-		/* We are interested only bus addresses, not dma or other stuff */
-		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-			continue;
-		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
-			 continue;
-		pci_addr_cache_insert(dev, start, end, flags);
-	}
-}
-
-/**
- * pci_addr_cache_insert_device - Add a device to the address cache
- * @dev: PCI device whose I/O addresses we are interested in.
- *
- * In order to support the fast lookup of devices based on addresses,
- * we maintain a cache of devices that can be quickly searched.
- * This routine adds a device to that cache.
- */
-void pci_addr_cache_insert_device(struct pci_dev *dev)
-{
-	unsigned long flags;
-
-	/* Ignore PCI bridges */
-	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
-		return;
-
-	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	__pci_addr_cache_insert_device(dev);
-	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-}
-
-static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
-{
-	struct rb_node *n;
-
-restart:
-	n = rb_first(&pci_io_addr_cache_root.rb_root);
-	while (n) {
-		struct pci_io_addr_range *piar;
-		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-
-		if (piar->pcidev == dev) {
-			rb_erase(n, &pci_io_addr_cache_root.rb_root);
-			pci_dev_put(piar->pcidev);
-			kfree(piar);
-			goto restart;
-		}
-		n = rb_next(n);
-	}
-}
-
-/**
- * pci_addr_cache_remove_device - remove pci device from addr cache
- * @dev: device to remove
- *
- * Remove a device from the addr-cache tree.
- * This is potentially expensive, since it will walk
- * the tree multiple times (once per resource).
- * But so what; device removal doesn't need to be that fast.
- */
-void pci_addr_cache_remove_device(struct pci_dev *dev)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	__pci_addr_cache_remove_device(dev);
-	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-}
-
-/**
- * pci_addr_cache_build - Build a cache of I/O addresses
- *
- * Build a cache of pci i/o addresses.  This cache will be used to
- * find the pci device that corresponds to a given address.
- * This routine scans all pci busses to build the cache.
- * Must be run late in boot process, after the pci controllers
- * have been scanned for devices (after all device resources are known).
- */
-void __init pci_addr_cache_build(void)
-{
-	struct device_node *dn;
-	struct pci_dev *dev = NULL;
-
-	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
-
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
-		pci_addr_cache_insert_device(dev);
-
-		dn = pci_device_to_OF_node(dev);
-		if (!dn)
-			continue;
-		pci_dev_get(dev);  /* matching put is in eeh_remove_device() */
-		PCI_DN(dn)->pcidev = dev;
-
-		eeh_sysfs_add_device(dev);
-	}
-
-#ifdef DEBUG
-	/* Verify tree built up above, echo back the list of addrs. */
-	pci_addr_cache_print(&pci_io_addr_cache_root);
-#endif
-}
-
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
deleted file mode 100644
index 9a2a6e32f00..00000000000
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
- * Copyright IBM Corp. 2004 2005
- * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
- */
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/pci.h>
-#include <asm/eeh.h>
-#include <asm/eeh_event.h>
-#include <asm/ppc-pci.h>
-#include <asm/pci-bridge.h>
-#include <asm/prom.h>
-#include <asm/rtas.h>
-
-
-static inline const char * pcid_name (struct pci_dev *pdev)
-{
-	if (pdev && pdev->dev.driver)
-		return pdev->dev.driver->name;
-	return "";
-}
-
-#if 0
-static void print_device_node_tree(struct pci_dn *pdn, int dent)
-{
-	int i;
-	struct device_node *pc;
-
-	if (!pdn)
-		return;
-	for (i = 0; i < dent; i++)
-		printk(" ");
-	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
-		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
-		pdn->eeh_pe_config_addr, pdn->node->full_name);
-	dent += 3;
-	pc = pdn->node->child;
-	while (pc) {
-		print_device_node_tree(PCI_DN(pc), dent);
-		pc = pc->sibling;
-	}
-}
-#endif
-
-/** 
- * irq_in_use - return true if this irq is being used 
- */
-static int irq_in_use(unsigned int irq)
-{
-	int rc = 0;
-	unsigned long flags;
-   struct irq_desc *desc = irq_desc + irq;
-
-	spin_lock_irqsave(&desc->lock, flags);
-	if (desc->action)
-		rc = 1;
-	spin_unlock_irqrestore(&desc->lock, flags);
-	return rc;
-}
-
-/**
- * eeh_disable_irq - disable interrupt for the recovering device
- */
-static void eeh_disable_irq(struct pci_dev *dev)
-{
-	struct device_node *dn = pci_device_to_OF_node(dev);
-
-	/* Don't disable MSI and MSI-X interrupts. They are
-	 * effectively disabled by the DMA Stopped state
-	 * when an EEH error occurs.
-	*/
-	if (dev->msi_enabled || dev->msix_enabled)
-		return;
-
-	if (!irq_in_use(dev->irq))
-		return;
-
-	PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
-	disable_irq_nosync(dev->irq);
-}
-
-/**
- * eeh_enable_irq - enable interrupt for the recovering device
- */
-static void eeh_enable_irq(struct pci_dev *dev)
-{
-	struct device_node *dn = pci_device_to_OF_node(dev);
-
-	if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) {
-		PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;
-		enable_irq(dev->irq);
-	}
-}
-
-/* ------------------------------------------------------- */
-/**
- * eeh_report_error - report pci error to each device driver
- * 
- * Report an EEH error to each device driver, collect up and 
- * merge the device driver responses. Cumulative response 
- * passed back in "userdata".
- */
-
-static void eeh_report_error(struct pci_dev *dev, void *userdata)
-{
-	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver = dev->driver;
-
-	dev->error_state = pci_channel_io_frozen;
-
-	if (!driver)
-		return;
-
-	eeh_disable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->error_detected)
-		return;
-
-	rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen);
-
-	/* A driver that needs a reset trumps all others */
-	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
-	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
-}
-
-/**
- * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled
- *
- * Tells each device driver that IO ports, MMIO and config space I/O
- * are now enabled. Collects up and merges the device driver responses.
- * Cumulative response passed back in "userdata".
- */
-
-static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
-{
-	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver = dev->driver;
-
-	if (!driver ||
-	    !driver->err_handler ||
-	    !driver->err_handler->mmio_enabled)
-		return;
-
-	rc = driver->err_handler->mmio_enabled (dev);
-
-	/* A driver that needs a reset trumps all others */
-	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
-	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
-}
-
-/**
- * eeh_report_reset - tell device that slot has been reset
- */
-
-static void eeh_report_reset(struct pci_dev *dev, void *userdata)
-{
-	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver = dev->driver;
-
-	if (!driver)
-		return;
-
-	dev->error_state = pci_channel_io_normal;
-
-	eeh_enable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->slot_reset)
-		return;
-
-	rc = driver->err_handler->slot_reset(dev);
-	if ((*res == PCI_ERS_RESULT_NONE) ||
-	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
-	if (*res == PCI_ERS_RESULT_DISCONNECT &&
-	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
-}
-
-/**
- * eeh_report_resume - tell device to resume normal operations
- */
-
-static void eeh_report_resume(struct pci_dev *dev, void *userdata)
-{
-	struct pci_driver *driver = dev->driver;
-
-	dev->error_state = pci_channel_io_normal;
-
-	if (!driver)
-		return;
-
-	eeh_enable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->resume)
-		return;
-
-	driver->err_handler->resume(dev);
-}
-
-/**
- * eeh_report_failure - tell device driver that device is dead.
- *
- * This informs the device driver that the device is permanently
- * dead, and that no further recovery attempts will be made on it.
- */
-
-static void eeh_report_failure(struct pci_dev *dev, void *userdata)
-{
-	struct pci_driver *driver = dev->driver;
-
-	dev->error_state = pci_channel_io_perm_failure;
-
-	if (!driver)
-		return;
-
-	eeh_disable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->error_detected)
-		return;
-
-	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
-}
-
-/* ------------------------------------------------------- */
-/**
- * handle_eeh_events -- reset a PCI device after hard lockup.
- *
- * pSeries systems will isolate a PCI slot if the PCI-Host
- * bridge detects address or data parity errors, DMA's
- * occurring to wild addresses (which usually happen due to
- * bugs in device drivers or in PCI adapter firmware).
- * Slot isolations also occur if #SERR, #PERR or other misc
- * PCI-related errors are detected.
- *
- * Recovery process consists of unplugging the device driver
- * (which generated hotplug events to userspace), then issuing
- * a PCI #RST to the device, then reconfiguring the PCI config
- * space for all bridges & devices under this slot, and then
- * finally restarting the device drivers (which cause a second
- * set of hotplug events to go out to userspace).
- */
-
-/**
- * eeh_reset_device() -- perform actual reset of a pci slot
- * @bus: pointer to the pci bus structure corresponding
- *            to the isolated slot. A non-null value will
- *            cause all devices under the bus to be removed
- *            and then re-added.
- * @pe_dn: pointer to a "Partionable Endpoint" device node.
- *            This is the top-level structure on which pci
- *            bus resets can be performed.
- */
-
-static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
-{
-	struct device_node *dn;
-	int cnt, rc;
-
-	/* pcibios will clear the counter; save the value */
-	cnt = pe_dn->eeh_freeze_count;
-
-	if (bus)
-		pcibios_remove_pci_devices(bus);
-
-	/* Reset the pci controller. (Asserts RST#; resets config space).
-	 * Reconfigure bridges and devices. Don't try to bring the system
-	 * up if the reset failed for some reason. */
-	rc = rtas_set_slot_reset(pe_dn);
-	if (rc)
-		return rc;
-
-	/* Walk over all functions on this device.  */
-	dn = pe_dn->node;
-	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
-		dn = dn->parent->child;
-
-	while (dn) {
-		struct pci_dn *ppe = PCI_DN(dn);
-		/* On Power4, always true because eeh_pe_config_addr=0 */
-		if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) {
-			rtas_configure_bridge(ppe);
-			eeh_restore_bars(ppe);
- 		}
-		dn = dn->sibling;
-	}
-
-	/* Give the system 5 seconds to finish running the user-space
-	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes, 
-	 * this is a hack, but if we don't do this, and try to bring 
-	 * the device up before the scripts have taken it down, 
-	 * potentially weird things happen.
-	 */
-	if (bus) {
-		ssleep (5);
-		pcibios_add_pci_devices(bus);
-	}
-	pe_dn->eeh_freeze_count = cnt;
-
-	return 0;
-}
-
-/* The longest amount of time to wait for a pci device
- * to come back on line, in seconds.
- */
-#define MAX_WAIT_FOR_RECOVERY 150
-
-struct pci_dn * handle_eeh_events (struct eeh_event *event)
-{
-	struct device_node *frozen_dn;
-	struct pci_dn *frozen_pdn;
-	struct pci_bus *frozen_bus;
-	int rc = 0;
-	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
-	const char *location, *pci_str, *drv_str;
-
-	frozen_dn = find_device_pe(event->dn);
-	if (!frozen_dn) {
-
-		location = of_get_property(event->dn, "ibm,loc-code", NULL);
-		location = location ? location : "unknown";
-		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
-		                "for location=%s pci addr=%s\n",
-		        location, pci_name(event->dev));
-		return NULL;
-	}
-
-	frozen_bus = pcibios_find_pci_bus(frozen_dn);
-	location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
-	location = location ? location : "unknown";
-
-	/* There are two different styles for coming up with the PE.
-	 * In the old style, it was the highest EEH-capable device
-	 * which was always an EADS pci bridge.  In the new style,
-	 * there might not be any EADS bridges, and even when there are,
-	 * the firmware marks them as "EEH incapable". So another
-	 * two-step is needed to find the pci bus.. */
-	if (!frozen_bus)
-		frozen_bus = pcibios_find_pci_bus (frozen_dn->parent);
-
-	if (!frozen_bus) {
-		printk(KERN_ERR "EEH: Cannot find PCI bus "
-		        "for location=%s dn=%s\n",
-		        location, frozen_dn->full_name);
-		return NULL;
-	}
-
-	frozen_pdn = PCI_DN(frozen_dn);
-	frozen_pdn->eeh_freeze_count++;
-
-	if (frozen_pdn->pcidev) {
-		pci_str = pci_name (frozen_pdn->pcidev);
-		drv_str = pcid_name (frozen_pdn->pcidev);
-	} else {
-		pci_str = pci_name (event->dev);
-		drv_str = pcid_name (event->dev);
-	}
-	
-	if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)
-		goto excess_failures;
-
-	printk(KERN_WARNING
-	   "EEH: This PCI device has failed %d times in the last hour:\n",
-		frozen_pdn->eeh_freeze_count);
-	printk(KERN_WARNING
-		"EEH: location=%s driver=%s pci addr=%s\n",
-		location, drv_str, pci_str);
-
-	/* Walk the various device drivers attached to this slot through
-	 * a reset sequence, giving each an opportunity to do what it needs
-	 * to accomplish the reset.  Each child gets a report of the
-	 * status ... if any child can't handle the reset, then the entire
-	 * slot is dlpar removed and added.
-	 */
-	pci_walk_bus(frozen_bus, eeh_report_error, &result);
-
-	/* Get the current PCI slot state. This can take a long time,
-	 * sometimes over 3 seconds for certain systems. */
-	rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000);
-	if (rc < 0) {
-		printk(KERN_WARNING "EEH: Permanent failure\n");
-		goto hard_fail;
-	}
-
-	/* Since rtas may enable MMIO when posting the error log,
-	 * don't post the error log until after all dev drivers
-	 * have been informed.
-	 */
-	eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE);
-
-	/* If all device drivers were EEH-unaware, then shut
-	 * down all of the device drivers, and hope they
-	 * go down willingly, without panicing the system.
-	 */
-	if (result == PCI_ERS_RESULT_NONE) {
-		rc = eeh_reset_device(frozen_pdn, frozen_bus);
-		if (rc) {
-			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
-			goto hard_fail;
-		}
-	}
-
-	/* If all devices reported they can proceed, then re-enable MMIO */
-	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO);
-
-		if (rc < 0)
-			goto hard_fail;
-		if (rc) {
-			result = PCI_ERS_RESULT_NEED_RESET;
-		} else {
-			result = PCI_ERS_RESULT_NONE;
-			pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
-		}
-	}
-
-	/* If all devices reported they can proceed, then re-enable DMA */
-	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA);
-
-		if (rc < 0)
-			goto hard_fail;
-		if (rc)
-			result = PCI_ERS_RESULT_NEED_RESET;
-		else
-			result = PCI_ERS_RESULT_RECOVERED;
-	}
-
-	/* If any device has a hard failure, then shut off everything. */
-	if (result == PCI_ERS_RESULT_DISCONNECT) {
-		printk(KERN_WARNING "EEH: Device driver gave up\n");
-		goto hard_fail;
-	}
-
-	/* If any device called out for a reset, then reset the slot */
-	if (result == PCI_ERS_RESULT_NEED_RESET) {
-		rc = eeh_reset_device(frozen_pdn, NULL);
-		if (rc) {
-			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
-			goto hard_fail;
-		}
-		result = PCI_ERS_RESULT_NONE;
-		pci_walk_bus(frozen_bus, eeh_report_reset, &result);
-	}
-
-	/* All devices should claim they have recovered by now. */
-	if ((result != PCI_ERS_RESULT_RECOVERED) &&
-	    (result != PCI_ERS_RESULT_NONE)) {
-		printk(KERN_WARNING "EEH: Not recovered\n");
-		goto hard_fail;
-	}
-
-	/* Tell all device drivers that they can resume operations */
-	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
-
-	return frozen_pdn;
-	
-excess_failures:
-	/*
-	 * About 90% of all real-life EEH failures in the field
-	 * are due to poorly seated PCI cards. Only 10% or so are
-	 * due to actual, failed cards.
-	 */
-	printk(KERN_ERR
-	   "EEH: PCI device at location=%s driver=%s pci addr=%s \n"
-		"has failed %d times in the last hour "
-		"and has been permanently disabled. \n"
-		"Please try reseating this device or replacing it.\n",
-		location, drv_str, pci_str, frozen_pdn->eeh_freeze_count);
-	goto perm_error;
-
-hard_fail:
-	printk(KERN_ERR
-	   "EEH: Unable to recover from failure of PCI device "
-	   "at location=%s driver=%s pci addr=%s \n"
-	   "Please try reseating this device or replacing it.\n",
-		location, drv_str, pci_str);
-
-perm_error:
-	eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE);
-
-	/* Notify all devices that they're about to go down. */
-	pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
-
-	/* Shut down the device drivers for good. */
-	pcibios_remove_pci_devices(frozen_bus);
-
-	return NULL;
-}
-
-/* ---------- end of file ---------- */
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
deleted file mode 100644
index ddb80f5d850..00000000000
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * eeh_event.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
- */
-
-#include <linux/delay.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/pci.h>
-#include <linux/workqueue.h>
-#include <asm/eeh_event.h>
-#include <asm/ppc-pci.h>
-
-/** Overview:
- *  EEH error states may be detected within exception handlers;
- *  however, the recovery processing needs to occur asynchronously
- *  in a normal kernel context and not an interrupt context.
- *  This pair of routines creates an event and queues it onto a
- *  work-queue, where a worker thread can drive recovery.
- */
-
-/* EEH event workqueue setup. */
-static DEFINE_SPINLOCK(eeh_eventlist_lock);
-LIST_HEAD(eeh_eventlist);
-static void eeh_thread_launcher(struct work_struct *);
-DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
-
-/* Serialize reset sequences for a given pci device */
-DEFINE_MUTEX(eeh_event_mutex);
-
-/**
- * eeh_event_handler - dispatch EEH events.
- * @dummy - unused
- *
- * The detection of a frozen slot can occur inside an interrupt,
- * where it can be hard to do anything about it.  The goal of this
- * routine is to pull these detection events out of the context
- * of the interrupt handler, and re-dispatch them for processing
- * at a later time in a normal context.
- */
-static int eeh_event_handler(void * dummy)
-{
-	unsigned long flags;
-	struct eeh_event	*event;
-	struct pci_dn *pdn;
-
-	daemonize ("eehd");
-	set_current_state(TASK_INTERRUPTIBLE);
-
-	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-	event = NULL;
-
-	/* Unqueue the event, get ready to process. */
-	if (!list_empty(&eeh_eventlist)) {
-		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-		list_del(&event->list);
-	}
-	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
-	if (event == NULL)
-		return 0;
-
-	/* Serialize processing of EEH events */
-	mutex_lock(&eeh_event_mutex);
-	eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
-
-	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
-	       pci_name(event->dev));
-
-	pdn = handle_eeh_events(event);
-
-	eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
-	pci_dev_put(event->dev);
-	kfree(event);
-	mutex_unlock(&eeh_event_mutex);
-
-	/* If there are no new errors after an hour, clear the counter. */
-	if (pdn && pdn->eeh_freeze_count>0) {
-		msleep_interruptible (3600*1000);
-		if (pdn->eeh_freeze_count>0)
-			pdn->eeh_freeze_count--;
-	}
-
-	return 0;
-}
-
-/**
- * eeh_thread_launcher
- * @dummy - unused
- */
-static void eeh_thread_launcher(struct work_struct *dummy)
-{
-	if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0)
-		printk(KERN_ERR "Failed to start EEH daemon\n");
-}
-
-/**
- * eeh_send_failure_event - generate a PCI error event
- * @dev pci device
- *
- * This routine can be called within an interrupt context;
- * the actual event will be delivered in a normal context
- * (from a workqueue).
- */
-int eeh_send_failure_event (struct device_node *dn,
-                            struct pci_dev *dev)
-{
-	unsigned long flags;
-	struct eeh_event *event;
-	const char *location;
-
-	if (!mem_init_done) {
-		printk(KERN_ERR "EEH: event during early boot not handled\n");
-		location = of_get_property(dn, "ibm,loc-code", NULL);
-		printk(KERN_ERR "EEH: device node = %s\n", dn->full_name);
-		printk(KERN_ERR "EEH: PCI location = %s\n", location);
-		return 1;
-	}
-	event = kmalloc(sizeof(*event), GFP_ATOMIC);
-	if (event == NULL) {
-		printk (KERN_ERR "EEH: out of memory, event not handled\n");
-		return 1;
- 	}
-
-	if (dev)
-		pci_dev_get(dev);
-
-	event->dn = dn;
-	event->dev = dev;
-
-	/* We may or may not be called in an interrupt context */
-	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-	list_add(&event->list, &eeh_eventlist);
-	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
-	schedule_work(&eeh_event_wq);
-
-	return 0;
-}
-
-/********************** END OF FILE ******************************/
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
new file mode 100644
index 00000000000..0bec0c02c5e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -0,0 +1,761 @@
+/*
+ * The file intends to implement the platform dependent EEH operations on pseries.
+ * Actually, the pseries platform is built based on RTAS heavily. That means the
+ * pseries platform dependent EEH operations will be built on RTAS calls. The functions
+ * are devired from arch/powerpc/platforms/pseries/eeh.c and necessary cleanup has
+ * been done.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011.
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+static int ibm_get_config_addr_info;
+static int ibm_get_config_addr_info2;
+static int ibm_configure_bridge;
+static int ibm_configure_pe;
+
+/*
+ * Buffer for reporting slot-error-detail rtas calls. Its here
+ * in BSS, and not dynamically alloced, so that it ends up in
+ * RMO where RTAS can access it.
+ */
+static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(slot_errbuf_lock);
+static int eeh_error_buf_size;
+
+/**
+ * pseries_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on pseries.
+ */
+static int pseries_eeh_init(void)
+{
+	/* figure out EEH RTAS function call tokens */
+	ibm_set_eeh_option		= rtas_token("ibm,set-eeh-option");
+	ibm_set_slot_reset		= rtas_token("ibm,set-slot-reset");
+	ibm_read_slot_reset_state2	= rtas_token("ibm,read-slot-reset-state2");
+	ibm_read_slot_reset_state	= rtas_token("ibm,read-slot-reset-state");
+	ibm_slot_error_detail		= rtas_token("ibm,slot-error-detail");
+	ibm_get_config_addr_info2	= rtas_token("ibm,get-config-addr-info2");
+	ibm_get_config_addr_info	= rtas_token("ibm,get-config-addr-info");
+	ibm_configure_pe		= rtas_token("ibm,configure-pe");
+	ibm_configure_bridge		= rtas_token("ibm,configure-bridge");
+
+	/*
+	 * Necessary sanity check. We needn't check "get-config-addr-info"
+	 * and its variant since the old firmware probably support address
+	 * of domain/bus/slot/function for EEH RTAS operations.
+	 */
+	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,set-eeh-option> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,set-slot-reset> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
+		   ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,read-slot-reset-state2> and "
+			"<ibm,read-slot-reset-state> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,slot-error-detail> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE &&
+		   ibm_configure_bridge == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,configure-pe> and "
+			"<ibm,configure-bridge> invalid\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	/* Initialize error log lock and size */
+	spin_lock_init(&slot_errbuf_lock);
+	eeh_error_buf_size = rtas_token("rtas-error-log-max");
+	if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: unknown EEH error log size\n",
+			__func__);
+		eeh_error_buf_size = 1024;
+	} else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
+		pr_warning("%s: EEH error log size %d exceeds the maximal %d\n",
+			__func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
+		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
+	}
+
+	/* Set EEH probe mode */
+	eeh_probe_mode_set(EEH_PROBE_MODE_DEVTREE);
+
+	return 0;
+}
+
+static int pseries_eeh_cap_start(struct device_node *dn)
+{
+	struct pci_dn *pdn = PCI_DN(dn);
+	u32 status;
+
+	if (!pdn)
+		return 0;
+
+	rtas_read_config(pdn, PCI_STATUS, 2, &status);
+	if (!(status & PCI_STATUS_CAP_LIST))
+		return 0;
+
+	return PCI_CAPABILITY_LIST;
+}
+
+
+static int pseries_eeh_find_cap(struct device_node *dn, int cap)
+{
+	struct pci_dn *pdn = PCI_DN(dn);
+	int pos = pseries_eeh_cap_start(dn);
+	int cnt = 48;	/* Maximal number of capabilities */
+	u32 id;
+
+	if (!pos)
+		return 0;
+
+        while (cnt--) {
+		rtas_read_config(pdn, pos, 1, &pos);
+		if (pos < 0x40)
+			break;
+		pos &= ~3;
+		rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
+		if (id == 0xff)
+			break;
+		if (id == cap)
+			return pos;
+		pos += PCI_CAP_LIST_NEXT;
+	}
+
+	return 0;
+}
+
+static int pseries_eeh_find_ecap(struct device_node *dn, int cap)
+{
+	struct pci_dn *pdn = PCI_DN(dn);
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	u32 header;
+	int pos = 256;
+	int ttl = (4096 - 256) / 8;
+
+	if (!edev || !edev->pcie_cap)
+		return 0;
+	if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+		return 0;
+	else if (!header)
+		return 0;
+
+	while (ttl-- > 0) {
+		if (PCI_EXT_CAP_ID(header) == cap && pos)
+			return pos;
+
+		pos = PCI_EXT_CAP_NEXT(header);
+		if (pos < 256)
+			break;
+
+		if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * pseries_eeh_of_probe - EEH probe on the given device
+ * @dn: OF node
+ * @flag: Unused
+ *
+ * When EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see if it supports EEH. The function
+ * is introduced for the purpose.
+ */
+static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe pe;
+	struct pci_dn *pdn = PCI_DN(dn);
+	const __be32 *classp, *vendorp, *devicep;
+	u32 class_code;
+	const __be32 *regs;
+	u32 pcie_flags;
+	int enable = 0;
+	int ret;
+
+	/* Retrieve OF node and eeh device */
+	edev = of_node_to_eeh_dev(dn);
+	if (edev->pe || !of_device_is_available(dn))
+		return NULL;
+
+	/* Retrieve class/vendor/device IDs */
+	classp = of_get_property(dn, "class-code", NULL);
+	vendorp = of_get_property(dn, "vendor-id", NULL);
+	devicep = of_get_property(dn, "device-id", NULL);
+
+	/* Skip for bad OF node or PCI-ISA bridge */
+	if (!classp || !vendorp || !devicep)
+		return NULL;
+	if (dn->type && !strcmp(dn->type, "isa"))
+		return NULL;
+
+	class_code = of_read_number(classp, 1);
+
+	/*
+	 * Update class code and mode of eeh device. We need
+	 * correctly reflects that current device is root port
+	 * or PCIe switch downstream port.
+	 */
+	edev->class_code = class_code;
+	edev->pcix_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_PCIX);
+	edev->pcie_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_EXP);
+	edev->aer_cap = pseries_eeh_find_ecap(dn, PCI_EXT_CAP_ID_ERR);
+	edev->mode &= 0xFFFFFF00;
+	if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		edev->mode |= EEH_DEV_BRIDGE;
+		if (edev->pcie_cap) {
+			rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
+					 2, &pcie_flags);
+			pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4;
+			if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT)
+				edev->mode |= EEH_DEV_ROOT_PORT;
+			else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM)
+				edev->mode |= EEH_DEV_DS_PORT;
+		}
+	}
+
+	/* Retrieve the device address */
+	regs = of_get_property(dn, "reg", NULL);
+	if (!regs) {
+		pr_warning("%s: OF node property %s::reg not found\n",
+			__func__, dn->full_name);
+		return NULL;
+	}
+
+	/* Initialize the fake PE */
+	memset(&pe, 0, sizeof(struct eeh_pe));
+	pe.phb = edev->phb;
+	pe.config_addr = of_read_number(regs, 1);
+
+	/* Enable EEH on the device */
+	ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
+	if (!ret) {
+		edev->config_addr = of_read_number(regs, 1);
+		/* Retrieve PE address */
+		edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
+		pe.addr = edev->pe_config_addr;
+
+		/* Some older systems (Power4) allow the ibm,set-eeh-option
+		 * call to succeed even on nodes where EEH is not supported.
+		 * Verify support explicitly.
+		 */
+		ret = eeh_ops->get_state(&pe, NULL);
+		if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
+			enable = 1;
+
+		if (enable) {
+			eeh_set_enable(true);
+			eeh_add_to_parent_pe(edev);
+
+			pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n",
+				__func__, dn->full_name, pe.phb->global_number,
+				pe.addr, pe.config_addr);
+		} else if (dn->parent && of_node_to_eeh_dev(dn->parent) &&
+			   (of_node_to_eeh_dev(dn->parent))->pe) {
+			/* This device doesn't support EEH, but it may have an
+			 * EEH parent, in which case we mark it as supported.
+			 */
+			edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr;
+			edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr;
+			eeh_add_to_parent_pe(edev);
+		}
+	}
+
+	/* Save memory bars */
+	eeh_save_bars(edev);
+
+	return NULL;
+}
+
+/**
+ * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, following options are support according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int pseries_eeh_set_option(struct eeh_pe *pe, int option)
+{
+	int ret = 0;
+	int config_addr;
+
+	/*
+	 * When we're enabling or disabling EEH functioality on
+	 * the particular PE, the PE config address is possibly
+	 * unavailable. Therefore, we have to figure it out from
+	 * the FDT node.
+	 */
+	switch (option) {
+	case EEH_OPT_DISABLE:
+	case EEH_OPT_ENABLE:
+	case EEH_OPT_THAW_MMIO:
+	case EEH_OPT_THAW_DMA:
+		config_addr = pe->config_addr;
+		if (pe->addr)
+			config_addr = pe->addr;
+		break;
+
+	default:
+		pr_err("%s: Invalid option %d\n",
+			__func__, option);
+		return -EINVAL;
+	}
+
+	ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the assocated PE address. Actually, there're 2 RTAS
+ * function calls dedicated for the purpose. We need implement
+ * it through the new function and then the old one. Besides,
+ * you should make sure the config address is figured out from
+ * FDT node before calling the function.
+ *
+ * It's notable that zero'ed return value means invalid PE config
+ * address.
+ */
+static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+	int ret = 0;
+	int rets[3];
+
+	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+		/*
+		 * First of all, we need to make sure there has one PE
+		 * associated with the device. Otherwise, PE address is
+		 * meaningless.
+		 */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 1);
+		if (ret || (rets[0] == 0))
+			return 0;
+
+		/* Retrieve the associated PE config address */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
+		if (ret) {
+			pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
+		if (ret) {
+			pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @state: return value
+ *
+ * Retrieve the state of the specified PE. On RTAS compliant
+ * pseries platform, there already has one dedicated RTAS function
+ * for the purpose. It's notable that the associated PE config address
+ * might be ready when calling the function. Therefore, endeavour to
+ * use the PE config address if possible. Further more, there're 2
+ * RTAS calls for the purpose, we need to try the new one and back
+ * to the old one if the new one couldn't work properly.
+ */
+static int pseries_eeh_get_state(struct eeh_pe *pe, int *state)
+{
+	int config_addr;
+	int ret;
+	int rets[4];
+	int result;
+
+	/* Figure out PE config address if possible */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
+		/* Fake PE unavailable info */
+		rets[2] = 0;
+		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+	} else {
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	if (ret)
+		return ret;
+
+	/* Parse the result out */
+	result = 0;
+	if (rets[1]) {
+		switch(rets[0]) {
+		case 0:
+			result &= ~EEH_STATE_RESET_ACTIVE;
+			result |= EEH_STATE_MMIO_ACTIVE;
+			result |= EEH_STATE_DMA_ACTIVE;
+			break;
+		case 1:
+			result |= EEH_STATE_RESET_ACTIVE;
+			result |= EEH_STATE_MMIO_ACTIVE;
+			result |= EEH_STATE_DMA_ACTIVE;
+			break;
+		case 2:
+			result &= ~EEH_STATE_RESET_ACTIVE;
+			result &= ~EEH_STATE_MMIO_ACTIVE;
+			result &= ~EEH_STATE_DMA_ACTIVE;
+			break;
+		case 4:
+			result &= ~EEH_STATE_RESET_ACTIVE;
+			result &= ~EEH_STATE_MMIO_ACTIVE;
+			result &= ~EEH_STATE_DMA_ACTIVE;
+			result |= EEH_STATE_MMIO_ENABLED;
+			break;
+		case 5:
+			if (rets[2]) {
+				if (state) *state = rets[2];
+				result = EEH_STATE_UNAVAILABLE;
+			} else {
+				result = EEH_STATE_NOT_SUPPORT;
+			}
+			break;
+		default:
+			result = EEH_STATE_NOT_SUPPORT;
+		}
+	} else {
+		result = EEH_STATE_NOT_SUPPORT;
+	}
+
+	return result;
+}
+
+/**
+ * pseries_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Reset the specified PE
+ */
+static int pseries_eeh_reset(struct eeh_pe *pe, int option)
+{
+	int config_addr;
+	int ret;
+
+	/* Figure out PE address */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	/* Reset PE through RTAS call */
+	ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
+
+	/* If fundamental-reset not supported, try hot-reset */
+	if (option == EEH_RESET_FUNDAMENTAL &&
+	    ret == -8) {
+		option = EEH_RESET_HOT;
+		ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), option);
+	}
+
+	/* We need reset hold or settlement delay */
+	if (option == EEH_RESET_FUNDAMENTAL ||
+	    option == EEH_RESET_HOT)
+		msleep(EEH_PE_RST_HOLD_TIME);
+	else
+		msleep(EEH_PE_RST_SETTLE_TIME);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal period in microsecond
+ *
+ * Wait for the state of associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+	int ret;
+	int mwait;
+
+	/*
+	 * According to PAPR, the state of PE might be temporarily
+	 * unavailable. Under the circumstance, we have to wait
+	 * for indicated time determined by firmware. The maximal
+	 * wait time is 5 minutes, which is acquired from the original
+	 * EEH implementation. Also, the original implementation
+	 * also defined the minimal wait time as 1 second.
+	 */
+#define EEH_STATE_MIN_WAIT_TIME	(1000)
+#define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)
+
+	while (1) {
+		ret = pseries_eeh_get_state(pe, &mwait);
+
+		/*
+		 * If the PE's state is temporarily unavailable,
+		 * we have to wait for the specified time. Otherwise,
+		 * the PE's state will be returned immediately.
+		 */
+		if (ret != EEH_STATE_UNAVAILABLE)
+			return ret;
+
+		if (max_wait <= 0) {
+			pr_warning("%s: Timeout when getting PE's state (%d)\n",
+				__func__, max_wait);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+
+		if (mwait <= 0) {
+			pr_warning("%s: Firmware returned bad wait value %d\n",
+				__func__, mwait);
+			mwait = EEH_STATE_MIN_WAIT_TIME;
+		} else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
+			pr_warning("%s: Firmware returned too long wait value %d\n",
+				__func__, mwait);
+			mwait = EEH_STATE_MAX_WAIT_TIME;
+		}
+
+		max_wait -= mwait;
+		msleep(mwait);
+	}
+
+	return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * pseries_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ * Actually, the error will be retrieved through the dedicated
+ * RTAS call.
+ */
+static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len)
+{
+	int config_addr;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	/* Figure out the PE address */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
+			BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid),
+			virt_to_phys(drv_log), len,
+			virt_to_phys(slot_errbuf), eeh_error_buf_size,
+			severity);
+	if (!ret)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the mulfunctional PE would be recovered
+ * again.
+ */
+static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
+{
+	int config_addr;
+	int ret;
+
+	/* Figure out the PE address */
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
+
+	/* Use new configure-pe function, if supported */
+	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+	} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
+	} else {
+		return -EFAULT;
+	}
+
+	if (ret)
+		pr_warning("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
+			__func__, pe->phb->global_number, pe->addr, ret);
+
+	return ret;
+}
+
+/**
+ * pseries_eeh_read_config - Read PCI config space
+ * @dn: device node
+ * @where: PCI address
+ * @size: size to read
+ * @val: return value
+ *
+ * Read config space from the speicifed device
+ */
+static int pseries_eeh_read_config(struct device_node *dn, int where, int size, u32 *val)
+{
+	struct pci_dn *pdn;
+
+	pdn = PCI_DN(dn);
+
+	return rtas_read_config(pdn, where, size, val);
+}
+
+/**
+ * pseries_eeh_write_config - Write PCI config space
+ * @dn: device node
+ * @where: PCI address
+ * @size: size to write
+ * @val: value to be written
+ *
+ * Write config space to the specified device
+ */
+static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val)
+{
+	struct pci_dn *pdn;
+
+	pdn = PCI_DN(dn);
+
+	return rtas_write_config(pdn, where, size, val);
+}
+
+static struct eeh_ops pseries_eeh_ops = {
+	.name			= "pseries",
+	.init			= pseries_eeh_init,
+	.of_probe		= pseries_eeh_of_probe,
+	.dev_probe		= NULL,
+	.set_option		= pseries_eeh_set_option,
+	.get_pe_addr		= pseries_eeh_get_pe_addr,
+	.get_state		= pseries_eeh_get_state,
+	.reset			= pseries_eeh_reset,
+	.wait_state		= pseries_eeh_wait_state,
+	.get_log		= pseries_eeh_get_log,
+	.configure_bridge       = pseries_eeh_configure_bridge,
+	.read_config		= pseries_eeh_read_config,
+	.write_config		= pseries_eeh_write_config,
+	.next_error		= NULL,
+	.restore_config		= NULL
+};
+
+/**
+ * eeh_pseries_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on pseries platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_pseries_init(void)
+{
+	int ret = -EINVAL;
+
+	if (!machine_is(pseries))
+		return ret;
+
+	ret = eeh_ops_register(&pseries_eeh_ops);
+	if (!ret)
+		pr_info("EEH: pSeries platform initialized\n");
+	else
+		pr_info("EEH: pSeries platform initialization failure (%d)\n",
+			ret);
+
+	return ret;
+}
+
+early_initcall(eeh_pseries_init);
diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c
deleted file mode 100644
index 15e13b56890..00000000000
--- a/arch/powerpc/platforms/pseries/eeh_sysfs.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
- * Copyright IBM Corporation 2007
- * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
- */
-#include <linux/pci.h>
-#include <asm/ppc-pci.h>
-#include <asm/pci-bridge.h>
-#include <linux/kobject.h>
-
-/**
- * EEH_SHOW_ATTR -- create sysfs entry for eeh statistic
- * @_name: name of file in sysfs directory
- * @_memb: name of member in struct pci_dn to access
- * @_format: printf format for display
- *
- * All of the attributes look very similar, so just
- * auto-gen a cut-n-paste routine to display them.
- */
-#define EEH_SHOW_ATTR(_name,_memb,_format)               \
-static ssize_t eeh_show_##_name(struct device *dev,      \
-		struct device_attribute *attr, char *buf)          \
-{                                                        \
-	struct pci_dev *pdev = to_pci_dev(dev);               \
-	struct device_node *dn = pci_device_to_OF_node(pdev); \
-	struct pci_dn *pdn;                                   \
-	                                                      \
-	if (!dn || PCI_DN(dn) == NULL)                        \
-		return 0;                                          \
-	                                                      \
-	pdn = PCI_DN(dn);                                     \
-	return sprintf(buf, _format "\n", pdn->_memb);        \
-}                                                        \
-static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
-
-
-EEH_SHOW_ATTR(eeh_mode, eeh_mode, "0x%x");
-EEH_SHOW_ATTR(eeh_config_addr, eeh_config_addr, "0x%x");
-EEH_SHOW_ATTR(eeh_pe_config_addr, eeh_pe_config_addr, "0x%x");
-EEH_SHOW_ATTR(eeh_check_count, eeh_check_count, "%d");
-EEH_SHOW_ATTR(eeh_freeze_count, eeh_freeze_count, "%d");
-EEH_SHOW_ATTR(eeh_false_positives, eeh_false_positives, "%d");
-
-void eeh_sysfs_add_device(struct pci_dev *pdev)
-{
-	int rc=0;
-
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count);
-
-	if (rc)
-		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
-}
-
-void eeh_sysfs_remove_device(struct pci_dev *pdev)
-{
-	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_check_count);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count);
-}
-
diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c
new file mode 100644
index 00000000000..18380e8f6df
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/event_sources.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/prom.h>
+
+#include "pseries.h"
+
+void request_event_sources_irqs(struct device_node *np,
+				irq_handler_t handler,
+				const char *name)
+{
+	int i, index, count = 0;
+	struct of_phandle_args oirq;
+	const u32 *opicprop;
+	unsigned int opicplen;
+	unsigned int virqs[16];
+
+	/* Check for obsolete "open-pic-interrupt" property. If present, then
+	 * map those interrupts using the default interrupt host and default
+	 * trigger
+	 */
+	opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
+	if (opicprop) {
+		opicplen /= sizeof(u32);
+		for (i = 0; i < opicplen; i++) {
+			if (count > 15)
+				break;
+			virqs[count] = irq_create_mapping(NULL, *(opicprop++));
+			if (virqs[count] == NO_IRQ) {
+				pr_err("event-sources: Unable to allocate "
+				       "interrupt number for %s\n",
+				       np->full_name);
+				WARN_ON(1);
+			}
+			else
+				count++;
+
+		}
+	}
+	/* Else use normal interrupt tree parsing */
+	else {
+		/* First try to do a proper OF tree parsing */
+		for (index = 0; of_irq_parse_one(np, index, &oirq) == 0;
+		     index++) {
+			if (count > 15)
+				break;
+			virqs[count] = irq_create_of_mapping(&oirq);
+			if (virqs[count] == NO_IRQ) {
+				pr_err("event-sources: Unable to allocate "
+				       "interrupt number for %s\n",
+				       np->full_name);
+				WARN_ON(1);
+			}
+			else
+				count++;
+		}
+	}
+
+	/* Now request them */
+	for (i = 0; i < count; i++) {
+		if (request_irq(virqs[i], handler, 0, name, NULL)) {
+			pr_err("event-sources: Unable to request interrupt "
+			       "%d for %s\n", virqs[i], np->full_name);
+			WARN_ON(1);
+			return;
+		}
+	}
+}
+
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index 5a707da3f5c..8c80588abac 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -28,13 +28,18 @@
 
 #include "pseries.h"
 
-typedef struct {
+struct hypertas_fw_feature {
     unsigned long val;
     char * name;
-} firmware_feature_t;
+};
 
-static __initdata firmware_feature_t
-firmware_features_table[FIRMWARE_MAX_FEATURES] = {
+/*
+ * The names in this table match names in rtas/ibm,hypertas-functions.  If the
+ * entry ends in a '*', only upto the '*' is matched.  Otherwise the entire
+ * string must match.
+ */
+static __initdata struct hypertas_fw_feature
+hypertas_fw_features_table[] = {
 	{FW_FEATURE_PFT,		"hcall-pft"},
 	{FW_FEATURE_TCE,		"hcall-tce"},
 	{FW_FEATURE_SPRG0,		"hcall-sprg0"},
@@ -51,37 +56,78 @@ firmware_features_table[FIRMWARE_MAX_FEATURES] = {
 	{FW_FEATURE_VIO,		"hcall-vio"},
 	{FW_FEATURE_RDMA,		"hcall-rdma"},
 	{FW_FEATURE_LLAN,		"hcall-lLAN"},
-	{FW_FEATURE_BULK,		"hcall-bulk"},
+	{FW_FEATURE_BULK_REMOVE,	"hcall-bulk"},
 	{FW_FEATURE_XDABR,		"hcall-xdabr"},
 	{FW_FEATURE_MULTITCE,		"hcall-multi-tce"},
 	{FW_FEATURE_SPLPAR,		"hcall-splpar"},
-	{FW_FEATURE_BULK_REMOVE,	"hcall-bulk"},
+	{FW_FEATURE_VPHN,		"hcall-vphn"},
+	{FW_FEATURE_SET_MODE,		"hcall-set-mode"},
+	{FW_FEATURE_BEST_ENERGY,	"hcall-best-energy-1*"},
 };
 
 /* Build up the firmware features bitmask using the contents of
  * device-tree/ibm,hypertas-functions.  Ultimately this functionality may
  * be moved into prom.c prom_init().
  */
-void __init fw_feature_init(const char *hypertas, unsigned long len)
+void __init fw_hypertas_feature_init(const char *hypertas, unsigned long len)
 {
 	const char *s;
 	int i;
 
-	pr_debug(" -> fw_feature_init()\n");
+	pr_debug(" -> fw_hypertas_feature_init()\n");
 
 	for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) {
-		for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) {
-			/* check value against table of strings */
-			if (!firmware_features_table[i].name ||
-			    strcmp(firmware_features_table[i].name, s))
+		for (i = 0; i < ARRAY_SIZE(hypertas_fw_features_table); i++) {
+			const char *name = hypertas_fw_features_table[i].name;
+			size_t size;
+
+			/*
+			 * If there is a '*' at the end of name, only check
+			 * upto there
+			 */
+			size = strlen(name);
+			if (size && name[size - 1] == '*') {
+				if (strncmp(name, s, size - 1))
+					continue;
+			} else if (strcmp(name, s))
 				continue;
 
 			/* we have a match */
 			powerpc_firmware_features |=
-				firmware_features_table[i].val;
+				hypertas_fw_features_table[i].val;
 			break;
 		}
 	}
 
-	pr_debug(" <- fw_feature_init()\n");
+	pr_debug(" <- fw_hypertas_feature_init()\n");
+}
+
+struct vec5_fw_feature {
+	unsigned long	val;
+	unsigned int	feature;
+};
+
+static __initdata struct vec5_fw_feature
+vec5_fw_features_table[] = {
+	{FW_FEATURE_TYPE1_AFFINITY,	OV5_TYPE1_AFFINITY},
+	{FW_FEATURE_PRRN,		OV5_PRRN},
+};
+
+void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
+{
+	unsigned int index, feat;
+	int i;
+
+	pr_debug(" -> fw_vec5_feature_init()\n");
+
+	for (i = 0; i < ARRAY_SIZE(vec5_fw_features_table); i++) {
+		index = OV5_INDX(vec5_fw_features_table[i].feature);
+		feat = OV5_FEAT(vec5_fw_features_table[i].feature);
+
+		if (vec5[index] & feat)
+			powerpc_firmware_features |=
+				vec5_fw_features_table[i].val;
+	}
+
+	pr_debug(" <- fw_vec5_feature_init()\n");
 }
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index a20ead87153..20d62975856 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -19,111 +19,215 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/interrupt.h>
 #include <linux/delay.h>
+#include <linux/sched.h>	/* for idle_task_exit */
 #include <linux/cpu.h>
-#include <asm/system.h>
+#include <linux/of.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/firmware.h>
 #include <asm/machdep.h>
 #include <asm/vdso_datapage.h>
-#include <asm/pSeries_reconfig.h>
-#include "xics.h"
-#include "plpar_wrappers.h"
+#include <asm/xics.h>
+#include <asm/plpar_wrappers.h>
+
+#include "offline_states.h"
 
 /* This version can't take the spinlock, because it never returns */
-static struct rtas_args rtas_stop_self_args = {
-	.token = RTAS_UNKNOWN_SERVICE,
-	.nargs = 0,
-	.nret = 1,
-	.rets = &rtas_stop_self_args.args[0],
-};
+static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
+
+static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
+							CPU_STATE_OFFLINE;
+static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
+
+static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
+
+static int cede_offline_enabled __read_mostly = 1;
+
+/*
+ * Enable/disable cede_offline when available.
+ */
+static int __init setup_cede_offline(char *str)
+{
+	if (!strcmp(str, "off"))
+		cede_offline_enabled = 0;
+	else if (!strcmp(str, "on"))
+		cede_offline_enabled = 1;
+	else
+		return 0;
+	return 1;
+}
+
+__setup("cede_offline=", setup_cede_offline);
+
+enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+	return per_cpu(current_state, cpu);
+}
+
+void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+	per_cpu(current_state, cpu) = state;
+}
+
+enum cpu_state_vals get_preferred_offline_state(int cpu)
+{
+	return per_cpu(preferred_offline_state, cpu);
+}
+
+void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+	per_cpu(preferred_offline_state, cpu) = state;
+}
+
+void set_default_offline_state(int cpu)
+{
+	per_cpu(preferred_offline_state, cpu) = default_offline_state;
+}
 
 static void rtas_stop_self(void)
 {
-	struct rtas_args *args = &rtas_stop_self_args;
+	static struct rtas_args args = {
+		.nargs = 0,
+		.nret = 1,
+		.rets = &args.args[0],
+	};
+
+	args.token = cpu_to_be32(rtas_stop_self_token);
 
 	local_irq_disable();
 
-	BUG_ON(args->token == RTAS_UNKNOWN_SERVICE);
+	BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE);
 
 	printk("cpu %u (hwid %u) Ready to die...\n",
 	       smp_processor_id(), hard_smp_processor_id());
-	enter_rtas(__pa(args));
+	enter_rtas(__pa(&args));
 
 	panic("Alas, I survived.\n");
 }
 
 static void pseries_mach_cpu_die(void)
 {
+	unsigned int cpu = smp_processor_id();
+	unsigned int hwcpu = hard_smp_processor_id();
+	u8 cede_latency_hint = 0;
+
 	local_irq_disable();
 	idle_task_exit();
 	xics_teardown_cpu();
-	unregister_slb_shadow(hard_smp_processor_id(), __pa(get_slb_shadow()));
-	rtas_stop_self();
-	/* Should never get here... */
-	BUG();
-	for(;;);
-}
 
-static int qcss_tok;	/* query-cpu-stopped-state token */
+	if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+		set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
+		if (ppc_md.suspend_disable_cpu)
+			ppc_md.suspend_disable_cpu();
 
-/* Get state of physical CPU.
- * Return codes:
- *	0	- The processor is in the RTAS stopped state
- *	1	- stop-self is in progress
- *	2	- The processor is not in the RTAS stopped state
- *	-1	- Hardware Error
- *	-2	- Hardware Busy, Try again later.
- */
-static int query_cpu_stopped(unsigned int pcpu)
-{
-	int cpu_status, status;
+		cede_latency_hint = 2;
+
+		get_lppaca()->idle = 1;
+		if (!lppaca_shared_proc(get_lppaca()))
+			get_lppaca()->donate_dedicated_cpu = 1;
+
+		while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+			while (!prep_irq_for_idle()) {
+				local_irq_enable();
+				local_irq_disable();
+			}
+
+			extended_cede_processor(cede_latency_hint);
+		}
+
+		local_irq_disable();
+
+		if (!lppaca_shared_proc(get_lppaca()))
+			get_lppaca()->donate_dedicated_cpu = 0;
+		get_lppaca()->idle = 0;
+
+		if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
+			unregister_slb_shadow(hwcpu);
 
-	status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
-	if (status != 0) {
-		printk(KERN_ERR
-		       "RTAS query-cpu-stopped-state failed: %i\n", status);
-		return status;
+			hard_irq_disable();
+			/*
+			 * Call to start_secondary_resume() will not return.
+			 * Kernel stack will be reset and start_secondary()
+			 * will be called to continue the online operation.
+			 */
+			start_secondary_resume();
+		}
 	}
 
-	return cpu_status;
+	/* Requested state is CPU_STATE_OFFLINE at this point */
+	WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
+
+	set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
+	unregister_slb_shadow(hwcpu);
+	rtas_stop_self();
+
+	/* Should never get here... */
+	BUG();
+	for(;;);
 }
 
 static int pseries_cpu_disable(void)
 {
 	int cpu = smp_processor_id();
 
-	cpu_clear(cpu, cpu_online_map);
+	set_cpu_online(cpu, false);
 	vdso_data->processorCount--;
 
 	/*fix boot_cpuid here*/
 	if (cpu == boot_cpuid)
-		boot_cpuid = any_online_cpu(cpu_online_map);
+		boot_cpuid = cpumask_any(cpu_online_mask);
 
 	/* FIXME: abstract this to not be platform specific later on */
 	xics_migrate_irqs_away();
 	return 0;
 }
 
+/*
+ * pseries_cpu_die: Wait for the cpu to die.
+ * @cpu: logical processor id of the CPU whose death we're awaiting.
+ *
+ * This function is called from the context of the thread which is performing
+ * the cpu-offline. Here we wait for long enough to allow the cpu in question
+ * to self-destroy so that the cpu-offline thread can send the CPU_DEAD
+ * notifications.
+ *
+ * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to
+ * self-destruct.
+ */
 static void pseries_cpu_die(unsigned int cpu)
 {
 	int tries;
-	int cpu_status;
+	int cpu_status = 1;
 	unsigned int pcpu = get_hard_smp_processor_id(cpu);
 
-	for (tries = 0; tries < 25; tries++) {
-		cpu_status = query_cpu_stopped(pcpu);
-		if (cpu_status == 0 || cpu_status == -1)
-			break;
-		cpu_relax();
+	if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+		cpu_status = 1;
+		for (tries = 0; tries < 5000; tries++) {
+			if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
+				cpu_status = 0;
+				break;
+			}
+			msleep(1);
+		}
+	} else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
+
+		for (tries = 0; tries < 25; tries++) {
+			cpu_status = smp_query_cpu_stopped(pcpu);
+			if (cpu_status == QCSS_STOPPED ||
+			    cpu_status == QCSS_HARDWARE_ERROR)
+				break;
+			cpu_relax();
+		}
 	}
+
 	if (cpu_status != 0) {
 		printk("Querying DEAD? cpu %i (%i) shows %i\n",
 		       cpu, pcpu, cpu_status);
 	}
 
-	/* Isolation and deallocation are definatly done by
+	/* Isolation and deallocation are definitely done by
 	 * drslot_chrp_cpu.  If they were not they would be
 	 * done here.  Change isolate state to Isolate and
 	 * change allocation-state to Unusable.
@@ -132,7 +236,7 @@ static void pseries_cpu_die(unsigned int cpu)
 }
 
 /*
- * Update cpu_present_map and paca(s) for a new cpu node.  The wrinkle
+ * Update cpu_present_mask and paca(s) for a new cpu node.  The wrinkle
  * here is that a cpu device node may represent up to two logical cpus
  * in the SMT case.  We must honor the assumption in other code that
  * the logical ids for sibling SMT threads x and y are adjacent, such
@@ -141,7 +245,7 @@ static void pseries_cpu_die(unsigned int cpu)
 static int pseries_add_processor(struct device_node *np)
 {
 	unsigned int cpu;
-	cpumask_t candidate_map, tmp = CPU_MASK_NONE;
+	cpumask_var_t candidate_mask, tmp;
 	int err = -ENOSPC, len, nthreads, i;
 	const u32 *intserv;
 
@@ -149,48 +253,53 @@ static int pseries_add_processor(struct device_node *np)
 	if (!intserv)
 		return 0;
 
+	zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&tmp, GFP_KERNEL);
+
 	nthreads = len / sizeof(u32);
 	for (i = 0; i < nthreads; i++)
-		cpu_set(i, tmp);
+		cpumask_set_cpu(i, tmp);
 
 	cpu_maps_update_begin();
 
-	BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
+	BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
 
 	/* Get a bitmap of unoccupied slots. */
-	cpus_xor(candidate_map, cpu_possible_map, cpu_present_map);
-	if (cpus_empty(candidate_map)) {
+	cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
+	if (cpumask_empty(candidate_mask)) {
 		/* If we get here, it most likely means that NR_CPUS is
 		 * less than the partition's max processors setting.
 		 */
 		printk(KERN_ERR "Cannot add cpu %s; this system configuration"
 		       " supports %d logical cpus.\n", np->full_name,
-		       cpus_weight(cpu_possible_map));
+		       cpumask_weight(cpu_possible_mask));
 		goto out_unlock;
 	}
 
-	while (!cpus_empty(tmp))
-		if (cpus_subset(tmp, candidate_map))
+	while (!cpumask_empty(tmp))
+		if (cpumask_subset(tmp, candidate_mask))
 			/* Found a range where we can insert the new cpu(s) */
 			break;
 		else
-			cpus_shift_left(tmp, tmp, nthreads);
+			cpumask_shift_left(tmp, tmp, nthreads);
 
-	if (cpus_empty(tmp)) {
-		printk(KERN_ERR "Unable to find space in cpu_present_map for"
+	if (cpumask_empty(tmp)) {
+		printk(KERN_ERR "Unable to find space in cpu_present_mask for"
 		       " processor %s with %d thread(s)\n", np->name,
 		       nthreads);
 		goto out_unlock;
 	}
 
-	for_each_cpu_mask(cpu, tmp) {
-		BUG_ON(cpu_isset(cpu, cpu_present_map));
-		cpu_set(cpu, cpu_present_map);
+	for_each_cpu(cpu, tmp) {
+		BUG_ON(cpu_present(cpu));
+		set_cpu_present(cpu, true);
 		set_hard_smp_processor_id(cpu, *intserv++);
 	}
 	err = 0;
 out_unlock:
 	cpu_maps_update_done();
+	free_cpumask_var(candidate_mask);
+	free_cpumask_var(tmp);
 	return err;
 }
 
@@ -217,11 +326,11 @@ static void pseries_remove_processor(struct device_node *np)
 			if (get_hard_smp_processor_id(cpu) != intserv[i])
 				continue;
 			BUG_ON(cpu_online(cpu));
-			cpu_clear(cpu, cpu_present_map);
+			set_cpu_present(cpu, false);
 			set_hard_smp_processor_id(cpu, -1);
 			break;
 		}
-		if (cpu == NR_CPUS)
+		if (cpu >= nr_cpu_ids)
 			printk(KERN_WARNING "Could not find cpu to remove "
 			       "with physical id 0x%x\n", intserv[i]);
 	}
@@ -231,31 +340,47 @@ static void pseries_remove_processor(struct device_node *np)
 static int pseries_smp_notifier(struct notifier_block *nb,
 				unsigned long action, void *node)
 {
-	int err = NOTIFY_OK;
+	int err = 0;
 
 	switch (action) {
-	case PSERIES_RECONFIG_ADD:
-		if (pseries_add_processor(node))
-			err = NOTIFY_BAD;
+	case OF_RECONFIG_ATTACH_NODE:
+		err = pseries_add_processor(node);
 		break;
-	case PSERIES_RECONFIG_REMOVE:
+	case OF_RECONFIG_DETACH_NODE:
 		pseries_remove_processor(node);
 		break;
-	default:
-		err = NOTIFY_DONE;
-		break;
 	}
-	return err;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block pseries_smp_nb = {
 	.notifier_call = pseries_smp_notifier,
 };
 
+#define MAX_CEDE_LATENCY_LEVELS		4
+#define	CEDE_LATENCY_PARAM_LENGTH	10
+#define CEDE_LATENCY_PARAM_MAX_LENGTH	\
+	(MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
+#define CEDE_LATENCY_TOKEN		45
+
+static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+	memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH);
+	return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+			 NULL,
+			 CEDE_LATENCY_TOKEN,
+			 __pa(cede_parameters),
+			 CEDE_LATENCY_PARAM_MAX_LENGTH);
+}
+
 static int __init pseries_cpu_hotplug_init(void)
 {
 	struct device_node *np;
 	const char *typep;
+	int cpu;
+	int qcss_tok;
 
 	for_each_node_by_name(np, "interrupt-controller") {
 		typep = of_get_property(np, "compatible", NULL);
@@ -268,10 +393,10 @@ static int __init pseries_cpu_hotplug_init(void)
 		}
 	}
 
-	rtas_stop_self_args.token = rtas_token("stop-self");
+	rtas_stop_self_token = rtas_token("stop-self");
 	qcss_tok = rtas_token("query-cpu-stopped-state");
 
-	if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE ||
+	if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE ||
 			qcss_tok == RTAS_UNKNOWN_SERVICE) {
 		printk(KERN_INFO "CPU Hotplug not supported by firmware "
 				"- disabling.\n");
@@ -283,9 +408,17 @@ static int __init pseries_cpu_hotplug_init(void)
 	smp_ops->cpu_die = pseries_cpu_die;
 
 	/* Processors can be added/removed only on LPAR */
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		pSeries_reconfig_notifier_register(&pseries_smp_nb);
+	if (firmware_has_feature(FW_FEATURE_LPAR)) {
+		of_reconfig_notifier_register(&pseries_smp_nb);
+		cpu_maps_update_begin();
+		if (cede_offline_enabled && parse_cede_parameters() == 0) {
+			default_offline_state = CPU_STATE_INACTIVE;
+			for_each_online_cpu(cpu)
+				set_default_offline_state(cpu);
+		}
+		cpu_maps_update_done();
+	}
 
 	return 0;
 }
-arch_initcall(pseries_cpu_hotplug_init);
+machine_arch_initcall(pseries, pseries_cpu_hotplug_init);
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9b21ee68ea5..7995135170a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -10,54 +10,107 @@
  */
 
 #include <linux/of.h>
-#include <linux/lmb.h>
+#include <linux/of_address.h>
+#include <linux/memblock.h>
+#include <linux/vmalloc.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+
 #include <asm/firmware.h>
 #include <asm/machdep.h>
-#include <asm/pSeries_reconfig.h>
+#include <asm/prom.h>
 #include <asm/sparsemem.h>
 
-static int pseries_remove_lmb(unsigned long base, unsigned int lmb_size)
+unsigned long pseries_memory_block_size(void)
 {
-	unsigned long start, start_pfn;
-	struct zone *zone;
-	int ret;
+	struct device_node *np;
+	unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
+	struct resource r;
 
-	start_pfn = base >> PAGE_SHIFT;
+	np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (np) {
+		const __be64 *size;
 
-	if (!pfn_valid(start_pfn)) {
-		lmb_remove(base, lmb_size);
-		return 0;
+		size = of_get_property(np, "ibm,lmb-size", NULL);
+		if (size)
+			memblock_size = be64_to_cpup(size);
+		of_node_put(np);
+	} else  if (machine_is(pseries)) {
+		/* This fallback really only applies to pseries */
+		unsigned int memzero_size = 0;
+
+		np = of_find_node_by_path("/memory@0");
+		if (np) {
+			if (!of_address_to_resource(np, 0, &r))
+				memzero_size = resource_size(&r);
+			of_node_put(np);
+		}
+
+		if (memzero_size) {
+			/* We now know the size of memory@0, use this to find
+			 * the first memoryblock and get its size.
+			 */
+			char buf[64];
+
+			sprintf(buf, "/memory@%x", memzero_size);
+			np = of_find_node_by_path(buf);
+			if (np) {
+				if (!of_address_to_resource(np, 0, &r))
+					memblock_size = resource_size(&r);
+				of_node_put(np);
+			}
+		}
 	}
+	return memblock_size;
+}
 
-	zone = page_zone(pfn_to_page(start_pfn));
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static int pseries_remove_memory(u64 start, u64 size)
+{
+	int ret;
 
-	/*
-	 * Remove section mappings and sysfs entries for the
-	 * section of the memory we are removing.
-	 *
-	 * NOTE: Ideally, this should be done in generic code like
-	 * remove_memory(). But remove_memory() gets called by writing
-	 * to sysfs "state" file and we can't remove sysfs entries
-	 * while writing to it. So we have to defer it to here.
-	 */
-	ret = __remove_pages(zone, start_pfn, lmb_size >> PAGE_SHIFT);
-	if (ret)
-		return ret;
+	/* Remove htab bolted mappings for this section of memory */
+	start = (unsigned long)__va(start);
+	ret = remove_section_mapping(start, start + size);
 
-	/*
-	 * Update memory regions for memory remove
+	/* Ensure all vmalloc mappings are flushed in case they also
+	 * hit that section of memory
 	 */
-	lmb_remove(base, lmb_size);
+	vm_unmap_aliases();
 
-	/*
-	 * Remove htab bolted mappings for this section of memory
-	 */
-	start = (unsigned long)__va(base);
-	ret = remove_section_mapping(start, start + lmb_size);
 	return ret;
 }
 
-static int pseries_remove_memory(struct device_node *np)
+static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
+{
+	unsigned long block_sz, start_pfn;
+	int sections_per_block;
+	int i, nid;
+
+	start_pfn = base >> PAGE_SHIFT;
+
+	lock_device_hotplug();
+
+	if (!pfn_valid(start_pfn))
+		goto out;
+
+	block_sz = pseries_memory_block_size();
+	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+	nid = memory_add_physaddr_to_nid(base);
+
+	for (i = 0; i < sections_per_block; i++) {
+		remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+		base += MIN_MEMORY_BLOCK_SIZE;
+	}
+
+out:
+	/* Update memory regions for memory remove */
+	memblock_remove(base, memblock_size);
+	unlock_device_hotplug();
+	return 0;
+}
+
+static int pseries_remove_mem_node(struct device_node *np)
 {
 	const char *type;
 	const unsigned int *regs;
@@ -73,7 +126,7 @@ static int pseries_remove_memory(struct device_node *np)
 		return 0;
 
 	/*
-	 * Find the bae address and size of the lmb
+	 * Find the bae address and size of the memblock
 	 */
 	regs = of_get_property(np, "reg", NULL);
 	if (!regs)
@@ -82,11 +135,22 @@ static int pseries_remove_memory(struct device_node *np)
 	base = *(unsigned long *)regs;
 	lmb_size = regs[3];
 
-	ret = pseries_remove_lmb(base, lmb_size);
-	return ret;
+	pseries_remove_memblock(base, lmb_size);
+	return 0;
+}
+#else
+static inline int pseries_remove_memblock(unsigned long base,
+					  unsigned int memblock_size)
+{
+	return -EOPNOTSUPP;
 }
+static inline int pseries_remove_mem_node(struct device_node *np)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
 
-static int pseries_add_memory(struct device_node *np)
+static int pseries_add_mem_node(struct device_node *np)
 {
 	const char *type;
 	const unsigned int *regs;
@@ -102,7 +166,7 @@ static int pseries_add_memory(struct device_node *np)
 		return 0;
 
 	/*
-	 * Find the base and size of the lmb
+	 * Find the base and size of the memblock
 	 */
 	regs = of_get_property(np, "reg", NULL);
 	if (!regs)
@@ -114,63 +178,76 @@ static int pseries_add_memory(struct device_node *np)
 	/*
 	 * Update memory region to represent the memory add
 	 */
-	ret = lmb_add(base, lmb_size);
+	ret = memblock_add(base, lmb_size);
 	return (ret < 0) ? -EINVAL : 0;
 }
 
-static int pseries_drconf_memory(unsigned long *base, unsigned int action)
+static int pseries_update_drconf_memory(struct of_prop_reconfig *pr)
 {
-	struct device_node *np;
-	const unsigned long *lmb_size;
-	int rc;
+	struct of_drconf_cell *new_drmem, *old_drmem;
+	unsigned long memblock_size;
+	u32 entries;
+	u32 *p;
+	int i, rc = -EINVAL;
 
-	np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-	if (!np)
+	memblock_size = pseries_memory_block_size();
+	if (!memblock_size)
 		return -EINVAL;
 
-	lmb_size = of_get_property(np, "ibm,lmb-size", NULL);
-	if (!lmb_size) {
-		of_node_put(np);
+	p = (u32 *)of_get_property(pr->dn, "ibm,dynamic-memory", NULL);
+	if (!p)
 		return -EINVAL;
-	}
 
-	if (action == PSERIES_DRCONF_MEM_ADD) {
-		rc = lmb_add(*base, *lmb_size);
-		rc = (rc < 0) ? -EINVAL : 0;
-	} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
-		rc = pseries_remove_lmb(*base, *lmb_size);
-	} else {
-		rc = -EINVAL;
+	/* The first int of the property is the number of lmb's described
+	 * by the property. This is followed by an array of of_drconf_cell
+	 * entries. Get the niumber of entries and skip to the array of
+	 * of_drconf_cell's.
+	 */
+	entries = *p++;
+	old_drmem = (struct of_drconf_cell *)p;
+
+	p = (u32 *)pr->prop->value;
+	p++;
+	new_drmem = (struct of_drconf_cell *)p;
+
+	for (i = 0; i < entries; i++) {
+		if ((old_drmem[i].flags & DRCONF_MEM_ASSIGNED) &&
+		    (!(new_drmem[i].flags & DRCONF_MEM_ASSIGNED))) {
+			rc = pseries_remove_memblock(old_drmem[i].base_addr,
+						     memblock_size);
+			break;
+		} else if ((!(old_drmem[i].flags & DRCONF_MEM_ASSIGNED)) &&
+			   (new_drmem[i].flags & DRCONF_MEM_ASSIGNED)) {
+			rc = memblock_add(old_drmem[i].base_addr,
+					  memblock_size);
+			rc = (rc < 0) ? -EINVAL : 0;
+			break;
+		}
 	}
 
-	of_node_put(np);
 	return rc;
 }
 
 static int pseries_memory_notifier(struct notifier_block *nb,
-				unsigned long action, void *node)
+				   unsigned long action, void *node)
 {
-	int err = NOTIFY_OK;
+	struct of_prop_reconfig *pr;
+	int err = 0;
 
 	switch (action) {
-	case PSERIES_RECONFIG_ADD:
-		if (pseries_add_memory(node))
-			err = NOTIFY_BAD;
+	case OF_RECONFIG_ATTACH_NODE:
+		err = pseries_add_mem_node(node);
 		break;
-	case PSERIES_RECONFIG_REMOVE:
-		if (pseries_remove_memory(node))
-			err = NOTIFY_BAD;
+	case OF_RECONFIG_DETACH_NODE:
+		err = pseries_remove_mem_node(node);
 		break;
-	case PSERIES_DRCONF_MEM_ADD:
-	case PSERIES_DRCONF_MEM_REMOVE:
-		if (pseries_drconf_memory(node, action))
-			err = NOTIFY_BAD;
-		break;
-	default:
-		err = NOTIFY_DONE;
+	case OF_RECONFIG_UPDATE_PROPERTY:
+		pr = (struct of_prop_reconfig *)node;
+		if (!strcmp(pr->prop->name, "ibm,dynamic-memory"))
+			err = pseries_update_drconf_memory(pr);
 		break;
 	}
-	return err;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block pseries_mem_nb = {
@@ -180,7 +257,11 @@ static struct notifier_block pseries_mem_nb = {
 static int __init pseries_memory_hotplug_init(void)
 {
 	if (firmware_has_feature(FW_FEATURE_LPAR))
-		pSeries_reconfig_notifier_register(&pseries_mem_nb);
+		of_reconfig_notifier_register(&pseries_mem_nb);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	ppc_md.remove_memory = pseries_remove_memory;
+#endif
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
index c1427b3634e..99ecf0a5a92 100644
--- a/arch/powerpc/platforms/pseries/hvCall.S
+++ b/arch/powerpc/platforms/pseries/hvCall.S
@@ -11,100 +11,126 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
 	
-#define STK_PARM(i)     (48 + ((i)-3)*8)
+#ifdef CONFIG_TRACEPOINTS
+
+	.section	".toc","aw"
+
+	.globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+	.llong	0
+
+	.section	".text"
 
-#ifdef CONFIG_HCALL_STATS
 /*
- * precall must preserve all registers.  use unused STK_PARM()
- * areas to save snapshots and opcode.
+ * precall must preserve all registers.  use unused STK_PARAM()
+ * areas to save snapshots and opcode. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
  */
-#define HCALL_INST_PRECALL					\
-	std	r3,STK_PARM(r3)(r1);	/* save opcode */	\
-	mftb	r0;			/* get timebase and */	\
-	std     r0,STK_PARM(r5)(r1);	/* save for later */	\
+#define HCALL_INST_PRECALL(FIRST_REG)				\
 BEGIN_FTR_SECTION;						\
-	mfspr	r0,SPRN_PURR;		/* get PURR and */	\
-	std	r0,STK_PARM(r6)(r1);	/* save for later */	\
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);
-	
+	b	1f;						\
+END_FTR_SECTION(0, 1);						\
+	ld      r12,hcall_tracepoint_refcount@toc(r2);		\
+	std	r12,32(r1);					\
+	cmpdi	r12,0;						\
+	beq+	1f;						\
+	mflr	r0;						\
+	std	r3,STK_PARAM(R3)(r1);				\
+	std	r4,STK_PARAM(R4)(r1);				\
+	std	r5,STK_PARAM(R5)(r1);				\
+	std	r6,STK_PARAM(R6)(r1);				\
+	std	r7,STK_PARAM(R7)(r1);				\
+	std	r8,STK_PARAM(R8)(r1);				\
+	std	r9,STK_PARAM(R9)(r1);				\
+	std	r10,STK_PARAM(R10)(r1);				\
+	std	r0,16(r1);					\
+	addi	r4,r1,STK_PARAM(FIRST_REG);			\
+	stdu	r1,-STACK_FRAME_OVERHEAD(r1);			\
+	bl	__trace_hcall_entry;				\
+	addi	r1,r1,STACK_FRAME_OVERHEAD;			\
+	ld	r0,16(r1);					\
+	ld	r3,STK_PARAM(R3)(r1);				\
+	ld	r4,STK_PARAM(R4)(r1);				\
+	ld	r5,STK_PARAM(R5)(r1);				\
+	ld	r6,STK_PARAM(R6)(r1);				\
+	ld	r7,STK_PARAM(R7)(r1);				\
+	ld	r8,STK_PARAM(R8)(r1);				\
+	ld	r9,STK_PARAM(R9)(r1);				\
+	ld	r10,STK_PARAM(R10)(r1);				\
+	mtlr	r0;						\
+1:
+
 /*
  * postcall is performed immediately before function return which
  * allows liberal use of volatile registers.  We branch around this
  * in early init (eg when populating the MMU hashtable) by using an
  * unconditional cpu feature.
  */
-#define HCALL_INST_POSTCALL					\
+#define __HCALL_INST_POSTCALL					\
 BEGIN_FTR_SECTION;						\
 	b	1f;						\
 END_FTR_SECTION(0, 1);						\
-	ld	r4,STK_PARM(r3)(r1);	/* validate opcode */	\
-	cmpldi	cr7,r4,MAX_HCALL_OPCODE;			\
-	bgt-	cr7,1f;						\
-								\
-	/* get time and PURR snapshots after hcall */		\
-	mftb	r7;			/* timebase after */	\
-BEGIN_FTR_SECTION;						\
-	mfspr	r8,SPRN_PURR;		/* PURR after */	\
-	ld	r6,STK_PARM(r6)(r1);	/* PURR before */	\
-	subf	r6,r6,r8;		/* delta */		\
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);				\
-	ld	r5,STK_PARM(r5)(r1);	/* timebase before */	\
-	subf	r5,r5,r7;		/* time delta */	\
-								\
-	/* calculate address of stat structure r4 = opcode */	\
-	srdi	r4,r4,2;		/* index into array */	\
-	mulli	r4,r4,HCALL_STAT_SIZE;				\
-	LOAD_REG_ADDR(r7, per_cpu__hcall_stats);		\
-	add	r4,r4,r7;					\
-	ld	r7,PACA_DATA_OFFSET(r13); /* per cpu offset */	\
-	add	r4,r4,r7;					\
-								\
-	/* update stats	*/					\
-	ld	r7,HCALL_STAT_CALLS(r4); /* count */		\
-	addi	r7,r7,1;					\
-	std	r7,HCALL_STAT_CALLS(r4);			\
-	ld      r7,HCALL_STAT_TB(r4);	/* timebase */		\
-	add	r7,r7,r5;					\
-	std	r7,HCALL_STAT_TB(r4);				\
-BEGIN_FTR_SECTION;						\
-	ld	r7,HCALL_STAT_PURR(r4);	/* PURR */		\
-	add	r7,r7,r6;					\
-	std	r7,HCALL_STAT_PURR(r4);				\
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);				\
+	ld      r12,32(r1);					\
+	cmpdi	r12,0;						\
+	beq+	1f;						\
+	mflr	r0;						\
+	ld	r6,STK_PARAM(R3)(r1);				\
+	std	r3,STK_PARAM(R3)(r1);				\
+	mr	r4,r3;						\
+	mr	r3,r6;						\
+	std	r0,16(r1);					\
+	stdu	r1,-STACK_FRAME_OVERHEAD(r1);			\
+	bl	__trace_hcall_exit;				\
+	addi	r1,r1,STACK_FRAME_OVERHEAD;			\
+	ld	r0,16(r1);					\
+	ld	r3,STK_PARAM(R3)(r1);				\
+	mtlr	r0;						\
 1:
+
+#define HCALL_INST_POSTCALL_NORETS				\
+	li	r5,0;						\
+	__HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG)				\
+	mr	r5,BUFREG;					\
+	__HCALL_INST_POSTCALL
+
 #else
-#define HCALL_INST_PRECALL
-#define HCALL_INST_POSTCALL
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
 #endif
 
 	.text
 
-_GLOBAL(plpar_hcall_norets)
+_GLOBAL_TOC(plpar_hcall_norets)
 	HMT_MEDIUM
 
 	mfcr	r0
 	stw	r0,8(r1)
 
-	HCALL_INST_PRECALL
+	HCALL_INST_PRECALL(R4)
 
 	HVSC				/* invoke the hypervisor */
 
-	HCALL_INST_POSTCALL
+	HCALL_INST_POSTCALL_NORETS
 
 	lwz	r0,8(r1)
 	mtcrf	0xff,r0
 	blr				/* return r3 = status */
 
-_GLOBAL(plpar_hcall)
+_GLOBAL_TOC(plpar_hcall)
 	HMT_MEDIUM
 
 	mfcr	r0
 	stw	r0,8(r1)
 
-	HCALL_INST_PRECALL
+	HCALL_INST_PRECALL(R5)
 
-	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+	std     r4,STK_PARAM(R4)(r1)     /* Save ret buffer */
 
 	mr	r4,r5
 	mr	r5,r6
@@ -115,13 +141,13 @@ _GLOBAL(plpar_hcall)
 
 	HVSC				/* invoke the hypervisor */
 
-	ld	r12,STK_PARM(r4)(r1)
+	ld	r12,STK_PARAM(R4)(r1)
 	std	r4,  0(r12)
 	std	r5,  8(r12)
 	std	r6, 16(r12)
 	std	r7, 24(r12)
 
-	HCALL_INST_POSTCALL
+	HCALL_INST_POSTCALL(r12)
 
 	lwz	r0,8(r1)
 	mtcrf	0xff,r0
@@ -140,7 +166,7 @@ _GLOBAL(plpar_hcall_raw)
 	mfcr	r0
 	stw	r0,8(r1)
 
-	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+	std     r4,STK_PARAM(R4)(r1)     /* Save ret buffer */
 
 	mr	r4,r5
 	mr	r5,r6
@@ -151,7 +177,7 @@ _GLOBAL(plpar_hcall_raw)
 
 	HVSC				/* invoke the hypervisor */
 
-	ld	r12,STK_PARM(r4)(r1)
+	ld	r12,STK_PARAM(R4)(r1)
 	std	r4,  0(r12)
 	std	r5,  8(r12)
 	std	r6, 16(r12)
@@ -162,15 +188,15 @@ _GLOBAL(plpar_hcall_raw)
 
 	blr				/* return r3 = status */
 
-_GLOBAL(plpar_hcall9)
+_GLOBAL_TOC(plpar_hcall9)
 	HMT_MEDIUM
 
 	mfcr	r0
 	stw	r0,8(r1)
 
-	HCALL_INST_PRECALL
+	HCALL_INST_PRECALL(R5)
 
-	std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
+	std     r4,STK_PARAM(R4)(r1)     /* Save ret buffer */
 
 	mr	r4,r5
 	mr	r5,r6
@@ -178,14 +204,14 @@ _GLOBAL(plpar_hcall9)
 	mr	r7,r8
 	mr	r8,r9
 	mr	r9,r10
-	ld	r10,STK_PARM(r11)(r1)	 /* put arg7 in R10 */
-	ld	r11,STK_PARM(r12)(r1)	 /* put arg8 in R11 */
-	ld	r12,STK_PARM(r13)(r1)    /* put arg9 in R12 */
+	ld	r10,STK_PARAM(R11)(r1)	 /* put arg7 in R10 */
+	ld	r11,STK_PARAM(R12)(r1)	 /* put arg8 in R11 */
+	ld	r12,STK_PARAM(R13)(r1)    /* put arg9 in R12 */
 
 	HVSC				/* invoke the hypervisor */
 
 	mr	r0,r12
-	ld	r12,STK_PARM(r4)(r1)
+	ld	r12,STK_PARAM(R4)(r1)
 	std	r4,  0(r12)
 	std	r5,  8(r12)
 	std	r6, 16(r12)
@@ -196,7 +222,45 @@ _GLOBAL(plpar_hcall9)
 	std	r11,56(r12)
 	std	r0, 64(r12)
 
-	HCALL_INST_POSTCALL
+	HCALL_INST_POSTCALL(r12)
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+
+	blr				/* return r3 = status */
+
+/* See plpar_hcall_raw to see why this is needed */
+_GLOBAL(plpar_hcall9_raw)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	std     r4,STK_PARAM(R4)(r1)     /* Save ret buffer */
+
+	mr	r4,r5
+	mr	r5,r6
+	mr	r6,r7
+	mr	r7,r8
+	mr	r8,r9
+	mr	r9,r10
+	ld	r10,STK_PARAM(R11)(r1)	 /* put arg7 in R10 */
+	ld	r11,STK_PARAM(R12)(r1)	 /* put arg8 in R11 */
+	ld	r12,STK_PARAM(R13)(r1)    /* put arg9 in R12 */
+
+	HVSC				/* invoke the hypervisor */
+
+	mr	r0,r12
+	ld	r12,STK_PARAM(R4)(r1)
+	std	r4,  0(r12)
+	std	r5,  8(r12)
+	std	r6, 16(r12)
+	std	r7, 24(r12)
+	std	r8, 32(r12)
+	std	r9, 40(r12)
+	std	r10,48(r12)
+	std	r11,56(r12)
+	std	r0, 64(r12)
 
 	lwz	r0,8(r1)
 	mtcrf	0xff,r0
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index eae51ef9af2..cf4e7736e4f 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -26,6 +26,7 @@
 #include <asm/hvcall.h>
 #include <asm/firmware.h>
 #include <asm/cputable.h>
+#include <asm/trace.h>
 
 DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
 
@@ -54,7 +55,7 @@ static void hc_stop(struct seq_file *m, void *p)
 static int hc_show(struct seq_file *m, void *p)
 {
 	unsigned long h_num = (unsigned long)p;
-	struct hcall_stats *hs = (struct hcall_stats *)m->private;
+	struct hcall_stats *hs = m->private;
 
 	if (hs[h_num].num_calls) {
 		if (cpu_has_feature(CPU_FTR_PURR))
@@ -71,7 +72,7 @@ static int hc_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static struct seq_operations hcall_inst_seq_ops = {
+static const struct seq_operations hcall_inst_seq_ops = {
         .start = hc_start,
         .next  = hc_next,
         .stop  = hc_stop,
@@ -85,7 +86,7 @@ static int hcall_inst_seq_open(struct inode *inode, struct file *file)
 
 	rc = seq_open(file, &hcall_inst_seq_ops);
 	seq = file->private_data;
-	seq->private = file->f_path.dentry->d_inode->i_private;
+	seq->private = file_inode(file)->i_private;
 
 	return rc;
 }
@@ -100,6 +101,33 @@ static const struct file_operations hcall_inst_seq_fops = {
 #define	HCALL_ROOT_DIR		"hcall_inst"
 #define CPU_NAME_BUF_SIZE	32
 
+
+static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args)
+{
+	struct hcall_stats *h;
+
+	if (opcode > MAX_HCALL_OPCODE)
+		return;
+
+	h = &__get_cpu_var(hcall_stats)[opcode / 4];
+	h->tb_start = mftb();
+	h->purr_start = mfspr(SPRN_PURR);
+}
+
+static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval,
+			     unsigned long *retbuf)
+{
+	struct hcall_stats *h;
+
+	if (opcode > MAX_HCALL_OPCODE)
+		return;
+
+	h = &__get_cpu_var(hcall_stats)[opcode / 4];
+	h->num_calls++;
+	h->tb_total += mftb() - h->tb_start;
+	h->purr_total += mfspr(SPRN_PURR) - h->purr_start;
+}
+
 static int __init hcall_inst_init(void)
 {
 	struct dentry *hcall_root;
@@ -110,6 +138,14 @@ static int __init hcall_inst_init(void)
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		return 0;
 
+	if (register_trace_hcall_entry(probe_hcall_entry, NULL))
+		return -EINVAL;
+
+	if (register_trace_hcall_exit(probe_hcall_exit, NULL)) {
+		unregister_trace_hcall_entry(probe_hcall_entry, NULL);
+		return -EINVAL;
+	}
+
 	hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
 	if (!hcall_root)
 		return -ENOMEM;
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
index 3f6a89b0981..849b29b3e9a 100644
--- a/arch/powerpc/platforms/pseries/hvconsole.c
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -24,10 +24,11 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/errno.h>
 #include <asm/hvcall.h>
 #include <asm/hvconsole.h>
-#include "plpar_wrappers.h"
+#include <asm/plpar_wrappers.h>
 
 /**
  * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper
@@ -39,10 +40,16 @@
  */
 int hvc_get_chars(uint32_t vtermno, char *buf, int count)
 {
-	unsigned long got;
+	long ret;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	unsigned long *lbuf = (unsigned long *)buf;
+
+	ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno);
+	lbuf[0] = be64_to_cpu(retbuf[1]);
+	lbuf[1] = be64_to_cpu(retbuf[2]);
 
-	if (plpar_get_term_char(vtermno, &got, buf) == H_SUCCESS)
-		return got;
+	if (ret == H_SUCCESS)
+		return retbuf[0];
 
 	return 0;
 }
@@ -68,12 +75,13 @@ int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
 	if (count > MAX_VIO_PUT_CHARS)
 		count = MAX_VIO_PUT_CHARS;
 
-	ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0],
-				 lbuf[1]);
+	ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count,
+				 cpu_to_be64(lbuf[0]),
+				 cpu_to_be64(lbuf[1]));
 	if (ret == H_SUCCESS)
 		return count;
 	if (ret == H_BUSY)
-		return 0;
+		return -EAGAIN;
 	return -EIO;
 }
 
diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
index fcf4b4cbeaf..4557e91626c 100644
--- a/arch/powerpc/platforms/pseries/hvcserver.c
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -23,6 +23,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 
 #include <asm/hvcall.h>
 #include <asm/hvcserver.h>
@@ -188,9 +189,9 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
 			= (unsigned int)last_p_partition_ID;
 
 		/* copy the Null-term char too */
-		strncpy(&next_partner_info->location_code[0],
+		strlcpy(&next_partner_info->location_code[0],
 			(char *)&pi_buff[2],
-			strlen((char *)&pi_buff[2]) + 1);
+			sizeof(next_partner_info->location_code));
 
 		list_add_tail(&(next_partner_info->node), head);
 		next_partner_info = NULL;
diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c
new file mode 100644
index 00000000000..0240c4ff878
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/io_event_irq.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/irq.h>
+#include <asm/io_event_irq.h>
+
+#include "pseries.h"
+
+/*
+ * IO event interrupt is a mechanism provided by RTAS to return
+ * information about hardware error and non-error events. Device
+ * drivers can register their event handlers to receive events.
+ * Device drivers are expected to use atomic_notifier_chain_register()
+ * and atomic_notifier_chain_unregister() to register and unregister
+ * their event handlers. Since multiple IO event types and scopes
+ * share an IO event interrupt, the event handlers are called one
+ * by one until the IO event is claimed by one of the handlers.
+ * The event handlers are expected to return NOTIFY_OK if the
+ * event is handled by the event handler or NOTIFY_DONE if the
+ * event does not belong to the handler.
+ *
+ * Usage:
+ *
+ * Notifier function:
+ * #include <asm/io_event_irq.h>
+ * int event_handler(struct notifier_block *nb, unsigned long val, void *data) {
+ * 	p = (struct pseries_io_event_sect_data *) data;
+ * 	if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE;
+ * 		:
+ * 		:
+ * 	return NOTIFY_OK;
+ * }
+ * struct notifier_block event_nb = {
+ * 	.notifier_call = event_handler,
+ * }
+ *
+ * Registration:
+ * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb);
+ *
+ * Unregistration:
+ * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb);
+ */
+
+ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list);
+EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list);
+
+static int ioei_check_exception_token;
+
+static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
+
+/**
+ * Find the data portion of an IO Event section from event log.
+ * @elog: RTAS error/event log.
+ *
+ * Return:
+ * 	pointer to a valid IO event section data. NULL if not found.
+ */
+static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
+{
+	struct pseries_errorlog *sect;
+
+	/* We should only ever get called for io-event interrupts, but if
+	 * we do get called for another type then something went wrong so
+	 * make some noise about it.
+	 * RTAS_TYPE_IO only exists in extended event log version 6 or later.
+	 * No need to check event log version.
+	 */
+	if (unlikely(rtas_error_type(elog) != RTAS_TYPE_IO)) {
+		printk_once(KERN_WARNING"io_event_irq: Unexpected event type %d",
+			    rtas_error_type(elog));
+		return NULL;
+	}
+
+	sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
+	if (unlikely(!sect)) {
+		printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
+			    "log does not contain an IO Event section. "
+			    "Could be a bug in system firmware!\n");
+		return NULL;
+	}
+	return (struct pseries_io_event *) &sect->data;
+}
+
+/*
+ * PAPR:
+ * - check-exception returns the first found error or event and clear that
+ *   error or event so it is reported once.
+ * - Each interrupt returns one event. If a plateform chooses to report
+ *   multiple events through a single interrupt, it must ensure that the
+ *   interrupt remains asserted until check-exception has been used to
+ *   process all out-standing events for that interrupt.
+ *
+ * Implementation notes:
+ * - Events must be processed in the order they are returned. Hence,
+ *   sequential in nature.
+ * - The owner of an event is determined by combinations of scope,
+ *   event type, and sub-type. There is no easy way to pre-sort clients
+ *   by scope or event type alone. For example, Torrent ISR route change
+ *   event is reported with scope 0x00 (Not Applicatable) rather than
+ *   0x3B (Torrent-hub). It is better to let the clients to identify
+ *   who owns the event.
+ */
+
+static irqreturn_t ioei_interrupt(int irq, void *dev_id)
+{
+	struct pseries_io_event *event;
+	int rtas_rc;
+
+	for (;;) {
+		rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL,
+				    RTAS_VECTOR_EXTERNAL_INTERRUPT,
+				    virq_to_hw(irq),
+				    RTAS_IO_EVENTS, 1 /* Time Critical */,
+				    __pa(ioei_rtas_buf),
+				    RTAS_DATA_BUF_SIZE);
+		if (rtas_rc != 0)
+			break;
+
+		event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf);
+		if (!event)
+			continue;
+
+		atomic_notifier_call_chain(&pseries_ioei_notifier_list,
+					   0, event);
+	}
+	return IRQ_HANDLED;
+}
+
+static int __init ioei_init(void)
+{
+	struct device_node *np;
+
+	ioei_check_exception_token = rtas_token("check-exception");
+	if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	np = of_find_node_by_path("/event-sources/ibm,io-events");
+	if (np) {
+		request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT");
+		pr_info("IBM I/O event interrupts enabled\n");
+		of_node_put(np);
+	} else {
+		return -ENODEV;
+	}
+	return 0;
+}
+machine_subsys_initcall(pseries, ioei_init);
+
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 3ee01b4f425..33b552ffbe5 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -28,26 +28,57 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/memblock.h>
 #include <linux/spinlock.h>
+#include <linux/sched.h>	/* for show_stack */
 #include <linux/string.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/crash_dump.h>
+#include <linux/memory.h>
+#include <linux/of.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/iommu.h>
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
-#include <asm/abs_addr.h>
-#include <asm/pSeries_reconfig.h>
 #include <asm/firmware.h>
 #include <asm/tce.h>
 #include <asm/ppc-pci.h>
 #include <asm/udbg.h>
+#include <asm/mmzone.h>
+#include <asm/plpar_wrappers.h>
 
-#include "plpar_wrappers.h"
 
+static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
+				      __be64 *startp, __be64 *endp)
+{
+	u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
+	unsigned long start, end, inc;
+
+	start = __pa(startp);
+	end = __pa(endp);
+	inc = L1_CACHE_BYTES; /* invalidate a cacheline of TCEs at a time */
+
+	/* If this is non-zero, change the format.  We shift the
+	 * address and or in the magic from the device tree. */
+	if (tbl->it_busno) {
+		start <<= 12;
+		end <<= 12;
+		inc <<= 12;
+		start |= tbl->it_busno;
+		end |= tbl->it_busno;
+	}
+
+	end |= inc - 1; /* round up end to be different than start */
+
+	mb(); /* Make sure TCEs in memory are written */
+	while (start <= end) {
+		out_be64(invalidate, start);
+		start += inc;
+	}
+}
 
 static int tce_build_pSeries(struct iommu_table *tbl, long index,
 			      long npages, unsigned long uaddr,
@@ -55,7 +86,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 			      struct dma_attrs *attrs)
 {
 	u64 proto_tce;
-	u64 *tcep;
+	__be64 *tcep, *tces;
 	u64 rpn;
 
 	proto_tce = TCE_PCI_READ; // Read allowed
@@ -63,37 +94,43 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 	if (direction != DMA_TO_DEVICE)
 		proto_tce |= TCE_PCI_WRITE;
 
-	tcep = ((u64 *)tbl->it_base) + index;
+	tces = tcep = ((__be64 *)tbl->it_base) + index;
 
 	while (npages--) {
-		/* can't move this out since we might cross LMB boundary */
-		rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
-		*tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
+		/* can't move this out since we might cross MEMBLOCK boundary */
+		rpn = __pa(uaddr) >> TCE_SHIFT;
+		*tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
 
 		uaddr += TCE_PAGE_SIZE;
 		tcep++;
 	}
+
+	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
+		tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
 	return 0;
 }
 
 
 static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
 {
-	u64 *tcep;
+	__be64 *tcep, *tces;
 
-	tcep = ((u64 *)tbl->it_base) + index;
+	tces = tcep = ((__be64 *)tbl->it_base) + index;
 
 	while (npages--)
 		*(tcep++) = 0;
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
 }
 
 static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
 {
-	u64 *tcep;
+	__be64 *tcep;
 
-	tcep = ((u64 *)tbl->it_base) + index;
+	tcep = ((__be64 *)tbl->it_base) + index;
 
-	return *tcep;
+	return be64_to_cpu(*tcep);
 }
 
 static void tce_free_pSeriesLP(struct iommu_table*, long, long);
@@ -110,7 +147,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	int ret = 0;
 	long tcenum_start = tcenum, npages_start = npages;
 
-	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+	rpn = __pa(uaddr) >> TCE_SHIFT;
 	proto_tce = TCE_PCI_READ;
 	if (direction != DMA_TO_DEVICE)
 		proto_tce |= TCE_PCI_WRITE;
@@ -140,7 +177,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	return ret;
 }
 
-static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
+static DEFINE_PER_CPU(__be64 *, tce_page);
 
 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
@@ -149,33 +186,37 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 {
 	u64 rc = 0;
 	u64 proto_tce;
-	u64 *tcep;
+	__be64 *tcep;
 	u64 rpn;
 	long l, limit;
 	long tcenum_start = tcenum, npages_start = npages;
 	int ret = 0;
+	unsigned long flags;
 
 	if (npages == 1) {
 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 		                           direction, attrs);
 	}
 
+	local_irq_save(flags);	/* to protect tcep and the page behind it */
+
 	tcep = __get_cpu_var(tce_page);
 
 	/* This is safe to do since interrupts are off when we're called
 	 * from iommu_alloc{,_sg}()
 	 */
 	if (!tcep) {
-		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
+		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
 		/* If allocation fails, fall back to the loop implementation */
 		if (!tcep) {
+			local_irq_restore(flags);
 			return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 					    direction, attrs);
 		}
 		__get_cpu_var(tce_page) = tcep;
 	}
 
-	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+	rpn = __pa(uaddr) >> TCE_SHIFT;
 	proto_tce = TCE_PCI_READ;
 	if (direction != DMA_TO_DEVICE)
 		proto_tce |= TCE_PCI_WRITE;
@@ -189,19 +230,21 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
 
 		for (l = 0; l < limit; l++) {
-			tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
+			tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
 			rpn++;
 		}
 
 		rc = plpar_tce_put_indirect((u64)tbl->it_index,
 					    (u64)tcenum << 12,
-					    (u64)virt_to_abs(tcep),
+					    (u64)__pa(tcep),
 					    limit);
 
 		npages -= limit;
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
+	local_irq_restore(flags);
+
 	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 		ret = (int)rc;
 		tce_freemulti_pSeriesLP(tbl, tcenum_start,
@@ -270,13 +313,161 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
 	return tce_ret;
 }
 
+/* this is compatible with cells for the device tree property */
+struct dynamic_dma_window_prop {
+	__be32	liobn;		/* tce table number */
+	__be64	dma_base;	/* address hi,lo */
+	__be32	tce_shift;	/* ilog2(tce_page_size) */
+	__be32	window_shift;	/* ilog2(tce_window_size) */
+};
+
+struct direct_window {
+	struct device_node *device;
+	const struct dynamic_dma_window_prop *prop;
+	struct list_head list;
+};
+
+/* Dynamic DMA Window support */
+struct ddw_query_response {
+	__be32 windows_available;
+	__be32 largest_available_block;
+	__be32 page_size;
+	__be32 migration_capable;
+};
+
+struct ddw_create_response {
+	__be32 liobn;
+	__be32 addr_hi;
+	__be32 addr_lo;
+};
+
+static LIST_HEAD(direct_window_list);
+/* prevents races between memory on/offline and window creation */
+static DEFINE_SPINLOCK(direct_window_list_lock);
+/* protects initializing window twice for same device */
+static DEFINE_MUTEX(direct_window_init_mutex);
+#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+
+static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
+					unsigned long num_pfn, const void *arg)
+{
+	const struct dynamic_dma_window_prop *maprange = arg;
+	int rc;
+	u64 tce_size, num_tce, dma_offset, next;
+	u32 tce_shift;
+	long limit;
+
+	tce_shift = be32_to_cpu(maprange->tce_shift);
+	tce_size = 1ULL << tce_shift;
+	next = start_pfn << PAGE_SHIFT;
+	num_tce = num_pfn << PAGE_SHIFT;
+
+	/* round back to the beginning of the tce page size */
+	num_tce += next & (tce_size - 1);
+	next &= ~(tce_size - 1);
+
+	/* covert to number of tces */
+	num_tce |= tce_size - 1;
+	num_tce >>= tce_shift;
+
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, num_tce, 512);
+		dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
+					     dma_offset,
+					     0, limit);
+		next += limit * tce_size;
+		num_tce -= limit;
+	} while (num_tce > 0 && !rc);
+
+	return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
+					unsigned long num_pfn, const void *arg)
+{
+	const struct dynamic_dma_window_prop *maprange = arg;
+	u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
+	__be64 *tcep;
+	u32 tce_shift;
+	u64 rc = 0;
+	long l, limit;
+
+	local_irq_disable();	/* to protect tcep and the page behind it */
+	tcep = __get_cpu_var(tce_page);
+
+	if (!tcep) {
+		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
+		if (!tcep) {
+			local_irq_enable();
+			return -ENOMEM;
+		}
+		__get_cpu_var(tce_page) = tcep;
+	}
+
+	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
+
+	liobn = (u64)be32_to_cpu(maprange->liobn);
+	tce_shift = be32_to_cpu(maprange->tce_shift);
+	tce_size = 1ULL << tce_shift;
+	next = start_pfn << PAGE_SHIFT;
+	num_tce = num_pfn << PAGE_SHIFT;
+
+	/* round back to the beginning of the tce page size */
+	num_tce += next & (tce_size - 1);
+	next &= ~(tce_size - 1);
+
+	/* covert to number of tces */
+	num_tce |= tce_size - 1;
+	num_tce >>= tce_shift;
+
+	/* We can map max one pageful of TCEs at a time */
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
+		dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+		for (l = 0; l < limit; l++) {
+			tcep[l] = cpu_to_be64(proto_tce | next);
+			next += tce_size;
+		}
+
+		rc = plpar_tce_put_indirect(liobn,
+					    dma_offset,
+					    (u64)__pa(tcep),
+					    limit);
+
+		num_tce -= limit;
+	} while (num_tce > 0 && !rc);
+
+	/* error cleanup: caller will clear whole range */
+
+	local_irq_enable();
+	return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
+		unsigned long num_pfn, void *arg)
+{
+	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
+}
+
+
 #ifdef CONFIG_PCI
 static void iommu_table_setparms(struct pci_controller *phb,
 				 struct device_node *dn,
 				 struct iommu_table *tbl)
 {
 	struct device_node *node;
-	const unsigned long *basep;
+	const unsigned long *basep, *sw_inval;
 	const u32 *sizep;
 
 	node = phb->dn;
@@ -295,9 +486,10 @@ static void iommu_table_setparms(struct pci_controller *phb,
 		memset((void *)tbl->it_base, 0, *sizep);
 
 	tbl->it_busno = phb->bus->number;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
 
 	/* Units of tce entries */
-	tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
 
 	/* Test if we are going over 2GB of DMA space */
 	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -308,11 +500,27 @@ static void iommu_table_setparms(struct pci_controller *phb,
 	phb->dma_window_base_cur += phb->dma_window_size;
 
 	/* Set the tce table size - measured in entries */
-	tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
+	tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
 
 	tbl->it_index = 0;
 	tbl->it_blocksize = 16;
 	tbl->it_type = TCE_PCI;
+
+	sw_inval = of_get_property(node, "linux,tce-sw-invalidate-info", NULL);
+	if (sw_inval) {
+		/*
+		 * This property contains information on how to
+		 * invalidate the TCE entry.  The first property is
+		 * the base MMIO address used to invalidate entries.
+		 * The second property tells us the format of the TCE
+		 * invalidate (whether it needs to be shifted) and
+		 * some magic routing info to add to our invalidate
+		 * command.
+		 */
+		tbl->it_index = (unsigned long) ioremap(sw_inval[0], 8);
+		tbl->it_busno = sw_inval[1]; /* overload this with magic */
+		tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
+	}
 }
 
 /*
@@ -323,19 +531,19 @@ static void iommu_table_setparms(struct pci_controller *phb,
 static void iommu_table_setparms_lpar(struct pci_controller *phb,
 				      struct device_node *dn,
 				      struct iommu_table *tbl,
-				      const void *dma_window,
-				      int bussubno)
+				      const __be32 *dma_window)
 {
 	unsigned long offset, size;
 
-	tbl->it_busno  = bussubno;
 	of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
 
+	tbl->it_busno = phb->bus->number;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
 	tbl->it_base   = 0;
 	tbl->it_blocksize  = 16;
 	tbl->it_type = TCE_PCI;
-	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
-	tbl->it_size = size >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = offset >> tbl->it_page_shift;
+	tbl->it_size = size >> tbl->it_page_shift;
 }
 
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
@@ -388,7 +596,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
 		while (pci->phb->dma_window_size * children > 0x80000000ul)
 			pci->phb->dma_window_size >>= 1;
-		pr_debug("No ISA/IDE, window size is 0x%lx\n",
+		pr_debug("No ISA/IDE, window size is 0x%llx\n",
 			 pci->phb->dma_window_size);
 		pci->phb->dma_window_base_cur = 0;
 
@@ -403,18 +611,19 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pci->phb->dma_window_size = 0x8000000ul;
 	pci->phb->dma_window_base_cur = 0x8000000ul;
 
-	tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 			   pci->phb->node);
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+	iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
 	while (pci->phb->dma_window_size * children > 0x70000000ul)
 		pci->phb->dma_window_size >>= 1;
 
-	pr_debug("ISA/IDE, window size is 0x%lx\n", pci->phb->dma_window_size);
+	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
 
@@ -423,7 +632,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	struct iommu_table *tbl;
 	struct device_node *dn, *pdn;
 	struct pci_dn *ppci;
-	const void *dma_window = NULL;
+	const __be32 *dma_window = NULL;
 
 	dn = pci_bus_to_OF_node(bus);
 
@@ -448,16 +657,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		 pdn->full_name, ppci->iommu_table);
 
 	if (!ppci->iommu_table) {
-		tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   ppci->phb->node);
-		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window,
-			bus->number);
+		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
+		iommu_register_group(tbl, pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->iommu_table);
 	}
-
-	if (pdn != dn)
-		PCI_DN(dn)->iommu_table = ppci->iommu_table;
 }
 
 
@@ -468,7 +674,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 
 	pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));
 
-	dn = dev->dev.archdata.of_node;
+	dn = dev->dev.of_node;
 
 	/* If we're the direct child of a root bus, then we need to allocate
 	 * an iommu table ourselves. The bus setup code should have setup
@@ -478,11 +684,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		struct pci_controller *phb = PCI_DN(dn)->phb;
 
 		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
-		tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   phb->node);
 		iommu_table_setparms(phb, dn, tbl);
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
-		dev->dev.archdata.dma_data = PCI_DN(dn)->iommu_table;
+		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
+		set_iommu_table_base_and_group(&dev->dev,
+					       PCI_DN(dn)->iommu_table);
 		return;
 	}
 
@@ -494,23 +702,380 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		dn = dn->parent;
 
 	if (dn && PCI_DN(dn))
-		dev->dev.archdata.dma_data = PCI_DN(dn)->iommu_table;
+		set_iommu_table_base_and_group(&dev->dev,
+					       PCI_DN(dn)->iommu_table);
 	else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 		       pci_name(dev));
 }
 
+static int __read_mostly disable_ddw;
+
+static int __init disable_ddw_setup(char *str)
+{
+	disable_ddw = 1;
+	printk(KERN_INFO "ppc iommu: disabling ddw.\n");
+
+	return 0;
+}
+
+early_param("disable_ddw", disable_ddw_setup);
+
+static void remove_ddw(struct device_node *np)
+{
+	struct dynamic_dma_window_prop *dwp;
+	struct property *win64;
+	const u32 *ddw_avail;
+	u64 liobn;
+	int len, ret;
+
+	ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);
+	win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
+	if (!win64)
+		return;
+
+	if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp))
+		goto delprop;
+
+	dwp = win64->value;
+	liobn = (u64)be32_to_cpu(dwp->liobn);
+
+	/* clear the whole window, note the arg is in kernel pages */
+	ret = tce_clearrange_multi_pSeriesLP(0,
+		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
+	if (ret)
+		pr_warning("%s failed to clear tces in window.\n",
+			 np->full_name);
+	else
+		pr_debug("%s successfully cleared tces in window.\n",
+			 np->full_name);
+
+	ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+	if (ret)
+		pr_warning("%s: failed to remove direct window: rtas returned "
+			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
+			np->full_name, ret, ddw_avail[2], liobn);
+	else
+		pr_debug("%s: successfully removed direct window: rtas returned "
+			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
+			np->full_name, ret, ddw_avail[2], liobn);
+
+delprop:
+	ret = of_remove_property(np, win64);
+	if (ret)
+		pr_warning("%s: failed to remove direct window property: %d\n",
+			np->full_name, ret);
+}
+
+static u64 find_existing_ddw(struct device_node *pdn)
+{
+	struct direct_window *window;
+	const struct dynamic_dma_window_prop *direct64;
+	u64 dma_addr = 0;
+
+	spin_lock(&direct_window_list_lock);
+	/* check if we already created a window and dupe that config if so */
+	list_for_each_entry(window, &direct_window_list, list) {
+		if (window->device == pdn) {
+			direct64 = window->prop;
+			dma_addr = be64_to_cpu(direct64->dma_base);
+			break;
+		}
+	}
+	spin_unlock(&direct_window_list_lock);
+
+	return dma_addr;
+}
+
+static int find_existing_ddw_windows(void)
+{
+	int len;
+	struct device_node *pdn;
+	struct direct_window *window;
+	const struct dynamic_dma_window_prop *direct64;
+
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		return 0;
+
+	for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
+		direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+		if (!direct64)
+			continue;
+
+		window = kzalloc(sizeof(*window), GFP_KERNEL);
+		if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
+			kfree(window);
+			remove_ddw(pdn);
+			continue;
+		}
+
+		window->device = pdn;
+		window->prop = direct64;
+		spin_lock(&direct_window_list_lock);
+		list_add(&window->list, &direct_window_list);
+		spin_unlock(&direct_window_list_lock);
+	}
+
+	return 0;
+}
+machine_arch_initcall(pseries, find_existing_ddw_windows);
+
+static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
+			struct ddw_query_response *query)
+{
+	struct eeh_dev *edev;
+	u32 cfg_addr;
+	u64 buid;
+	int ret;
+
+	/*
+	 * Get the config address and phb buid of the PE window.
+	 * Rely on eeh to retrieve this for us.
+	 * Retrieve them from the pci device, not the node with the
+	 * dma-window property
+	 */
+	edev = pci_dev_to_eeh_dev(dev);
+	cfg_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		cfg_addr = edev->pe_config_addr;
+	buid = edev->phb->buid;
+
+	ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
+		  cfg_addr, BUID_HI(buid), BUID_LO(buid));
+	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
+		" returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
+		BUID_LO(buid), ret);
+	return ret;
+}
+
+static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
+			struct ddw_create_response *create, int page_shift,
+			int window_shift)
+{
+	struct eeh_dev *edev;
+	u32 cfg_addr;
+	u64 buid;
+	int ret;
+
+	/*
+	 * Get the config address and phb buid of the PE window.
+	 * Rely on eeh to retrieve this for us.
+	 * Retrieve them from the pci device, not the node with the
+	 * dma-window property
+	 */
+	edev = pci_dev_to_eeh_dev(dev);
+	cfg_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		cfg_addr = edev->pe_config_addr;
+	buid = edev->phb->buid;
+
+	do {
+		/* extra outputs are LIOBN and dma-addr (hi, lo) */
+		ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr,
+				BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);
+	} while (rtas_busy_delay(ret));
+	dev_info(&dev->dev,
+		"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
+		"(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
+		 cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
+		 window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+
+	return ret;
+}
+
+struct failed_ddw_pdn {
+	struct device_node *pdn;
+	struct list_head list;
+};
+
+static LIST_HEAD(failed_ddw_pdn_list);
+
+/*
+ * If the PE supports dynamic dma windows, and there is space for a table
+ * that can map all pages in a linear offset, then setup such a table,
+ * and record the dma-offset in the struct device.
+ *
+ * dev: the pci device we are checking
+ * pdn: the parent pe node with the ibm,dma_window property
+ * Future: also check if we can remap the base window for our base page size
+ *
+ * returns the dma offset for use by dma_set_mask
+ */
+static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+{
+	int len, ret;
+	struct ddw_query_response query;
+	struct ddw_create_response create;
+	int page_shift;
+	u64 dma_addr, max_addr;
+	struct device_node *dn;
+	const u32 *uninitialized_var(ddw_avail);
+	struct direct_window *window;
+	struct property *win64;
+	struct dynamic_dma_window_prop *ddwprop;
+	struct failed_ddw_pdn *fpdn;
+
+	mutex_lock(&direct_window_init_mutex);
+
+	dma_addr = find_existing_ddw(pdn);
+	if (dma_addr != 0)
+		goto out_unlock;
+
+	/*
+	 * If we already went through this for a previous function of
+	 * the same device and failed, we don't want to muck with the
+	 * DMA window again, as it will race with in-flight operations
+	 * and can lead to EEHs. The above mutex protects access to the
+	 * list.
+	 */
+	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
+		if (!strcmp(fpdn->pdn->full_name, pdn->full_name))
+			goto out_unlock;
+	}
+
+	/*
+	 * the ibm,ddw-applicable property holds the tokens for:
+	 * ibm,query-pe-dma-window
+	 * ibm,create-pe-dma-window
+	 * ibm,remove-pe-dma-window
+	 * for the given node in that order.
+	 * the property is actually in the parent, not the PE
+	 */
+	ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
+	if (!ddw_avail || len < 3 * sizeof(u32))
+		goto out_failed;
+
+       /*
+	 * Query if there is a second window of size to map the
+	 * whole partition.  Query returns number of windows, largest
+	 * block assigned to PE (partition endpoint), and two bitmasks
+	 * of page sizes: supported and supported for migrate-dma.
+	 */
+	dn = pci_device_to_OF_node(dev);
+	ret = query_ddw(dev, ddw_avail, &query);
+	if (ret != 0)
+		goto out_failed;
+
+	if (query.windows_available == 0) {
+		/*
+		 * no additional windows are available for this device.
+		 * We might be able to reallocate the existing window,
+		 * trading in for a larger page size.
+		 */
+		dev_dbg(&dev->dev, "no free dynamic windows");
+		goto out_failed;
+	}
+	if (be32_to_cpu(query.page_size) & 4) {
+		page_shift = 24; /* 16MB */
+	} else if (be32_to_cpu(query.page_size) & 2) {
+		page_shift = 16; /* 64kB */
+	} else if (be32_to_cpu(query.page_size) & 1) {
+		page_shift = 12; /* 4kB */
+	} else {
+		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
+			  query.page_size);
+		goto out_failed;
+	}
+	/* verify the window * number of ptes will map the partition */
+	/* check largest block * page size > max memory hotplug addr */
+	max_addr = memory_hotplug_max();
+	if (be32_to_cpu(query.largest_available_block) < (max_addr >> page_shift)) {
+		dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
+			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
+			  1ULL << page_shift);
+		goto out_failed;
+	}
+	len = order_base_2(max_addr);
+	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
+	if (!win64) {
+		dev_info(&dev->dev,
+			"couldn't allocate property for 64bit dma window\n");
+		goto out_failed;
+	}
+	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
+	win64->length = sizeof(*ddwprop);
+	if (!win64->name || !win64->value) {
+		dev_info(&dev->dev,
+			"couldn't allocate property name and value\n");
+		goto out_free_prop;
+	}
+
+	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
+	if (ret != 0)
+		goto out_free_prop;
+
+	ddwprop->liobn = create.liobn;
+	ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
+	ddwprop->tce_shift = cpu_to_be32(page_shift);
+	ddwprop->window_shift = cpu_to_be32(len);
+
+	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
+		  create.liobn, dn->full_name);
+
+	window = kzalloc(sizeof(*window), GFP_KERNEL);
+	if (!window)
+		goto out_clear_window;
+
+	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+			win64->value, tce_setrange_multi_pSeriesLP_walk);
+	if (ret) {
+		dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
+			 dn->full_name, ret);
+		goto out_free_window;
+	}
+
+	ret = of_add_property(pdn, win64);
+	if (ret) {
+		dev_err(&dev->dev, "unable to add dma window property for %s: %d",
+			 pdn->full_name, ret);
+		goto out_free_window;
+	}
+
+	window->device = pdn;
+	window->prop = ddwprop;
+	spin_lock(&direct_window_list_lock);
+	list_add(&window->list, &direct_window_list);
+	spin_unlock(&direct_window_list_lock);
+
+	dma_addr = of_read_number(&create.addr_hi, 2);
+	goto out_unlock;
+
+out_free_window:
+	kfree(window);
+
+out_clear_window:
+	remove_ddw(pdn);
+
+out_free_prop:
+	kfree(win64->name);
+	kfree(win64->value);
+	kfree(win64);
+
+out_failed:
+
+	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
+	if (!fpdn)
+		goto out_unlock;
+	fpdn->pdn = pdn;
+	list_add(&fpdn->list, &failed_ddw_pdn_list);
+
+out_unlock:
+	mutex_unlock(&direct_window_init_mutex);
+	return dma_addr;
+}
+
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 {
 	struct device_node *pdn, *dn;
 	struct iommu_table *tbl;
-	const void *dma_window = NULL;
+	const __be32 *dma_window = NULL;
 	struct pci_dn *pci;
 
 	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
 
 	/* dev setup for LPAR is a little tricky, since the device tree might
-	 * contain the dma-window properties per-device and not neccesarily
+	 * contain the dma-window properties per-device and not necessarily
 	 * for the bus. So we need to search upwards in the tree until we
 	 * either hit a dma-window property, OR find a parent with a table
 	 * already allocated.
@@ -528,52 +1093,181 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	if (!pdn || !PCI_DN(pdn)) {
 		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
 		       "no DMA window found for pci dev=%s dn=%s\n",
-				 pci_name(dev), dn? dn->full_name : "<null>");
+				 pci_name(dev), of_node_full_name(dn));
 		return;
 	}
 	pr_debug("  parent is %s\n", pdn->full_name);
 
-	/* Check for parent == NULL so we don't try to setup the empty EADS
-	 * slots on POWER4 machines.
-	 */
-	if (dma_window == NULL || pdn->parent == NULL) {
-		pr_debug("  no dma window for device, linking to parent\n");
-		dev->dev.archdata.dma_data = PCI_DN(pdn)->iommu_table;
-		return;
-	}
-
 	pci = PCI_DN(pdn);
 	if (!pci->iommu_table) {
-		tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   pci->phb->node);
-		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window,
-			pci->phb->bus->number);
+		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->iommu_table);
 	} else {
 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
 	}
 
-	dev->dev.archdata.dma_data = pci->iommu_table;
+	set_iommu_table_base_and_group(&dev->dev, pci->iommu_table);
 }
+
+static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+{
+	bool ddw_enabled = false;
+	struct device_node *pdn, *dn;
+	struct pci_dev *pdev;
+	const __be32 *dma_window = NULL;
+	u64 dma_offset;
+
+	if (!dev->dma_mask)
+		return -EIO;
+
+	if (!dev_is_pci(dev))
+		goto check_mask;
+
+	pdev = to_pci_dev(dev);
+
+	/* only attempt to use a new window if 64-bit DMA is requested */
+	if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
+		dn = pci_device_to_OF_node(pdev);
+		dev_dbg(dev, "node is %s\n", dn->full_name);
+
+		/*
+		 * the device tree might contain the dma-window properties
+		 * per-device and not necessarily for the bus. So we need to
+		 * search upwards in the tree until we either hit a dma-window
+		 * property, OR find a parent with a table already allocated.
+		 */
+		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+				pdn = pdn->parent) {
+			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+			if (dma_window)
+				break;
+		}
+		if (pdn && PCI_DN(pdn)) {
+			dma_offset = enable_ddw(pdev, pdn);
+			if (dma_offset != 0) {
+				dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
+				set_dma_offset(dev, dma_offset);
+				set_dma_ops(dev, &dma_direct_ops);
+				ddw_enabled = true;
+			}
+		}
+	}
+
+	/* fall back on iommu ops, restore table pointer with ops */
+	if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {
+		dev_info(dev, "Restoring 32-bit DMA via iommu\n");
+		set_dma_ops(dev, &dma_iommu_ops);
+		pci_dma_dev_setup_pSeriesLP(pdev);
+	}
+
+check_mask:
+	if (!dma_supported(dev, dma_mask))
+		return -EIO;
+
+	*dev->dma_mask = dma_mask;
+	return 0;
+}
+
+static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
+{
+	if (!dev->dma_mask)
+		return 0;
+
+	if (!disable_ddw && dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct device_node *dn;
+
+		dn = pci_device_to_OF_node(pdev);
+
+		/* search upwards for ibm,dma-window */
+		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
+				dn = dn->parent)
+			if (of_get_property(dn, "ibm,dma-window", NULL))
+				break;
+		/* if there is a ibm,ddw-applicable property require 64 bits */
+		if (dn && PCI_DN(dn) &&
+				of_get_property(dn, "ibm,ddw-applicable", NULL))
+			return DMA_BIT_MASK(64);
+	}
+
+	return dma_iommu_ops.get_required_mask(dev);
+}
+
 #else  /* CONFIG_PCI */
 #define pci_dma_bus_setup_pSeries	NULL
 #define pci_dma_dev_setup_pSeries	NULL
 #define pci_dma_bus_setup_pSeriesLP	NULL
 #define pci_dma_dev_setup_pSeriesLP	NULL
+#define dma_set_mask_pSeriesLP		NULL
+#define dma_get_required_mask_pSeriesLP	NULL
 #endif /* !CONFIG_PCI */
 
+static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
+		void *data)
+{
+	struct direct_window *window;
+	struct memory_notify *arg = data;
+	int ret = 0;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		spin_lock(&direct_window_list_lock);
+		list_for_each_entry(window, &direct_window_list, list) {
+			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
+					arg->nr_pages, window->prop);
+			/* XXX log error */
+		}
+		spin_unlock(&direct_window_list_lock);
+		break;
+	case MEM_CANCEL_ONLINE:
+	case MEM_OFFLINE:
+		spin_lock(&direct_window_list_lock);
+		list_for_each_entry(window, &direct_window_list, list) {
+			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
+					arg->nr_pages, window->prop);
+			/* XXX log error */
+		}
+		spin_unlock(&direct_window_list_lock);
+		break;
+	default:
+		break;
+	}
+	if (ret && action != MEM_CANCEL_ONLINE)
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block iommu_mem_nb = {
+	.notifier_call = iommu_mem_notifier,
+};
+
 static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
 {
 	int err = NOTIFY_OK;
 	struct device_node *np = node;
 	struct pci_dn *pci = PCI_DN(np);
+	struct direct_window *window;
 
 	switch (action) {
-	case PSERIES_RECONFIG_REMOVE:
-		if (pci && pci->iommu_table &&
-		    of_get_property(np, "ibm,dma-window", NULL))
+	case OF_RECONFIG_DETACH_NODE:
+		remove_ddw(np);
+		if (pci && pci->iommu_table)
 			iommu_free_table(pci->iommu_table, np->full_name);
+
+		spin_lock(&direct_window_list_lock);
+		list_for_each_entry(window, &direct_window_list, list) {
+			if (window->device == np) {
+				list_del(&window->list);
+				kfree(window);
+				break;
+			}
+		}
+		spin_unlock(&direct_window_list_lock);
 		break;
 	default:
 		err = NOTIFY_DONE;
@@ -589,13 +1283,8 @@ static struct notifier_block iommu_reconfig_nb = {
 /* These are called very early. */
 void iommu_init_early_pSeries(void)
 {
-	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) {
-		/* Direct I/O, IOMMU off */
-		ppc_md.pci_dma_dev_setup = NULL;
-		ppc_md.pci_dma_bus_setup = NULL;
-		set_pci_dma_ops(&dma_direct_ops);
+	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
 		return;
-	}
 
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
 		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
@@ -608,6 +1297,8 @@ void iommu_init_early_pSeries(void)
 		ppc_md.tce_get   = tce_get_pSeriesLP;
 		ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
 		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
+		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
+		ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
 	} else {
 		ppc_md.tce_build = tce_build_pSeries;
 		ppc_md.tce_free  = tce_free_pSeries;
@@ -617,8 +1308,23 @@ void iommu_init_early_pSeries(void)
 	}
 
 
-	pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+	of_reconfig_notifier_register(&iommu_reconfig_nb);
+	register_memory_notifier(&iommu_mem_nb);
 
 	set_pci_dma_ops(&dma_iommu_ops);
 }
 
+static int __init disable_multitce(char *str)
+{
+	if (strcmp(str, "off") == 0 &&
+	    firmware_has_feature(FW_FEATURE_LPAR) &&
+	    firmware_has_feature(FW_FEATURE_MULTITCE)) {
+		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
+		ppc_md.tce_build = tce_build_pSeriesLP;
+		ppc_md.tce_free	 = tce_free_pSeriesLP;
+		powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
+	}
+	return 1;
+}
+
+__setup("multitce=", disable_multitce);
diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c
index 53cbd53d874..13fa95b3aa8 100644
--- a/arch/powerpc/platforms/pseries/kexec.c
+++ b/arch/powerpc/platforms/pseries/kexec.c
@@ -7,35 +7,48 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+
 #include <asm/machdep.h>
 #include <asm/page.h>
 #include <asm/firmware.h>
 #include <asm/kexec.h>
 #include <asm/mpic.h>
+#include <asm/xics.h>
 #include <asm/smp.h>
+#include <asm/plpar_wrappers.h>
 
 #include "pseries.h"
-#include "xics.h"
-#include "plpar_wrappers.h"
 
 static void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
 {
 	/* Don't risk a hypervisor call if we're crashing */
 	if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
-		unsigned long addr;
+		int ret;
+		int cpu = smp_processor_id();
+		int hwcpu = hard_smp_processor_id();
+
+		if (get_lppaca()->dtl_enable_mask) {
+			ret = unregister_dtl(hwcpu);
+			if (ret) {
+				pr_err("WARNING: DTL deregistration for cpu "
+				       "%d (hw %d) failed with %d\n",
+				       cpu, hwcpu, ret);
+			}
+		}
 
-		addr = __pa(get_slb_shadow());
-		if (unregister_slb_shadow(hard_smp_processor_id(), addr))
-			printk("SLB shadow buffer deregistration of "
-			       "cpu %u (hw_cpu_id %d) failed\n",
-			       smp_processor_id(),
-			       hard_smp_processor_id());
+		ret = unregister_slb_shadow(hwcpu);
+		if (ret) {
+			pr_err("WARNING: SLB shadow buffer deregistration "
+			       "for cpu %d (hw %d) failed with %d\n",
+			       cpu, hwcpu, ret);
+		}
 
-		addr = __pa(get_lppaca());
-		if (unregister_vpa(hard_smp_processor_id(), addr)) {
-			printk("VPA deregistration of cpu %u (hw_cpu_id %d) "
-					"failed\n", smp_processor_id(),
-					hard_smp_processor_id());
+		ret = unregister_vpa(hwcpu);
+		if (ret) {
+			pr_err("WARNING: VPA deregistration for cpu %d "
+			       "(hw %d) failed with %d\n", cpu, hwcpu, ret);
 		}
 	}
 }
@@ -61,13 +74,3 @@ void __init setup_kexec_cpu_down_xics(void)
 {
 	ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
 }
-
-static int __init pseries_kexec_setup(void)
-{
-	ppc_md.machine_kexec = default_machine_kexec;
-	ppc_md.machine_kexec_prepare = default_machine_kexec_prepare;
-	ppc_md.machine_crash_shutdown = default_machine_crash_shutdown;
-
-	return 0;
-}
-machine_device_initcall(pseries, pseries_kexec_setup);
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 52a80e5840e..b02af9ef3ff 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -25,12 +25,12 @@
 #include <linux/kernel.h>
 #include <linux/dma-mapping.h>
 #include <linux/console.h>
+#include <linux/export.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/machdep.h>
-#include <asm/abs_addr.h>
 #include <asm/mmu_context.h>
 #include <asm/iommu.h>
 #include <asm/tlbflush.h>
@@ -39,10 +39,19 @@
 #include <asm/cputable.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
+#include <asm/trace.h>
+#include <asm/firmware.h>
+#include <asm/plpar_wrappers.h>
 
-#include "plpar_wrappers.h"
 #include "pseries.h"
 
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST	0x4000000000000000UL
+#define HBR_RESPONSE	0x8000000000000000UL
+#define HBR_END		0xc000000000000000UL
+#define HBR_AVPN	0x0200000000000000UL
+#define HBR_ANDCOND	0x0100000000000000UL
+
 
 /* in hvCall.S */
 EXPORT_SYMBOL(plpar_hcall);
@@ -51,234 +60,72 @@ EXPORT_SYMBOL(plpar_hcall_norets);
 
 extern void pSeries_find_serial_port(void);
 
-
-static int vtermno;	/* virtual terminal# for udbg  */
-
-#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
-static void udbg_hvsi_putc(char c)
-{
-	/* packet's seqno isn't used anyways */
-	uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c };
-	int rc;
-
-	if (c == '\n')
-		udbg_hvsi_putc('\r');
-
-	do {
-		rc = plpar_put_term_char(vtermno, sizeof(packet), packet);
-	} while (rc == H_BUSY);
-}
-
-static long hvsi_udbg_buf_len;
-static uint8_t hvsi_udbg_buf[256];
-
-static int udbg_hvsi_getc_poll(void)
-{
-	unsigned char ch;
-	int rc, i;
-
-	if (hvsi_udbg_buf_len == 0) {
-		rc = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len, hvsi_udbg_buf);
-		if (rc != H_SUCCESS || hvsi_udbg_buf[0] != 0xff) {
-			/* bad read or non-data packet */
-			hvsi_udbg_buf_len = 0;
-		} else {
-			/* remove the packet header */
-			for (i = 4; i < hvsi_udbg_buf_len; i++)
-				hvsi_udbg_buf[i-4] = hvsi_udbg_buf[i];
-			hvsi_udbg_buf_len -= 4;
-		}
-	}
-
-	if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) {
-		/* no data ready */
-		hvsi_udbg_buf_len = 0;
-		return -1;
-	}
-
-	ch = hvsi_udbg_buf[0];
-	/* shift remaining data down */
-	for (i = 1; i < hvsi_udbg_buf_len; i++) {
-		hvsi_udbg_buf[i-1] = hvsi_udbg_buf[i];
-	}
-	hvsi_udbg_buf_len--;
-
-	return ch;
-}
-
-static int udbg_hvsi_getc(void)
-{
-	int ch;
-	for (;;) {
-		ch = udbg_hvsi_getc_poll();
-		if (ch == -1) {
-			/* This shouldn't be needed...but... */
-			volatile unsigned long delay;
-			for (delay=0; delay < 2000000; delay++)
-				;
-		} else {
-			return ch;
-		}
-	}
-}
-
-static void udbg_putcLP(char c)
-{
-	char buf[16];
-	unsigned long rc;
-
-	if (c == '\n')
-		udbg_putcLP('\r');
-
-	buf[0] = c;
-	do {
-		rc = plpar_put_term_char(vtermno, 1, buf);
-	} while(rc == H_BUSY);
-}
-
-/* Buffered chars getc */
-static long inbuflen;
-static long inbuf[2];	/* must be 2 longs */
-
-static int udbg_getc_pollLP(void)
-{
-	/* The interface is tricky because it may return up to 16 chars.
-	 * We save them statically for future calls to udbg_getc().
-	 */
-	char ch, *buf = (char *)inbuf;
-	int i;
-	long rc;
-	if (inbuflen == 0) {
-		/* get some more chars. */
-		inbuflen = 0;
-		rc = plpar_get_term_char(vtermno, &inbuflen, buf);
-		if (rc != H_SUCCESS)
-			inbuflen = 0;	/* otherwise inbuflen is garbage */
-	}
-	if (inbuflen <= 0 || inbuflen > 16) {
-		/* Catch error case as well as other oddities (corruption) */
-		inbuflen = 0;
-		return -1;
-	}
-	ch = buf[0];
-	for (i = 1; i < inbuflen; i++)	/* shuffle them down. */
-		buf[i-1] = buf[i];
-	inbuflen--;
-	return ch;
-}
-
-static int udbg_getcLP(void)
-{
-	int ch;
-	for (;;) {
-		ch = udbg_getc_pollLP();
-		if (ch == -1) {
-			/* This shouldn't be needed...but... */
-			volatile unsigned long delay;
-			for (delay=0; delay < 2000000; delay++)
-				;
-		} else {
-			return ch;
-		}
-	}
-}
-
-/* call this from early_init() for a working debug console on
- * vterm capable LPAR machines
- */
-void __init udbg_init_debug_lpar(void)
-{
-	vtermno = 0;
-	udbg_putc = udbg_putcLP;
-	udbg_getc = udbg_getcLP;
-	udbg_getc_poll = udbg_getc_pollLP;
-
-	register_early_udbg_console();
-}
-
-/* returns 0 if couldn't find or use /chosen/stdout as console */
-void __init find_udbg_vterm(void)
-{
-	struct device_node *stdout_node;
-	const u32 *termno;
-	const char *name;
-
-	/* find the boot console from /chosen/stdout */
-	if (!of_chosen)
-		return;
-	name = of_get_property(of_chosen, "linux,stdout-path", NULL);
-	if (name == NULL)
-		return;
-	stdout_node = of_find_node_by_path(name);
-	if (!stdout_node)
-		return;
-	name = of_get_property(stdout_node, "name", NULL);
-	if (!name) {
-		printk(KERN_WARNING "stdout node missing 'name' property!\n");
-		goto out;
-	}
-
-	/* Check if it's a virtual terminal */
-	if (strncmp(name, "vty", 3) != 0)
-		goto out;
-	termno = of_get_property(stdout_node, "reg", NULL);
-	if (termno == NULL)
-		goto out;
-	vtermno = termno[0];
-
-	if (of_device_is_compatible(stdout_node, "hvterm1")) {
-		udbg_putc = udbg_putcLP;
-		udbg_getc = udbg_getcLP;
-		udbg_getc_poll = udbg_getc_pollLP;
-		add_preferred_console("hvc", termno[0] & 0xff, NULL);
-	} else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) {
-		vtermno = termno[0];
-		udbg_putc = udbg_hvsi_putc;
-		udbg_getc = udbg_hvsi_getc;
-		udbg_getc_poll = udbg_hvsi_getc_poll;
-		add_preferred_console("hvsi", termno[0] & 0xff, NULL);
-	}
-out:
-	of_node_put(stdout_node);
-}
-
 void vpa_init(int cpu)
 {
 	int hwcpu = get_hard_smp_processor_id(cpu);
 	unsigned long addr;
 	long ret;
+	struct paca_struct *pp;
+	struct dtl_entry *dtl;
+
+	/*
+	 * The spec says it "may be problematic" if CPU x registers the VPA of
+	 * CPU y. We should never do that, but wail if we ever do.
+	 */
+	WARN_ON(cpu != smp_processor_id());
 
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-		lppaca[cpu].vmxregs_in_use = 1;
+		lppaca_of(cpu).vmxregs_in_use = 1;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		lppaca_of(cpu).ebb_regs_in_use = 1;
 
-	addr = __pa(&lppaca[cpu]);
+	addr = __pa(&lppaca_of(cpu));
 	ret = register_vpa(hwcpu, addr);
 
 	if (ret) {
-		printk(KERN_ERR "WARNING: vpa_init: VPA registration for "
-				"cpu %d (hw %d) of area %lx returns %ld\n",
-				cpu, hwcpu, addr, ret);
+		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
+		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
 		return;
 	}
 	/*
 	 * PAPR says this feature is SLB-Buffer but firmware never
 	 * reports that.  All SPLPAR support SLB shadow buffer.
 	 */
-	addr = __pa(&slb_shadow[cpu]);
+	addr = __pa(paca[cpu].slb_shadow_ptr);
 	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		ret = register_slb_shadow(hwcpu, addr);
 		if (ret)
-			printk(KERN_ERR
-			       "WARNING: vpa_init: SLB shadow buffer "
-			       "registration for cpu %d (hw %d) of area %lx "
-			       "returns %ld\n", cpu, hwcpu, addr, ret);
+			pr_err("WARNING: SLB shadow buffer registration for "
+			       "cpu %d (hw %d) of area %lx failed with %ld\n",
+			       cpu, hwcpu, addr, ret);
+	}
+
+	/*
+	 * Register dispatch trace log, if one has been allocated.
+	 */
+	pp = &paca[cpu];
+	dtl = pp->dispatch_log;
+	if (dtl) {
+		pp->dtl_ridx = 0;
+		pp->dtl_curr = dtl;
+		lppaca_of(cpu).dtl_idx = 0;
+
+		/* hypervisor reads buffer length from this field */
+		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
+		ret = register_dtl(hwcpu, __pa(dtl));
+		if (ret)
+			pr_err("WARNING: DTL registration of cpu %d (hw %d) "
+			       "failed with %ld\n", smp_processor_id(),
+			       hwcpu, ret);
+		lppaca_of(cpu).dtl_enable_mask = 2;
 	}
 }
 
 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
- 			      unsigned long va, unsigned long pa,
- 			      unsigned long rflags, unsigned long vflags,
-			      int psize, int ssize)
+				     unsigned long vpn, unsigned long pa,
+				     unsigned long rflags, unsigned long vflags,
+				     int psize, int apsize, int ssize)
 {
 	unsigned long lpar_rc;
 	unsigned long flags;
@@ -286,15 +133,15 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	unsigned long hpte_v, hpte_r;
 
 	if (!(vflags & HPTE_V_BOLTED))
-		pr_debug("hpte_insert(group=%lx, va=%016lx, pa=%016lx, "
-			 "rflags=%lx, vflags=%lx, psize=%d)\n",
-			 hpte_group, va, pa, rflags, vflags, psize);
+		pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
+			 "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
+			 hpte_group, vpn,  pa, rflags, vflags, psize);
 
-	hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize) | rflags;
+	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED))
-		pr_debug(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
+		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
 
 	/* Now fill in the actual HPTE */
 	/* Set CEC cookie to 0         */
@@ -305,13 +152,16 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	flags = 0;
 
 	/* Make pHyp happy */
-	if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU))
-		hpte_r &= ~_PAGE_COHERENT;
+	if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
+		hpte_r &= ~HPTE_R_M;
+
+	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
+		flags |= H_COALESCE_CAND;
 
 	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
 	if (unlikely(lpar_rc == H_PTEG_FULL)) {
 		if (!(vflags & HPTE_V_BOLTED))
-			pr_debug(" full\n");
+			pr_devel(" full\n");
 		return -1;
 	}
 
@@ -322,11 +172,11 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	 */
 	if (unlikely(lpar_rc != H_SUCCESS)) {
 		if (!(vflags & HPTE_V_BOLTED))
-			pr_debug(" lpar err %lu\n", lpar_rc);
+			pr_devel(" lpar err %ld\n", lpar_rc);
 		return -2;
 	}
 	if (!(vflags & HPTE_V_BOLTED))
-		pr_debug(" -> slot: %lu\n", slot & 7);
+		pr_devel(" -> slot: %lu\n", slot & 7);
 
 	/* Because of iSeries, we have to pass down the secondary
 	 * bucket bit here as well
@@ -353,7 +203,13 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
 					   (0x1UL << 4), &dummy1, &dummy2);
 		if (lpar_rc == H_SUCCESS)
 			return i;
-		BUG_ON(lpar_rc != H_NOT_FOUND);
+
+		/*
+		 * The test for adjunct partition is performed before the
+		 * ANDCOND test.  H_RESOURCE may be returned, so we need to
+		 * check for that as well.
+		 */
+		BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
 
 		slot_offset++;
 		slot_offset &= 0x7;
@@ -366,39 +222,47 @@ static void pSeries_lpar_hptab_clear(void)
 {
 	unsigned long size_bytes = 1UL << ppc64_pft_size;
 	unsigned long hpte_count = size_bytes >> 4;
-	unsigned long dummy1, dummy2, dword0;
+	struct {
+		unsigned long pteh;
+		unsigned long ptel;
+	} ptes[4];
 	long lpar_rc;
-	int i;
-
-	/* TODO: Use bulk call */
-	for (i = 0; i < hpte_count; i++) {
-		/* dont remove HPTEs with VRMA mappings */
-		lpar_rc = plpar_pte_remove_raw(H_ANDCOND, i, HPTE_V_1TB_SEG,
-						&dummy1, &dummy2);
-		if (lpar_rc == H_NOT_FOUND) {
-			lpar_rc = plpar_pte_read_raw(0, i, &dword0, &dummy1);
-			if (!lpar_rc && ((dword0 & HPTE_V_VRMA_MASK)
-				!= HPTE_V_VRMA_MASK))
-				/* Can be hpte for 1TB Seg. So remove it */
-				plpar_pte_remove_raw(0, i, 0, &dummy1, &dummy2);
+	unsigned long i, j;
+
+	/* Read in batches of 4,
+	 * invalidate only valid entries not in the VRMA
+	 * hpte_count will be a multiple of 4
+         */
+	for (i = 0; i < hpte_count; i += 4) {
+		lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
+		if (lpar_rc != H_SUCCESS)
+			continue;
+		for (j = 0; j < 4; j++){
+			if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
+				HPTE_V_VRMA_MASK)
+				continue;
+			if (ptes[j].pteh & HPTE_V_VALID)
+				plpar_pte_remove_raw(0, i + j, 0,
+					&(ptes[j].pteh), &(ptes[j].ptel));
 		}
 	}
-}
-
-/*
- * This computes the AVPN and B fields of the first dword of a HPTE,
- * for use when we want to match an existing PTE.  The bottom 7 bits
- * of the returned value are zero.
- */
-static inline unsigned long hpte_encode_avpn(unsigned long va, int psize,
-					     int ssize)
-{
-	unsigned long v;
 
-	v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm);
-	v <<= HPTE_V_AVPN_SHIFT;
-	v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
-	return v;
+#ifdef __LITTLE_ENDIAN__
+	/* Reset exceptions to big endian */
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		long rc;
+
+		rc = pseries_big_endian_exceptions();
+		/*
+		 * At this point it is unlikely panic() will get anything
+		 * out to the user, but at least this will stop us from
+		 * continuing on further and creating an even more
+		 * difficult to debug situation.
+		 */
+		if (rc)
+			panic("Could not enable big endian exceptions");
+	}
+#endif
 }
 
 /*
@@ -409,26 +273,27 @@ static inline unsigned long hpte_encode_avpn(unsigned long va, int psize,
  */
 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
 				       unsigned long newpp,
-				       unsigned long va,
-				       int psize, int ssize, int local)
+				       unsigned long vpn,
+				       int psize, int apsize,
+				       int ssize, int local)
 {
 	unsigned long lpar_rc;
 	unsigned long flags = (newpp & 7) | H_AVPN;
 	unsigned long want_v;
 
-	want_v = hpte_encode_avpn(va, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
-	pr_debug("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
+	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
 		 want_v, slot, flags, psize);
 
 	lpar_rc = plpar_pte_protect(flags, slot, want_v);
 
 	if (lpar_rc == H_NOT_FOUND) {
-		pr_debug("not found !\n");
+		pr_devel("not found !\n");
 		return -1;
 	}
 
-	pr_debug("ok\n");
+	pr_devel("ok\n");
 
 	BUG_ON(lpar_rc != H_SUCCESS);
 
@@ -454,15 +319,15 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
 	return dword0;
 }
 
-static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize)
+static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
 {
 	unsigned long hash;
 	unsigned long i;
 	long slot;
 	unsigned long want_v, hpte_v;
 
-	hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
-	want_v = hpte_encode_avpn(va, psize, ssize);
+	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
 	/* Bolted entries are always in the primary group */
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -482,12 +347,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 					     unsigned long ea,
 					     int psize, int ssize)
 {
-	unsigned long lpar_rc, slot, vsid, va, flags;
+	unsigned long vpn;
+	unsigned long lpar_rc, slot, vsid, flags;
 
 	vsid = get_kernel_vsid(ea, ssize);
-	va = hpt_va(ea, vsid, ssize);
+	vpn = hpt_vpn(ea, vsid, ssize);
 
-	slot = pSeries_lpar_hpte_find(va, psize, ssize);
+	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
 	BUG_ON(slot == -1);
 
 	flags = newpp & 7;
@@ -496,17 +362,18 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 	BUG_ON(lpar_rc != H_SUCCESS);
 }
 
-static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
-					 int psize, int ssize, int local)
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
+					 int psize, int apsize,
+					 int ssize, int local)
 {
 	unsigned long want_v;
 	unsigned long lpar_rc;
 	unsigned long dummy1, dummy2;
 
-	pr_debug("    inval : slot=%lx, va=%016lx, psize: %d, local: %d\n",
-		 slot, va, psize, local);
+	pr_devel("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
+		 slot, vpn, psize, local);
 
-	want_v = hpte_encode_avpn(va, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
 	if (lpar_rc == H_NOT_FOUND)
 		return;
@@ -514,39 +381,142 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
 	BUG_ON(lpar_rc != H_SUCCESS);
 }
 
+/*
+ * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
+ * to make sure that we avoid bouncing the hypervisor tlbie lock.
+ */
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+					     unsigned long *vpn, int count,
+					     int psize, int ssize)
+{
+	unsigned long param[8];
+	int i = 0, pix = 0, rc;
+	unsigned long flags = 0;
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	for (i = 0; i < count; i++) {
+
+		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
+						     ssize, 0);
+		} else {
+			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+			pix += 2;
+			if (pix == 8) {
+				rc = plpar_hcall9(H_BULK_REMOVE, param,
+						  param[0], param[1], param[2],
+						  param[3], param[4], param[5],
+						  param[6], param[7]);
+				BUG_ON(rc != H_SUCCESS);
+				pix = 0;
+			}
+		}
+	}
+	if (pix) {
+		param[pix] = HBR_END;
+		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+				  param[2], param[3], param[4], param[5],
+				  param[6], param[7]);
+		BUG_ON(rc != H_SUCCESS);
+	}
+
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i, index = 0;
+	unsigned long s_addr = addr;
+	unsigned int max_hpte_count, valid;
+	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+	unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+	for (i = 0; i < max_hpte_count; i++) {
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		slot_array[index] = slot;
+		vpn_array[index] = vpn;
+		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+			/*
+			 * Now do a bluk invalidate
+			 */
+			__pSeries_lpar_hugepage_invalidate(slot_array,
+							   vpn_array,
+							   PPC64_HUGE_HPTE_BATCH,
+							   psize, ssize);
+			index = 0;
+		} else
+			index++;
+	}
+	if (index)
+		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+						   index, psize, ssize);
+}
+
 static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 					   int psize, int ssize)
 {
-	unsigned long slot, vsid, va;
+	unsigned long vpn;
+	unsigned long slot, vsid;
 
 	vsid = get_kernel_vsid(ea, ssize);
-	va = hpt_va(ea, vsid, ssize);
+	vpn = hpt_vpn(ea, vsid, ssize);
 
-	slot = pSeries_lpar_hpte_find(va, psize, ssize);
+	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
 	BUG_ON(slot == -1);
-
-	pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0);
+	/*
+	 * lpar doesn't use the passed actual page size
+	 */
+	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
 }
 
-/* Flag bits for H_BULK_REMOVE */
-#define HBR_REQUEST	0x4000000000000000UL
-#define HBR_RESPONSE	0x8000000000000000UL
-#define HBR_END		0xc000000000000000UL
-#define HBR_AVPN	0x0200000000000000UL
-#define HBR_ANDCOND	0x0100000000000000UL
-
 /*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
  */
 static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 {
+	unsigned long vpn;
 	unsigned long i, pix, rc;
 	unsigned long flags = 0;
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 	unsigned long param[9];
-	unsigned long va;
 	unsigned long hash, index, shift, hidx, slot;
 	real_pte_t pte;
 	int psize, ssize;
@@ -558,21 +528,24 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 	ssize = batch->ssize;
 	pix = 0;
 	for (i = 0; i < number; i++) {
-		va = batch->vaddr[i];
+		vpn = batch->vpn[i];
 		pte = batch->pte[i];
-		pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
-			hash = hpt_hash(va, shift, ssize);
+		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+			hash = hpt_hash(vpn, shift, ssize);
 			hidx = __rpte_to_hidx(pte, index);
 			if (hidx & _PTEIDX_SECONDARY)
 				hash = ~hash;
 			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot += hidx & _PTEIDX_GROUP_IX;
 			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
-				pSeries_lpar_hpte_invalidate(slot, va, psize,
-							     ssize, local);
+				/*
+				 * lpar doesn't use the passed actual page size
+				 */
+				pSeries_lpar_hpte_invalidate(slot, vpn, psize,
+							     0, ssize, local);
 			} else {
 				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
-				param[pix+1] = hpte_encode_avpn(va, psize,
+				param[pix+1] = hpte_encode_avpn(vpn, psize,
 								ssize);
 				pix += 2;
 				if (pix == 8) {
@@ -598,6 +571,18 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
 }
 
+static int __init disable_bulk_remove(char *str)
+{
+	if (strcmp(str, "off") == 0 &&
+	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+			printk(KERN_INFO "Disabling BULK_REMOVE firmware feature");
+			powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
+	}
+	return 1;
+}
+
+__setup("bulk_remove=", disable_bulk_remove);
+
 void __init hpte_init_lpar(void)
 {
 	ppc_md.hpte_invalidate	= pSeries_lpar_hpte_invalidate;
@@ -608,4 +593,182 @@ void __init hpte_init_lpar(void)
 	ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
 	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
 	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+	ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
+}
+
+#ifdef CONFIG_PPC_SMLPAR
+#define CMO_FREE_HINT_DEFAULT 1
+static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
+
+static int __init cmo_free_hint(char *str)
+{
+	char *parm;
+	parm = strstrip(str);
+
+	if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
+		printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n");
+		cmo_free_hint_flag = 0;
+		return 1;
+	}
+
+	cmo_free_hint_flag = 1;
+	printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n");
+
+	if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
+		return 1;
+
+	return 0;
+}
+
+__setup("cmo_free_hint=", cmo_free_hint);
+
+static void pSeries_set_page_state(struct page *page, int order,
+				   unsigned long state)
+{
+	int i, j;
+	unsigned long cmo_page_sz, addr;
+
+	cmo_page_sz = cmo_get_page_size();
+	addr = __pa((unsigned long)page_address(page));
+
+	for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
+		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
+			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
+	}
+}
+
+void arch_free_page(struct page *page, int order)
+{
+	if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
+}
+EXPORT_SYMBOL(arch_free_page);
+
+#endif
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * We optimise our hcall path by placing hcall_tracepoint_refcount
+ * directly in the TOC so we can check if the hcall tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long hcall_tracepoint_refcount;
+
+/* 
+ * Since the tracing code might execute hcalls we need to guard against
+ * recursion. One example of this are spinlocks calling H_YIELD on
+ * shared processor partitions.
+ */
+static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
+
+void hcall_tracepoint_regfunc(void)
+{
+	hcall_tracepoint_refcount++;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+	hcall_tracepoint_refcount--;
+}
+
+void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	/*
+	 * We cannot call tracepoints inside RCU idle regions which
+	 * means we must not trace H_CEDE.
+	 */
+	if (opcode == H_CEDE)
+		return;
+
+	local_irq_save(flags);
+
+	depth = &__get_cpu_var(hcall_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	preempt_disable();
+	trace_hcall_entry(opcode, args);
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+
+void __trace_hcall_exit(long opcode, unsigned long retval,
+			unsigned long *retbuf)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	if (opcode == H_CEDE)
+		return;
+
+	local_irq_save(flags);
+
+	depth = &__get_cpu_var(hcall_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	trace_hcall_exit(opcode, retval, retbuf);
+	preempt_enable();
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+#endif
+
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+	mpp_data->entitled_mem = retbuf[0];
+	mpp_data->mapped_mem = retbuf[1];
+
+	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	mpp_data->pool_num = retbuf[2] & 0xffff;
+
+	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
+
+	mpp_data->pool_size = retbuf[4];
+	mpp_data->loan_request = retbuf[5];
+	mpp_data->backing_mem = retbuf[6];
+
+	return rc;
+}
+EXPORT_SYMBOL(h_get_mpp);
+
+int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
+
+	rc = plpar_hcall9(H_GET_MPP_X, retbuf);
+
+	mpp_x_data->coalesced_bytes = retbuf[0];
+	mpp_x_data->pool_coalesced_bytes = retbuf[1];
+	mpp_x_data->pool_purr_cycles = retbuf[2];
+	mpp_x_data->pool_spurr_cycles = retbuf[3];
+
+	return rc;
 }
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
new file mode 100644
index 00000000000..c9fecf09b8f
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -0,0 +1,710 @@
+/*
+ * PowerPC64 LPAR Configuration Information Driver
+ *
+ * Dave Engebretsen engebret@us.ibm.com
+ *    Copyright (c) 2003 Dave Engebretsen
+ * Will Schmidt willschm@us.ibm.com
+ *    SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation.
+ *    seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation.
+ * Nathan Lynch nathanl@austin.ibm.com
+ *    Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * This driver creates a proc file at /proc/ppc64/lparcfg which contains
+ * keyword - value pairs that specify the configuration of the partition.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/lppaca.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/rtas.h>
+#include <asm/time.h>
+#include <asm/prom.h>
+#include <asm/vdso_datapage.h>
+#include <asm/vio.h>
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+
+
+/*
+ * This isn't a module but we expose that to userspace
+ * via /proc so leave the definitions here
+ */
+#define MODULE_VERS "1.9"
+#define MODULE_NAME "lparcfg"
+
+/* #define LPARCFG_DEBUG */
+
+/*
+ * Track sum of all purrs across all processors. This is used to further
+ * calculate usage values by different applications
+ */
+static unsigned long get_purr(void)
+{
+	unsigned long sum_purr = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct cpu_usage *cu;
+
+		cu = &per_cpu(cpu_usage_array, cpu);
+		sum_purr += cu->current_tb;
+	}
+	return sum_purr;
+}
+
+/*
+ * Methods used to fetch LPAR data when running on a pSeries platform.
+ */
+
+struct hvcall_ppp_data {
+	u64	entitlement;
+	u64	unallocated_entitlement;
+	u16	group_num;
+	u16	pool_num;
+	u8	capped;
+	u8	weight;
+	u8	unallocated_weight;
+	u16	active_procs_in_pool;
+	u16	active_system_procs;
+	u16	phys_platform_procs;
+	u32	max_proc_cap_avail;
+	u32	entitled_proc_cap_avail;
+};
+
+/*
+ * H_GET_PPP hcall returns info in 4 parms.
+ *  entitled_capacity,unallocated_capacity,
+ *  aggregation, resource_capability).
+ *
+ *  R4 = Entitled Processor Capacity Percentage.
+ *  R5 = Unallocated Processor Capacity Percentage.
+ *  R6 (AABBCCDDEEFFGGHH).
+ *      XXXX - reserved (0)
+ *          XXXX - reserved (0)
+ *              XXXX - Group Number
+ *                  XXXX - Pool Number.
+ *  R7 (IIJJKKLLMMNNOOPP).
+ *      XX - reserved. (0)
+ *        XX - bit 0-6 reserved (0).   bit 7 is Capped indicator.
+ *          XX - variable processor Capacity Weight
+ *            XX - Unallocated Variable Processor Capacity Weight.
+ *              XXXX - Active processors in Physical Processor Pool.
+ *                  XXXX  - Processors active on platform.
+ *  R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1
+ *	XXXX - Physical platform procs allocated to virtualization.
+ *	    XXXXXX - Max procs capacity % available to the partitions pool.
+ *	          XXXXXX - Entitled procs capacity % available to the
+ *			   partitions pool.
+ */
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
+{
+	unsigned long rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_GET_PPP, retbuf);
+
+	ppp_data->entitlement = retbuf[0];
+	ppp_data->unallocated_entitlement = retbuf[1];
+
+	ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	ppp_data->pool_num = retbuf[2] & 0xffff;
+
+	ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+	ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+	ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+	ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+	ppp_data->active_system_procs = retbuf[3] & 0xffff;
+
+	ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8;
+	ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff;
+	ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff;
+
+	return rc;
+}
+
+static unsigned h_pic(unsigned long *pool_idle_time,
+		      unsigned long *num_procs)
+{
+	unsigned long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	rc = plpar_hcall(H_PIC, retbuf);
+
+	*pool_idle_time = retbuf[0];
+	*num_procs = retbuf[1];
+
+	return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+	struct hvcall_ppp_data ppp_data;
+	struct device_node *root;
+	const __be32 *perf_level;
+	int rc;
+
+	rc = h_get_ppp(&ppp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "partition_entitled_capacity=%lld\n",
+	           ppp_data.entitlement);
+	seq_printf(m, "group=%d\n", ppp_data.group_num);
+	seq_printf(m, "system_active_processors=%d\n",
+	           ppp_data.active_system_procs);
+
+	/* pool related entries are appropriate for shared configs */
+	if (lppaca_shared_proc(get_lppaca())) {
+		unsigned long pool_idle_time, pool_procs;
+
+		seq_printf(m, "pool=%d\n", ppp_data.pool_num);
+
+		/* report pool_capacity in percentage */
+		seq_printf(m, "pool_capacity=%d\n",
+			   ppp_data.active_procs_in_pool * 100);
+
+		h_pic(&pool_idle_time, &pool_procs);
+		seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+		seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+	}
+
+	seq_printf(m, "unallocated_capacity_weight=%d\n",
+		   ppp_data.unallocated_weight);
+	seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+	seq_printf(m, "capped=%d\n", ppp_data.capped);
+	seq_printf(m, "unallocated_capacity=%lld\n",
+		   ppp_data.unallocated_entitlement);
+
+	/* The last bits of information returned from h_get_ppp are only
+	 * valid if the ibm,partition-performance-parameters-level
+	 * property is >= 1.
+	 */
+	root = of_find_node_by_path("/");
+	if (root) {
+		perf_level = of_get_property(root,
+				"ibm,partition-performance-parameters-level",
+					     NULL);
+		if (perf_level && (be32_to_cpup(perf_level) >= 1)) {
+			seq_printf(m,
+			    "physical_procs_allocated_to_virtualization=%d\n",
+				   ppp_data.phys_platform_procs);
+			seq_printf(m, "max_proc_capacity_available=%d\n",
+				   ppp_data.max_proc_cap_avail);
+			seq_printf(m, "entitled_proc_capacity_available=%d\n",
+				   ppp_data.entitled_proc_cap_avail);
+		}
+
+		of_node_put(root);
+	}
+}
+
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+	struct hvcall_mpp_data mpp_data;
+	int rc;
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+	if (mpp_data.mapped_mem != -1)
+		seq_printf(m, "mapped_entitled_memory=%ld\n",
+		           mpp_data.mapped_mem);
+
+	seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+	seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+	seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+	seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+	           mpp_data.unallocated_mem_weight);
+	seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+	           mpp_data.unallocated_entitlement);
+
+	if (mpp_data.pool_size != -1)
+		seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+		           mpp_data.pool_size);
+
+	seq_printf(m, "entitled_memory_loan_request=%ld\n",
+	           mpp_data.loan_request);
+
+	seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
+}
+
+/**
+ * parse_mpp_x_data
+ * Parse out data returned from h_get_mpp_x
+ */
+static void parse_mpp_x_data(struct seq_file *m)
+{
+	struct hvcall_mpp_x_data mpp_x_data;
+
+	if (!firmware_has_feature(FW_FEATURE_XCMO))
+		return;
+	if (h_get_mpp_x(&mpp_x_data))
+		return;
+
+	seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes);
+
+	if (mpp_x_data.pool_coalesced_bytes)
+		seq_printf(m, "pool_coalesced_bytes=%ld\n",
+			   mpp_x_data.pool_coalesced_bytes);
+	if (mpp_x_data.pool_purr_cycles)
+		seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles);
+	if (mpp_x_data.pool_spurr_cycles)
+		seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles);
+}
+
+#define SPLPAR_CHARACTERISTICS_TOKEN 20
+#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
+
+/*
+ * parse_system_parameter_string()
+ * Retrieve the potential_processors, max_entitled_capacity and friends
+ * through the get-system-parameter rtas call.  Replace keyword strings as
+ * necessary.
+ */
+static void parse_system_parameter_string(struct seq_file *m)
+{
+	int call_status;
+
+	unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
+	if (!local_buffer) {
+		printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
+		       __FILE__, __func__, __LINE__);
+		return;
+	}
+
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				SPLPAR_CHARACTERISTICS_TOKEN,
+				__pa(rtas_data_buf),
+				RTAS_DATA_BUF_SIZE);
+	memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
+	local_buffer[SPLPAR_MAXLENGTH - 1] = '\0';
+	spin_unlock(&rtas_data_buf_lock);
+
+	if (call_status != 0) {
+		printk(KERN_INFO
+		       "%s %s Error calling get-system-parameter (0x%x)\n",
+		       __FILE__, __func__, call_status);
+	} else {
+		int splpar_strlen;
+		int idx, w_idx;
+		char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
+		if (!workbuffer) {
+			printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
+			       __FILE__, __func__, __LINE__);
+			kfree(local_buffer);
+			return;
+		}
+#ifdef LPARCFG_DEBUG
+		printk(KERN_INFO "success calling get-system-parameter\n");
+#endif
+		splpar_strlen = local_buffer[0] * 256 + local_buffer[1];
+		local_buffer += 2;	/* step over strlen value */
+
+		w_idx = 0;
+		idx = 0;
+		while ((*local_buffer) && (idx < splpar_strlen)) {
+			workbuffer[w_idx++] = local_buffer[idx++];
+			if ((local_buffer[idx] == ',')
+			    || (local_buffer[idx] == '\0')) {
+				workbuffer[w_idx] = '\0';
+				if (w_idx) {
+					/* avoid the empty string */
+					seq_printf(m, "%s\n", workbuffer);
+				}
+				memset(workbuffer, 0, SPLPAR_MAXLENGTH);
+				idx++;	/* skip the comma */
+				w_idx = 0;
+			} else if (local_buffer[idx] == '=') {
+				/* code here to replace workbuffer contents
+				   with different keyword strings */
+				if (0 == strcmp(workbuffer, "MaxEntCap")) {
+					strcpy(workbuffer,
+					       "partition_max_entitled_capacity");
+					w_idx = strlen(workbuffer);
+				}
+				if (0 == strcmp(workbuffer, "MaxPlatProcs")) {
+					strcpy(workbuffer,
+					       "system_potential_processors");
+					w_idx = strlen(workbuffer);
+				}
+			}
+		}
+		kfree(workbuffer);
+		local_buffer -= 2;	/* back up over strlen value */
+	}
+	kfree(local_buffer);
+}
+
+/* Return the number of processors in the system.
+ * This function reads through the device tree and counts
+ * the virtual processors, this does not include threads.
+ */
+static int lparcfg_count_active_processors(void)
+{
+	struct device_node *cpus_dn = NULL;
+	int count = 0;
+
+	while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) {
+#ifdef LPARCFG_DEBUG
+		printk(KERN_ERR "cpus_dn %p\n", cpus_dn);
+#endif
+		count++;
+	}
+	return count;
+}
+
+static void pseries_cmo_data(struct seq_file *m)
+{
+	int cpu;
+	unsigned long cmo_faults = 0;
+	unsigned long cmo_fault_time = 0;
+
+	seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO));
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cmo_faults += be64_to_cpu(lppaca_of(cpu).cmo_faults);
+		cmo_fault_time += be64_to_cpu(lppaca_of(cpu).cmo_fault_time);
+	}
+
+	seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+	seq_printf(m, "cmo_fault_time_usec=%lu\n",
+		   cmo_fault_time / tb_ticks_per_usec);
+	seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp());
+	seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp());
+	seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size());
+}
+
+static void splpar_dispatch_data(struct seq_file *m)
+{
+	int cpu;
+	unsigned long dispatches = 0;
+	unsigned long dispatch_dispersions = 0;
+
+	for_each_possible_cpu(cpu) {
+		dispatches += be32_to_cpu(lppaca_of(cpu).yield_count);
+		dispatch_dispersions +=
+			be32_to_cpu(lppaca_of(cpu).dispersion_count);
+	}
+
+	seq_printf(m, "dispatches=%lu\n", dispatches);
+	seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions);
+}
+
+static void parse_em_data(struct seq_file *m)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	if (firmware_has_feature(FW_FEATURE_LPAR) &&
+	    plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS)
+		seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]);
+}
+
+static int pseries_lparcfg_data(struct seq_file *m, void *v)
+{
+	int partition_potential_processors;
+	int partition_active_processors;
+	struct device_node *rtas_node;
+	const __be32 *lrdrp = NULL;
+
+	rtas_node = of_find_node_by_path("/rtas");
+	if (rtas_node)
+		lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL);
+
+	if (lrdrp == NULL) {
+		partition_potential_processors = vdso_data->processorCount;
+	} else {
+		partition_potential_processors = be32_to_cpup(lrdrp + 4);
+	}
+	of_node_put(rtas_node);
+
+	partition_active_processors = lparcfg_count_active_processors();
+
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		/* this call handles the ibm,get-system-parameter contents */
+		parse_system_parameter_string(m);
+		parse_ppp_data(m);
+		parse_mpp_data(m);
+		parse_mpp_x_data(m);
+		pseries_cmo_data(m);
+		splpar_dispatch_data(m);
+
+		seq_printf(m, "purr=%ld\n", get_purr());
+	} else {		/* non SPLPAR case */
+
+		seq_printf(m, "system_active_processors=%d\n",
+			   partition_potential_processors);
+
+		seq_printf(m, "system_potential_processors=%d\n",
+			   partition_potential_processors);
+
+		seq_printf(m, "partition_max_entitled_capacity=%d\n",
+			   partition_potential_processors * 100);
+
+		seq_printf(m, "partition_entitled_capacity=%d\n",
+			   partition_active_processors * 100);
+	}
+
+	seq_printf(m, "partition_active_processors=%d\n",
+		   partition_active_processors);
+
+	seq_printf(m, "partition_potential_processors=%d\n",
+		   partition_potential_processors);
+
+	seq_printf(m, "shared_processor_mode=%d\n",
+		   lppaca_shared_proc(get_lppaca()));
+
+	seq_printf(m, "slb_size=%d\n", mmu_slb_size);
+
+	parse_em_data(m);
+
+	return 0;
+}
+
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_ppp_data ppp_data;
+	u8 new_weight;
+	u64 new_entitled;
+	ssize_t retval;
+
+	/* Get our current parameters */
+	retval = h_get_ppp(&ppp_data);
+	if (retval)
+		return retval;
+
+	if (entitlement) {
+		new_weight = ppp_data.weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = ppp_data.entitlement;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %llu, current_weight = %u\n",
+		 __func__, ppp_data.entitlement, ppp_data.weight);
+
+	pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
+		 __func__, new_entitled, new_weight);
+
+	retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+	return retval;
+}
+
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition.  Caller must
+ * specify either a new entitlement or weight, not both, to be updated
+ * since the h_set_mpp call takes both entitlement and weight as parameters.
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_mpp_data mpp_data;
+	u64 new_entitled;
+	u8 new_weight;
+	ssize_t rc;
+
+	if (entitlement) {
+		/* Check with vio to ensure the new memory entitlement
+		 * can be handled.
+		 */
+		rc = vio_cmo_entitlement_update(*entitlement);
+		if (rc)
+			return rc;
+	}
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return rc;
+
+	if (entitlement) {
+		new_weight = mpp_data.mem_weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = mpp_data.entitled_mem;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+	         __func__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+	pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
+		 __func__, new_entitled, new_weight);
+
+	rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+	return rc;
+}
+
+/*
+ * Interface for changing system parameters (variable capacity weight
+ * and entitled capacity).  Format of input is "param_name=value";
+ * anything after value is ignored.  Valid parameters at this time are
+ * "partition_entitled_capacity" and "capacity_weight".  We use
+ * H_SET_PPP to alter parameters.
+ *
+ * This function should be invoked only on systems with
+ * FW_FEATURE_SPLPAR.
+ */
+static ssize_t lparcfg_write(struct file *file, const char __user * buf,
+			     size_t count, loff_t * off)
+{
+	int kbuf_sz = 64;
+	char kbuf[kbuf_sz];
+	char *tmp;
+	u64 new_entitled, *new_entitled_ptr = &new_entitled;
+	u8 new_weight, *new_weight_ptr = &new_weight;
+	ssize_t retval;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return -EINVAL;
+
+	if (count > kbuf_sz)
+		return -EINVAL;
+
+	if (copy_from_user(kbuf, buf, count))
+		return -EFAULT;
+
+	kbuf[count - 1] = '\0';
+	tmp = strchr(kbuf, '=');
+	if (!tmp)
+		return -EINVAL;
+
+	*tmp++ = '\0';
+
+	if (!strcmp(kbuf, "partition_entitled_capacity")) {
+		char *endp;
+		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
+
+		retval = update_ppp(new_entitled_ptr, NULL);
+	} else if (!strcmp(kbuf, "capacity_weight")) {
+		char *endp;
+		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
+
+		retval = update_ppp(NULL, new_weight_ptr);
+	} else if (!strcmp(kbuf, "entitled_memory")) {
+		char *endp;
+		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
+
+		retval = update_mpp(new_entitled_ptr, NULL);
+	} else if (!strcmp(kbuf, "entitled_memory_weight")) {
+		char *endp;
+		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			return -EINVAL;
+
+		retval = update_mpp(NULL, new_weight_ptr);
+	} else
+		return -EINVAL;
+
+	if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
+		retval = count;
+	} else if (retval == H_BUSY) {
+		retval = -EBUSY;
+	} else if (retval == H_HARDWARE) {
+		retval = -EIO;
+	} else if (retval == H_PARAMETER) {
+		retval = -EINVAL;
+	}
+
+	return retval;
+}
+
+static int lparcfg_data(struct seq_file *m, void *v)
+{
+	struct device_node *rootdn;
+	const char *model = "";
+	const char *system_id = "";
+	const char *tmp;
+	const __be32 *lp_index_ptr;
+	unsigned int lp_index = 0;
+
+	seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS);
+
+	rootdn = of_find_node_by_path("/");
+	if (rootdn) {
+		tmp = of_get_property(rootdn, "model", NULL);
+		if (tmp)
+			model = tmp;
+		tmp = of_get_property(rootdn, "system-id", NULL);
+		if (tmp)
+			system_id = tmp;
+		lp_index_ptr = of_get_property(rootdn, "ibm,partition-no",
+					NULL);
+		if (lp_index_ptr)
+			lp_index = be32_to_cpup(lp_index_ptr);
+		of_node_put(rootdn);
+	}
+	seq_printf(m, "serial_number=%s\n", system_id);
+	seq_printf(m, "system_type=%s\n", model);
+	seq_printf(m, "partition_id=%d\n", (int)lp_index);
+
+	return pseries_lparcfg_data(m, v);
+}
+
+static int lparcfg_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lparcfg_data, NULL);
+}
+
+static const struct file_operations lparcfg_fops = {
+	.read		= seq_read,
+	.write		= lparcfg_write,
+	.open		= lparcfg_open,
+	.release	= single_release,
+	.llseek		= seq_lseek,
+};
+
+static int __init lparcfg_init(void)
+{
+	umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
+
+	/* Allow writing if we have FW_FEATURE_SPLPAR */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR))
+		mode |= S_IWUSR;
+
+	if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) {
+		printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
+		return -EIO;
+	}
+	return 0;
+}
+machine_device_initcall(pseries, lparcfg_init);
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
new file mode 100644
index 00000000000..bde7ebad394
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -0,0 +1,365 @@
+/*
+ * Support for Partition Mobility/Migration
+ *
+ * Copyright (C) 2010 Nathan Fontenot
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include <asm/rtas.h>
+#include "pseries.h"
+
+static struct kobject *mobility_kobj;
+
+struct update_props_workarea {
+	u32 phandle;
+	u32 state;
+	u64 reserved;
+	u32 nprops;
+} __packed;
+
+#define NODE_ACTION_MASK	0xff000000
+#define NODE_COUNT_MASK		0x00ffffff
+
+#define DELETE_DT_NODE	0x01000000
+#define UPDATE_DT_NODE	0x02000000
+#define ADD_DT_NODE	0x03000000
+
+#define MIGRATION_SCOPE	(1)
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
+{
+	int rc;
+
+	spin_lock(&rtas_data_buf_lock);
+
+	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
+	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
+	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+	spin_unlock(&rtas_data_buf_lock);
+	return rc;
+}
+
+static int delete_dt_node(u32 phandle)
+{
+	struct device_node *dn;
+
+	dn = of_find_node_by_phandle(phandle);
+	if (!dn)
+		return -ENOENT;
+
+	dlpar_detach_node(dn);
+	of_node_put(dn);
+	return 0;
+}
+
+static int update_dt_property(struct device_node *dn, struct property **prop,
+			      const char *name, u32 vd, char *value)
+{
+	struct property *new_prop = *prop;
+	int more = 0;
+
+	/* A negative 'vd' value indicates that only part of the new property
+	 * value is contained in the buffer and we need to call
+	 * ibm,update-properties again to get the rest of the value.
+	 *
+	 * A negative value is also the two's compliment of the actual value.
+	 */
+	if (vd & 0x80000000) {
+		vd = ~vd + 1;
+		more = 1;
+	}
+
+	if (new_prop) {
+		/* partial property fixup */
+		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
+		if (!new_data)
+			return -ENOMEM;
+
+		memcpy(new_data, new_prop->value, new_prop->length);
+		memcpy(new_data + new_prop->length, value, vd);
+
+		kfree(new_prop->value);
+		new_prop->value = new_data;
+		new_prop->length += vd;
+	} else {
+		new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+		if (!new_prop)
+			return -ENOMEM;
+
+		new_prop->name = kstrdup(name, GFP_KERNEL);
+		if (!new_prop->name) {
+			kfree(new_prop);
+			return -ENOMEM;
+		}
+
+		new_prop->length = vd;
+		new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
+		if (!new_prop->value) {
+			kfree(new_prop->name);
+			kfree(new_prop);
+			return -ENOMEM;
+		}
+
+		memcpy(new_prop->value, value, vd);
+		*prop = new_prop;
+	}
+
+	if (!more) {
+		of_update_property(dn, new_prop);
+		*prop = NULL;
+	}
+
+	return 0;
+}
+
+static int update_dt_node(u32 phandle, s32 scope)
+{
+	struct update_props_workarea *upwa;
+	struct device_node *dn;
+	struct property *prop = NULL;
+	int i, rc, rtas_rc;
+	char *prop_data;
+	char *rtas_buf;
+	int update_properties_token;
+	u32 vd;
+
+	update_properties_token = rtas_token("ibm,update-properties");
+	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!rtas_buf)
+		return -ENOMEM;
+
+	dn = of_find_node_by_phandle(phandle);
+	if (!dn) {
+		kfree(rtas_buf);
+		return -ENOENT;
+	}
+
+	upwa = (struct update_props_workarea *)&rtas_buf[0];
+	upwa->phandle = phandle;
+
+	do {
+		rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
+					scope);
+		if (rtas_rc < 0)
+			break;
+
+		prop_data = rtas_buf + sizeof(*upwa);
+
+		/* On the first call to ibm,update-properties for a node the
+		 * the first property value descriptor contains an empty
+		 * property name, the property value length encoded as u32,
+		 * and the property value is the node path being updated.
+		 */
+		if (*prop_data == 0) {
+			prop_data++;
+			vd = *(u32 *)prop_data;
+			prop_data += vd + sizeof(vd);
+			upwa->nprops--;
+		}
+
+		for (i = 0; i < upwa->nprops; i++) {
+			char *prop_name;
+
+			prop_name = prop_data;
+			prop_data += strlen(prop_name) + 1;
+			vd = *(u32 *)prop_data;
+			prop_data += sizeof(vd);
+
+			switch (vd) {
+			case 0x00000000:
+				/* name only property, nothing to do */
+				break;
+
+			case 0x80000000:
+				prop = of_find_property(dn, prop_name, NULL);
+				of_remove_property(dn, prop);
+				prop = NULL;
+				break;
+
+			default:
+				rc = update_dt_property(dn, &prop, prop_name,
+							vd, prop_data);
+				if (rc) {
+					printk(KERN_ERR "Could not update %s"
+					       " property\n", prop_name);
+				}
+
+				prop_data += vd;
+			}
+		}
+	} while (rtas_rc == 1);
+
+	of_node_put(dn);
+	kfree(rtas_buf);
+	return 0;
+}
+
+static int add_dt_node(u32 parent_phandle, u32 drc_index)
+{
+	struct device_node *dn;
+	struct device_node *parent_dn;
+	int rc;
+
+	parent_dn = of_find_node_by_phandle(parent_phandle);
+	if (!parent_dn)
+		return -ENOENT;
+
+	dn = dlpar_configure_connector(drc_index, parent_dn);
+	if (!dn)
+		return -ENOENT;
+
+	rc = dlpar_attach_node(dn);
+	if (rc)
+		dlpar_free_cc_nodes(dn);
+
+	of_node_put(parent_dn);
+	return rc;
+}
+
+int pseries_devicetree_update(s32 scope)
+{
+	char *rtas_buf;
+	u32 *data;
+	int update_nodes_token;
+	int rc;
+
+	update_nodes_token = rtas_token("ibm,update-nodes");
+	if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!rtas_buf)
+		return -ENOMEM;
+
+	do {
+		rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
+		if (rc && rc != 1)
+			break;
+
+		data = (u32 *)rtas_buf + 4;
+		while (*data & NODE_ACTION_MASK) {
+			int i;
+			u32 action = *data & NODE_ACTION_MASK;
+			int node_count = *data & NODE_COUNT_MASK;
+
+			data++;
+
+			for (i = 0; i < node_count; i++) {
+				u32 phandle = *data++;
+				u32 drc_index;
+
+				switch (action) {
+				case DELETE_DT_NODE:
+					delete_dt_node(phandle);
+					break;
+				case UPDATE_DT_NODE:
+					update_dt_node(phandle, scope);
+					break;
+				case ADD_DT_NODE:
+					drc_index = *data++;
+					add_dt_node(phandle, drc_index);
+					break;
+				}
+			}
+		}
+	} while (rc == 1);
+
+	kfree(rtas_buf);
+	return rc;
+}
+
+void post_mobility_fixup(void)
+{
+	int rc;
+	int activate_fw_token;
+
+	activate_fw_token = rtas_token("ibm,activate-firmware");
+	if (activate_fw_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "Could not make post-mobility "
+		       "activate-fw call.\n");
+		return;
+	}
+
+	do {
+		rc = rtas_call(activate_fw_token, 0, 1, NULL);
+	} while (rtas_busy_delay(rc));
+
+	if (rc)
+		printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
+
+	rc = pseries_devicetree_update(MIGRATION_SCOPE);
+	if (rc)
+		printk(KERN_ERR "Post-mobility device tree update "
+			"failed: %d\n", rc);
+
+	return;
+}
+
+static ssize_t migrate_store(struct class *class, struct class_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct rtas_args args;
+	u64 streamid;
+	int rc;
+
+	rc = strict_strtoull(buf, 0, &streamid);
+	if (rc)
+		return rc;
+
+	memset(&args, 0, sizeof(args));
+	args.token = rtas_token("ibm,suspend-me");
+	args.nargs = 2;
+	args.nret = 1;
+
+	args.args[0] = streamid >> 32 ;
+	args.args[1] = streamid & 0xffffffff;
+	args.rets = &args.args[args.nargs];
+
+	do {
+		args.rets[0] = 0;
+		rc = rtas_ibm_suspend_me(&args);
+		if (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE)
+			ssleep(1);
+	} while (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE);
+
+	if (rc)
+		return rc;
+	else if (args.rets[0])
+		return args.rets[0];
+
+	post_mobility_fixup();
+	return count;
+}
+
+static CLASS_ATTR(migration, S_IWUSR, NULL, migrate_store);
+
+static int __init mobility_sysfs_init(void)
+{
+	int rc;
+
+	mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
+	if (!mobility_kobj)
+		return -ENOMEM;
+
+	rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
+
+	return rc;
+}
+device_initcall(mobility_sysfs_init);
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index bf2e1ac4130..0c882e83c4c 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -24,26 +24,7 @@ static int query_token, change_token;
 #define RTAS_RESET_FN		2
 #define RTAS_CHANGE_MSI_FN	3
 #define RTAS_CHANGE_MSIX_FN	4
-
-static struct pci_dn *get_pdn(struct pci_dev *pdev)
-{
-	struct device_node *dn;
-	struct pci_dn *pdn;
-
-	dn = pci_device_to_OF_node(pdev);
-	if (!dn) {
-		dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n");
-		return NULL;
-	}
-
-	pdn = PCI_DN(dn);
-	if (!pdn) {
-		dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n");
-		return NULL;
-	}
-
-	return pdn;
-}
+#define RTAS_CHANGE_32MSI_FN	5
 
 /* RTAS Helpers */
 
@@ -58,7 +39,8 @@ static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
 
 	seq_num = 1;
 	do {
-		if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN)
+		if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN ||
+		    func == RTAS_CHANGE_32MSI_FN)
 			rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
 					BUID_HI(buid), BUID_LO(buid),
 					func, num_irqs, seq_num);
@@ -89,12 +71,22 @@ static void rtas_disable_msi(struct pci_dev *pdev)
 {
 	struct pci_dn *pdn;
 
-	pdn = get_pdn(pdev);
+	pdn = pci_get_pdn(pdev);
 	if (!pdn)
 		return;
 
-	if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0)
-		pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+	/*
+	 * disabling MSI with the explicit interface also disables MSI-X
+	 */
+	if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
+		/* 
+		 * may have failed because explicit interface is not
+		 * present
+		 */
+		if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
+			pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+		}
+	}
 }
 
 static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
@@ -127,7 +119,7 @@ static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
 		if (entry->irq == NO_IRQ)
 			continue;
 
-		set_irq_msi(entry->irq, NULL);
+		irq_set_msi_desc(entry->irq, NULL);
 		irq_dispose_mapping(entry->irq);
 	}
 
@@ -138,27 +130,29 @@ static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
 {
 	struct device_node *dn;
 	struct pci_dn *pdn;
-	const u32 *req_msi;
+	const __be32 *p;
+	u32 req_msi;
 
-	pdn = get_pdn(pdev);
+	pdn = pci_get_pdn(pdev);
 	if (!pdn)
 		return -ENODEV;
 
 	dn = pdn->node;
 
-	req_msi = of_get_property(dn, prop_name, NULL);
-	if (!req_msi) {
+	p = of_get_property(dn, prop_name, NULL);
+	if (!p) {
 		pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name);
 		return -ENOENT;
 	}
 
-	if (*req_msi < nvec) {
+	req_msi = be32_to_cpup(p);
+	if (req_msi < nvec) {
 		pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec);
 
-		if (*req_msi == 0) /* Be paranoid */
+		if (req_msi == 0) /* Be paranoid */
 			return -ENOSPC;
 
-		return *req_msi;
+		return req_msi;
 	}
 
 	return 0;
@@ -179,7 +173,7 @@ static int check_req_msix(struct pci_dev *pdev, int nvec)
 static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
 {
 	struct device_node *dn;
-	const u32 *p;
+	const __be32 *p;
 
 	dn = of_node_get(pci_device_to_OF_node(dev));
 	while (dn) {
@@ -187,7 +181,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
 		if (p) {
 			pr_debug("rtas_msi: found prop on dn %s\n",
 				dn->full_name);
-			*total = *p;
+			*total = be32_to_cpup(p);
 			return dn;
 		}
 
@@ -200,6 +194,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
 	struct device_node *dn;
+	struct eeh_dev *edev;
 
 	/* Found our PE and assume 8 at that point. */
 
@@ -207,7 +202,11 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 	if (!dn)
 		return NULL;
 
-	dn = find_device_pe(dn);
+	/* Get the top level device in the PE */
+	edev = of_node_to_eeh_dev(dn);
+	if (edev->pe)
+		edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
+	dn = eeh_dev_to_of_node(edev);
 	if (!dn)
 		return NULL;
 
@@ -235,13 +234,13 @@ struct msi_counts {
 static void *count_non_bridge_devices(struct device_node *dn, void *data)
 {
 	struct msi_counts *counts = data;
-	const u32 *p;
+	const __be32 *p;
 	u32 class;
 
 	pr_debug("rtas_msi: counting %s\n", dn->full_name);
 
 	p = of_get_property(dn, "class-code", NULL);
-	class = p ? *p : 0;
+	class = p ? be32_to_cpup(p) : 0;
 
 	if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
 		counts->num_devices++;
@@ -252,7 +251,7 @@ static void *count_non_bridge_devices(struct device_node *dn, void *data)
 static void *count_spare_msis(struct device_node *dn, void *data)
 {
 	struct msi_counts *counts = data;
-	const u32 *p;
+	const __be32 *p;
 	int req;
 
 	if (dn == counts->requestor)
@@ -263,11 +262,11 @@ static void *count_spare_msis(struct device_node *dn, void *data)
 		req = 0;
 		p = of_get_property(dn, "ibm,req#msi", NULL);
 		if (p)
-			req = *p;
+			req = be32_to_cpup(p);
 
 		p = of_get_property(dn, "ibm,req#msi-x", NULL);
 		if (p)
-			req = max(req, (int)*p);
+			req = max(req, (int)be32_to_cpup(p));
 	}
 
 	if (req < counts->quota)
@@ -377,14 +376,33 @@ static int check_msix_entries(struct pci_dev *pdev)
 	return 0;
 }
 
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
+{
+	u32 addr_hi, addr_lo;
+
+	/*
+	 * We should only get in here for IODA1 configs. This is based on the
+	 * fact that we using RTAS for MSIs, we don't have the 32 bit MSI RTAS
+	 * support, and we are in a PCIe Gen2 slot.
+	 */
+	dev_info(&pdev->dev,
+		 "rtas_msi: No 32 bit MSI firmware support, forcing 32 bit MSI\n");
+	pci_read_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, &addr_hi);
+	addr_lo = 0xffff0000 | ((addr_hi >> (48 - 32)) << 4);
+	pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, addr_lo);
+	pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
 {
 	struct pci_dn *pdn;
 	int hwirq, virq, i, rc;
 	struct msi_desc *entry;
 	struct msi_msg msg;
+	int nvec = nvec_in;
+	int use_32bit_msi_hack = 0;
 
-	pdn = get_pdn(pdev);
+	pdn = pci_get_pdn(pdev);
 	if (!pdn)
 		return -ENODEV;
 
@@ -392,21 +410,57 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 		return -EINVAL;
 
 	/*
+	 * Firmware currently refuse any non power of two allocation
+	 * so we round up if the quota will allow it.
+	 */
+	if (type == PCI_CAP_ID_MSIX) {
+		int m = roundup_pow_of_two(nvec);
+		int quota = msi_quota_for_device(pdev, m);
+
+		if (quota >= m)
+			nvec = m;
+	}
+
+	/*
 	 * Try the new more explicit firmware interface, if that fails fall
 	 * back to the old interface. The old interface is known to never
 	 * return MSI-Xs.
 	 */
+again:
 	if (type == PCI_CAP_ID_MSI) {
-		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
+		if (pdn->force_32bit_msi) {
+			rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec);
+			if (rc < 0) {
+				/*
+				 * We only want to run the 32 bit MSI hack below if
+				 * the max bus speed is Gen2 speed
+				 */
+				if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT)
+					return rc;
+
+				use_32bit_msi_hack = 1;
+			}
+		} else
+			rc = -1;
+
+		if (rc < 0)
+			rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
 
 		if (rc < 0) {
 			pr_debug("rtas_msi: trying the old firmware call.\n");
 			rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
 		}
+
+		if (use_32bit_msi_hack && rc > 0)
+			rtas_hack_32bit_msi_gen2(pdev);
 	} else
 		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec);
 
 	if (rc != nvec) {
+		if (nvec != nvec_in) {
+			nvec = nvec_in;
+			goto again;
+		}
 		pr_debug("rtas_msi: rtas_change_msi() failed\n");
 		return rc;
 	}
@@ -427,13 +481,11 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 		}
 
 		dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
-		set_irq_msi(virq, entry);
+		irq_set_msi_desc(virq, entry);
 
 		/* Read config space back so we can restore after reset */
 		read_msi_msg(virq, &msg);
 		entry->msg = msg;
-
-		unmask_msi_irq(virq);
 	}
 
 	return 0;
@@ -481,3 +533,4 @@ static int rtas_msi_init(void)
 	return 0;
 }
 arch_initcall(rtas_msi_init);
+
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 42f7e384e6c..0cc240b7f69 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -15,19 +15,145 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/kmsg_dump.h>
+#include <linux/pstore.h>
+#include <linux/ctype.h>
+#include <linux/zlib.h>
 #include <asm/uaccess.h>
 #include <asm/nvram.h>
 #include <asm/rtas.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
 
+/* Max bytes to read/write in one go */
+#define NVRW_CNT 0x20
+
+/*
+ * Set oops header version to distinguish between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
 static unsigned int nvram_size;
 static int nvram_fetch, nvram_store;
 static char nvram_buf[NVRW_CNT];	/* assume this is in the first 4GB */
 static DEFINE_SPINLOCK(nvram_lock);
 
+struct err_log_info {
+	__be32 error_type;
+	__be32 seq_num;
+};
+
+struct nvram_os_partition {
+	const char *name;
+	int req_size;	/* desired size, in bytes */
+	int min_size;	/* minimum acceptable size (0 means req_size) */
+	long size;	/* size of data portion (excluding err_log_info) */
+	long index;	/* offset of data portion of partition */
+	bool os_partition; /* partition initialized by OS, not FW */
+};
+
+static struct nvram_os_partition rtas_log_partition = {
+	.name = "ibm,rtas-log",
+	.req_size = 2079,
+	.min_size = 1055,
+	.index = -1,
+	.os_partition = true
+};
+
+static struct nvram_os_partition oops_log_partition = {
+	.name = "lnx,oops-log",
+	.req_size = 4000,
+	.min_size = 2000,
+	.index = -1,
+	.os_partition = true
+};
+
+static const char *pseries_nvram_os_partitions[] = {
+	"ibm,rtas-log",
+	"lnx,oops-log",
+	NULL
+};
+
+struct oops_log_info {
+	__be16 version;
+	__be16 report_length;
+	__be64 timestamp;
+} __attribute__((packed));
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+			  enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+	.dump = oops_to_nvram
+};
+
+/* See clobbering_unread_rtas_event() */
+#define NVRAM_RTAS_READ_TIMEOUT 5		/* seconds */
+static unsigned long last_unread_rtas_event;	/* timestamp */
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a oops header.
+ * oops header has u16 holding the version of oops header (to differentiate
+ * between old and new format header) followed by u16 holding the length of
+ * the compressed* text (*Or uncompressed, if compression fails.) and u64
+ * holding the timestamp. oops_buf[] gets written to NVRAM.
+ *
+ * oops_log_info points to the header. oops_data points to the compressed text.
+ *
+ * +- oops_buf
+ * |                                   +- oops_data
+ * v                                   v
+ * +-----------+-----------+-----------+------------------------+
+ * | version   | length    | timestamp | text                   |
+ * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes)   |
+ * +-----------+-----------+-----------+------------------------+
+ * ^
+ * +- oops_log_info
+ *
+ * We preallocate these buffers during init to avoid kmalloc during oops/panic.
+ */
+static size_t big_oops_buf_sz;
+static char *big_oops_buf, *oops_buf;
+static char *oops_data;
+static size_t oops_data_sz;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
+
+#ifdef CONFIG_PSTORE
+static struct nvram_os_partition of_config_partition = {
+	.name = "of-config",
+	.index = -1,
+	.os_partition = false
+};
+
+static struct nvram_os_partition common_partition = {
+	.name = "common",
+	.index = -1,
+	.os_partition = false
+};
+
+static enum pstore_type_id nvram_type_ids[] = {
+	PSTORE_TYPE_DMESG,
+	PSTORE_TYPE_PPC_RTAS,
+	PSTORE_TYPE_PPC_OF,
+	PSTORE_TYPE_PPC_COMMON,
+	-1
+};
+static int read_type;
+static unsigned long last_rtas_event;
+#endif
 
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
 {
@@ -120,10 +246,564 @@ static ssize_t pSeries_nvram_get_size(void)
 	return nvram_size ? nvram_size : -ENODEV;
 }
 
+
+/* nvram_write_os_partition, nvram_write_error_log
+ *
+ * We need to buffer the error logs into nvram to ensure that we have
+ * the failure information to decode.  If we have a severe error there
+ * is no way to guarantee that the OS or the machine is in a state to
+ * get back to user land and write the error to disk.  For example if
+ * the SCSI device driver causes a Machine Check by writing to a bad
+ * IO address, there is no way of guaranteeing that the device driver
+ * is in any state that is would also be able to write the error data
+ * captured to disk, thus we buffer it in NVRAM for analysis on the
+ * next boot.
+ *
+ * In NVRAM the partition containing the error log buffer will looks like:
+ * Header (in bytes):
+ * +-----------+----------+--------+------------+------------------+
+ * | signature | checksum | length | name       | data             |
+ * |0          |1         |2      3|4         15|16        length-1|
+ * +-----------+----------+--------+------------+------------------+
+ *
+ * The 'data' section would look like (in bytes):
+ * +--------------+------------+-----------------------------------+
+ * | event_logged | sequence # | error log                         |
+ * |0            3|4          7|8                  error_log_size-1|
+ * +--------------+------------+-----------------------------------+
+ *
+ * event_logged: 0 if event has not been logged to syslog, 1 if it has
+ * sequence #: The unique sequence # for each event. (until it wraps)
+ * error log: The error log from event_scan
+ */
+int nvram_write_os_partition(struct nvram_os_partition *part, char * buff,
+		int length, unsigned int err_type, unsigned int error_log_cnt)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+	
+	if (part->index == -1) {
+		return -ESPIPE;
+	}
+
+	if (length > part->size) {
+		length = part->size;
+	}
+
+	info.error_type = cpu_to_be32(err_type);
+	info.seq_num = cpu_to_be32(error_log_cnt);
+
+	tmp_index = part->index;
+
+	rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_write (%d)\n", __func__, rc);
+		return rc;
+	}
+
+	rc = ppc_md.nvram_write(buff, length, &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_write (%d)\n", __func__, rc);
+		return rc;
+	}
+	
+	return 0;
+}
+
+int nvram_write_error_log(char * buff, int length,
+                          unsigned int err_type, unsigned int error_log_cnt)
+{
+	int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
+						err_type, error_log_cnt);
+	if (!rc) {
+		last_unread_rtas_event = get_seconds();
+#ifdef CONFIG_PSTORE
+		last_rtas_event = get_seconds();
+#endif
+	}
+
+	return rc;
+}
+
+/* nvram_read_partition
+ *
+ * Reads nvram partition for at most 'length'
+ */
+int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+			int length, unsigned int *err_type,
+			unsigned int *error_log_cnt)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+	
+	if (part->index == -1)
+		return -1;
+
+	if (length > part->size)
+		length = part->size;
+
+	tmp_index = part->index;
+
+	if (part->os_partition) {
+		rc = ppc_md.nvram_read((char *)&info,
+					sizeof(struct err_log_info),
+					&tmp_index);
+		if (rc <= 0) {
+			pr_err("%s: Failed nvram_read (%d)\n", __func__, rc);
+			return rc;
+		}
+	}
+
+	rc = ppc_md.nvram_read(buff, length, &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_read (%d)\n", __func__, rc);
+		return rc;
+	}
+
+	if (part->os_partition) {
+		*error_log_cnt = be32_to_cpu(info.seq_num);
+		*err_type = be32_to_cpu(info.error_type);
+	}
+
+	return 0;
+}
+
+/* nvram_read_error_log
+ *
+ * Reads nvram for error log for at most 'length'
+ */
+int nvram_read_error_log(char *buff, int length,
+			unsigned int *err_type, unsigned int *error_log_cnt)
+{
+	return nvram_read_partition(&rtas_log_partition, buff, length,
+						err_type, error_log_cnt);
+}
+
+/* This doesn't actually zero anything, but it sets the event_logged
+ * word to tell that this event is safely in syslog.
+ */
+int nvram_clear_error_log(void)
+{
+	loff_t tmp_index;
+	int clear_word = ERR_FLAG_ALREADY_LOGGED;
+	int rc;
+
+	if (rtas_log_partition.index == -1)
+		return -1;
+
+	tmp_index = rtas_log_partition.index;
+	
+	rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
+		return rc;
+	}
+	last_unread_rtas_event = 0;
+
+	return 0;
+}
+
+/* pseries_nvram_init_os_partition
+ *
+ * This sets up a partition with an "OS" signature.
+ *
+ * The general strategy is the following:
+ * 1.) If a partition with the indicated name already exists...
+ *	- If it's large enough, use it.
+ *	- Otherwise, recycle it and keep going.
+ * 2.) Search for a free partition that is large enough.
+ * 3.) If there's not a free partition large enough, recycle any obsolete
+ * OS partitions and try again.
+ * 4.) Will first try getting a chunk that will satisfy the requested size.
+ * 5.) If a chunk of the requested size cannot be allocated, then try finding
+ * a chunk that will satisfy the minum needed.
+ *
+ * Returns 0 on success, else -1.
+ */
+static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
+									*part)
+{
+	loff_t p;
+	int size;
+
+	/* Look for ours */
+	p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
+
+	/* Found one but too small, remove it */
+	if (p && size < part->min_size) {
+		pr_info("nvram: Found too small %s partition,"
+					" removing it...\n", part->name);
+		nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
+		p = 0;
+	}
+
+	/* Create one if we didn't find */
+	if (!p) {
+		p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
+		if (p == -ENOSPC) {
+			pr_info("nvram: No room to create %s partition, "
+				"deleting any obsolete OS partitions...\n",
+				part->name);
+			nvram_remove_partition(NULL, NVRAM_SIG_OS,
+						pseries_nvram_os_partitions);
+			p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
+		}
+	}
+
+	if (p <= 0) {
+		pr_err("nvram: Failed to find or create %s"
+		       " partition, err %d\n", part->name, (int)p);
+		return -1;
+	}
+
+	part->index = p;
+	part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
+	
+	return 0;
+}
+
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process?  Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+	return (oops_log_partition.index == rtas_log_partition.index
+		&& last_unread_rtas_event
+		&& get_seconds() - last_unread_rtas_event <=
+						NVRAM_RTAS_READ_TIMEOUT);
+}
+
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+							size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+						MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+								oops_data_sz);
+	if (zipped_len < 0) {
+		pr_err("nvram: compression failed; returned %d\n", zipped_len);
+		pr_err("nvram: logging uncompressed oops/panic report\n");
+		return -1;
+	}
+	oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+	oops_hdr->report_length = cpu_to_be16(zipped_len);
+	oops_hdr->timestamp = cpu_to_be64(get_seconds());
+	return 0;
+}
+
+#ifdef CONFIG_PSTORE
+static int nvram_pstore_open(struct pstore_info *psi)
+{
+	/* Reset the iterator to start reading partitions again */
+	read_type = -1;
+	return 0;
+}
+
+/**
+ * nvram_pstore_write - pstore write callback for nvram
+ * @type:               Type of message logged
+ * @reason:             reason behind dump (oops/panic)
+ * @id:                 identifier to indicate the write performed
+ * @part:               pstore writes data to registered buffer in parts,
+ *                      part number will indicate the same.
+ * @count:              Indicates oops count
+ * @compressed:         Flag to indicate the log is compressed
+ * @size:               number of bytes written to the registered buffer
+ * @psi:                registered pstore_info structure
+ *
+ * Called by pstore_dump() when an oops or panic report is logged in the
+ * printk buffer.
+ * Returns 0 on successful write.
+ */
+static int nvram_pstore_write(enum pstore_type_id type,
+				enum kmsg_dump_reason reason,
+				u64 *id, unsigned int part, int count,
+				bool compressed, size_t size,
+				struct pstore_info *psi)
+{
+	int rc;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC;
+	struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
+
+	/* part 1 has the recent messages from printk buffer */
+	if (part > 1 || type != PSTORE_TYPE_DMESG ||
+				clobbering_unread_rtas_event())
+		return -1;
+
+	oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+	oops_hdr->report_length = cpu_to_be16(size);
+	oops_hdr->timestamp = cpu_to_be64(get_seconds());
+
+	if (compressed)
+		err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+
+	rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) (sizeof(*oops_hdr) + size), err_type, count);
+
+	if (rc != 0)
+		return rc;
+
+	*id = part;
+	return 0;
+}
+
+/*
+ * Reads the oops/panic report, rtas, of-config and common partition.
+ * Returns the length of the data we read from each partition.
+ * Returns 0 if we've been called before.
+ */
+static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
+				int *count, struct timespec *time, char **buf,
+				bool *compressed, struct pstore_info *psi)
+{
+	struct oops_log_info *oops_hdr;
+	unsigned int err_type, id_no, size = 0;
+	struct nvram_os_partition *part = NULL;
+	char *buff = NULL;
+	int sig = 0;
+	loff_t p;
+
+	read_type++;
+
+	switch (nvram_type_ids[read_type]) {
+	case PSTORE_TYPE_DMESG:
+		part = &oops_log_partition;
+		*type = PSTORE_TYPE_DMESG;
+		break;
+	case PSTORE_TYPE_PPC_RTAS:
+		part = &rtas_log_partition;
+		*type = PSTORE_TYPE_PPC_RTAS;
+		time->tv_sec = last_rtas_event;
+		time->tv_nsec = 0;
+		break;
+	case PSTORE_TYPE_PPC_OF:
+		sig = NVRAM_SIG_OF;
+		part = &of_config_partition;
+		*type = PSTORE_TYPE_PPC_OF;
+		*id = PSTORE_TYPE_PPC_OF;
+		time->tv_sec = 0;
+		time->tv_nsec = 0;
+		break;
+	case PSTORE_TYPE_PPC_COMMON:
+		sig = NVRAM_SIG_SYS;
+		part = &common_partition;
+		*type = PSTORE_TYPE_PPC_COMMON;
+		*id = PSTORE_TYPE_PPC_COMMON;
+		time->tv_sec = 0;
+		time->tv_nsec = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	if (!part->os_partition) {
+		p = nvram_find_partition(part->name, sig, &size);
+		if (p <= 0) {
+			pr_err("nvram: Failed to find partition %s, "
+				"err %d\n", part->name, (int)p);
+			return 0;
+		}
+		part->index = p;
+		part->size = size;
+	}
+
+	buff = kmalloc(part->size, GFP_KERNEL);
+
+	if (!buff)
+		return -ENOMEM;
+
+	if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
+		kfree(buff);
+		return 0;
+	}
+
+	*count = 0;
+
+	if (part->os_partition)
+		*id = id_no;
+
+	if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
+		size_t length, hdr_size;
+
+		oops_hdr = (struct oops_log_info *)buff;
+		if (be16_to_cpu(oops_hdr->version) < OOPS_HDR_VERSION) {
+			/* Old format oops header had 2-byte record size */
+			hdr_size = sizeof(u16);
+			length = be16_to_cpu(oops_hdr->version);
+			time->tv_sec = 0;
+			time->tv_nsec = 0;
+		} else {
+			hdr_size = sizeof(*oops_hdr);
+			length = be16_to_cpu(oops_hdr->report_length);
+			time->tv_sec = be64_to_cpu(oops_hdr->timestamp);
+			time->tv_nsec = 0;
+		}
+		*buf = kmalloc(length, GFP_KERNEL);
+		if (*buf == NULL)
+			return -ENOMEM;
+		memcpy(*buf, buff + hdr_size, length);
+		kfree(buff);
+
+		if (err_type == ERR_TYPE_KERNEL_PANIC_GZ)
+			*compressed = true;
+		else
+			*compressed = false;
+		return length;
+	}
+
+	*buf = buff;
+	return part->size;
+}
+
+static struct pstore_info nvram_pstore_info = {
+	.owner = THIS_MODULE,
+	.name = "nvram",
+	.open = nvram_pstore_open,
+	.read = nvram_pstore_read,
+	.write = nvram_pstore_write,
+};
+
+static int nvram_pstore_init(void)
+{
+	int rc = 0;
+
+	nvram_pstore_info.buf = oops_data;
+	nvram_pstore_info.bufsize = oops_data_sz;
+
+	rc = pstore_register(&nvram_pstore_info);
+	if (rc != 0)
+		pr_err("nvram: pstore_register() failed, defaults to "
+				"kmsg_dump; returned %d\n", rc);
+
+	return rc;
+}
+#else
+static int nvram_pstore_init(void)
+{
+	return -1;
+}
+#endif
+
+static void __init nvram_init_oops_partition(int rtas_partition_exists)
+{
+	int rc;
+
+	rc = pseries_nvram_init_os_partition(&oops_log_partition);
+	if (rc != 0) {
+		if (!rtas_partition_exists)
+			return;
+		pr_notice("nvram: Using %s partition to log both"
+			" RTAS errors and oops/panic reports\n",
+			rtas_log_partition.name);
+		memcpy(&oops_log_partition, &rtas_log_partition,
+						sizeof(rtas_log_partition));
+	}
+	oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+	if (!oops_buf) {
+		pr_err("nvram: No memory for %s partition\n",
+						oops_log_partition.name);
+		return;
+	}
+	oops_data = oops_buf + sizeof(struct oops_log_info);
+	oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
+
+	rc = nvram_pstore_init();
+
+	if (!rc)
+		return;
+
+	/*
+	 * Figure compression (preceded by elimination of each line's <n>
+	 * severity prefix) will reduce the oops/panic report to at most
+	 * 45% of its original size.
+	 */
+	big_oops_buf_sz = (oops_data_sz * 100) / 45;
+	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+	if (big_oops_buf) {
+		stream.workspace =  kmalloc(zlib_deflate_workspacesize(
+					WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
+		if (!stream.workspace) {
+			pr_err("nvram: No memory for compression workspace; "
+				"skipping compression of %s partition data\n",
+				oops_log_partition.name);
+			kfree(big_oops_buf);
+			big_oops_buf = NULL;
+		}
+	} else {
+		pr_err("No memory for uncompressed %s data; "
+			"skipping compression\n", oops_log_partition.name);
+		stream.workspace = NULL;
+	}
+
+	rc = kmsg_dump_register(&nvram_kmsg_dumper);
+	if (rc != 0) {
+		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
+		kfree(oops_buf);
+		kfree(big_oops_buf);
+		kfree(stream.workspace);
+	}
+}
+
+static int __init pseries_nvram_init_log_partitions(void)
+{
+	int rc;
+
+	/* Scan nvram for partitions */
+	nvram_scan_partitions();
+
+	rc = pseries_nvram_init_os_partition(&rtas_log_partition);
+	nvram_init_oops_partition(rc == 0);
+	return 0;
+}
+machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
+
 int __init pSeries_nvram_init(void)
 {
 	struct device_node *nvram;
-	const unsigned int *nbytes_p;
+	const __be32 *nbytes_p;
 	unsigned int proplen;
 
 	nvram = of_find_node_by_type(NULL, "nvram");
@@ -136,7 +816,7 @@ int __init pSeries_nvram_init(void)
 		return -EIO;
 	}
 
-	nvram_size = *nbytes_p;
+	nvram_size = be32_to_cpup(nbytes_p);
 
 	nvram_fetch = rtas_token("nvram-fetch");
 	nvram_store = rtas_token("nvram-store");
@@ -149,3 +829,73 @@ int __init pSeries_nvram_init(void)
 
 	return 0;
 }
+
+
+/*
+ * This is our kmsg_dump callback, called after an oops or panic report
+ * has been written to the printk buffer.  We want to capture as much
+ * of the printk buffer as possible.  First, capture as much as we can
+ * that we think will compress sufficiently to fit in the lnx,oops-log
+ * partition.  If that's too much, go back and capture uncompressed text.
+ */
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+			  enum kmsg_dump_reason reason)
+{
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+	static unsigned int oops_count = 0;
+	static bool panicking = false;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long flags;
+	size_t text_len;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+	int rc = -1;
+
+	switch (reason) {
+	case KMSG_DUMP_RESTART:
+	case KMSG_DUMP_HALT:
+	case KMSG_DUMP_POWEROFF:
+		/* These are almost always orderly shutdowns. */
+		return;
+	case KMSG_DUMP_OOPS:
+		break;
+	case KMSG_DUMP_PANIC:
+		panicking = true;
+		break;
+	case KMSG_DUMP_EMERG:
+		if (panicking)
+			/* Panic report already captured. */
+			return;
+		break;
+	default:
+		pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n",
+		       __func__, (int) reason);
+		return;
+	}
+
+	if (clobbering_unread_rtas_event())
+		return;
+
+	if (!spin_trylock_irqsave(&lock, flags))
+		return;
+
+	if (big_oops_buf) {
+		kmsg_dump_get_buffer(dumper, false,
+				     big_oops_buf, big_oops_buf_sz, &text_len);
+		rc = zip_oops(text_len);
+	}
+	if (rc != 0) {
+		kmsg_dump_rewind(dumper);
+		kmsg_dump_get_buffer(dumper, false,
+				     oops_data, oops_data_sz, &text_len);
+		err_type = ERR_TYPE_KERNEL_PANIC;
+		oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+		oops_hdr->report_length = cpu_to_be16(text_len);
+		oops_hdr->timestamp = cpu_to_be64(get_seconds());
+	}
+
+	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) (sizeof(*oops_hdr) + text_len), err_type,
+		++oops_count);
+
+	spin_unlock_irqrestore(&lock, flags);
+}
diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h
new file mode 100644
index 00000000000..08672d9136a
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/offline_states.h
@@ -0,0 +1,37 @@
+#ifndef _OFFLINE_STATES_H_
+#define _OFFLINE_STATES_H_
+
+/* Cpu offline states go here */
+enum cpu_state_vals {
+	CPU_STATE_OFFLINE,
+	CPU_STATE_INACTIVE,
+	CPU_STATE_ONLINE,
+	CPU_MAX_OFFLINE_STATES
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern enum cpu_state_vals get_cpu_current_state(int cpu);
+extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
+extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
+extern void set_default_offline_state(int cpu);
+#else
+static inline enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+	return CPU_STATE_ONLINE;
+}
+
+static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_default_offline_state(int cpu)
+{
+}
+#endif
+
+extern enum cpu_state_vals get_preferred_offline_state(int cpu);
+#endif
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 2c6ded29f73..c413ec158ff 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -40,7 +40,8 @@ void pcibios_name_device(struct pci_dev *dev)
 	 */
 	dn = pci_device_to_OF_node(dev);
 	if (dn) {
-		const char *loc_code = of_get_property(dn, "ibm,loc-code", 0);
+		const char *loc_code = of_get_property(dn, "ibm,loc-code",
+				NULL);
 		if (loc_code) {
 			int loc_len = strlen(loc_code);
 			if (loc_len < sizeof(dev->dev.name)) {
@@ -73,7 +74,7 @@ void __init pSeries_final_fixup(void)
 {
 	pSeries_request_regions();
 
-	pci_addr_cache_build();
+	eeh_addr_cache_build();
 }
 
 /*
@@ -107,3 +108,64 @@ static void fixup_winbond_82c105(struct pci_dev* dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
 			 fixup_winbond_82c105);
+
+int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+	struct device_node *dn, *pdn;
+	struct pci_bus *bus;
+	u32 pcie_link_speed_stats[2];
+	int rc;
+
+	bus = bridge->bus;
+
+	dn = pcibios_get_phb_of_node(bus);
+	if (!dn)
+		return 0;
+
+	for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) {
+		rc = of_property_read_u32_array(pdn,
+				"ibm,pcie-link-speed-stats",
+				&pcie_link_speed_stats[0], 2);
+		if (!rc)
+			break;
+	}
+
+	of_node_put(pdn);
+
+	if (rc) {
+		pr_err("no ibm,pcie-link-speed-stats property\n");
+		return 0;
+	}
+
+	switch (pcie_link_speed_stats[0]) {
+	case 0x01:
+		bus->max_bus_speed = PCIE_SPEED_2_5GT;
+		break;
+	case 0x02:
+		bus->max_bus_speed = PCIE_SPEED_5_0GT;
+		break;
+	case 0x04:
+		bus->max_bus_speed = PCIE_SPEED_8_0GT;
+		break;
+	default:
+		bus->max_bus_speed = PCI_SPEED_UNKNOWN;
+		break;
+	}
+
+	switch (pcie_link_speed_stats[1]) {
+	case 0x01:
+		bus->cur_bus_speed = PCIE_SPEED_2_5GT;
+		break;
+	case 0x02:
+		bus->cur_bus_speed = PCIE_SPEED_5_0GT;
+		break;
+	case 0x04:
+		bus->cur_bus_speed = PCIE_SPEED_8_0GT;
+		break;
+	default:
+		bus->cur_bus_speed = PCI_SPEED_UNKNOWN;
+		break;
+	}
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index ad152a0e394..203cbf0dc10 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -25,9 +25,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#undef DEBUG
-
 #include <linux/pci.h>
+#include <linux/export.h>
 #include <asm/pci-bridge.h>
 #include <asm/ppc-pci.h>
 #include <asm/firmware.h>
@@ -38,15 +37,15 @@ find_bus_among_children(struct pci_bus *bus,
                         struct device_node *dn)
 {
 	struct pci_bus *child = NULL;
-	struct list_head *tmp;
+	struct pci_bus *tmp;
 	struct device_node *busdn;
 
 	busdn = pci_bus_to_OF_node(bus);
 	if (busdn == dn)
 		return bus;
 
-	list_for_each(tmp, &bus->children) {
-		child = find_bus_among_children(pci_bus_b(tmp), dn);
+	list_for_each_entry(tmp, &bus->children, node) {
+		child = find_bus_among_children(tmp, dn);
 		if (child)
 			break;
 	};
@@ -65,76 +64,7 @@ pcibios_find_pci_bus(struct device_node *dn)
 }
 EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
 
-/**
- * pcibios_remove_pci_devices - remove all devices under this bus
- *
- * Remove all of the PCI devices under this bus both from the
- * linux pci device tree, and from the powerpc EEH address cache.
- */
-void pcibios_remove_pci_devices(struct pci_bus *bus)
-{
- 	struct pci_dev *dev, *tmp;
-	struct pci_bus *child_bus;
-
-	/* First go down child busses */
-	list_for_each_entry(child_bus, &bus->children, node)
-		pcibios_remove_pci_devices(child_bus);
-
-	pr_debug("PCI: Removing devices on bus %04x:%02x\n",
-		 pci_domain_nr(bus),  bus->number);
-	list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
-		pr_debug("     * Removing %s...\n", pci_name(dev));
-		eeh_remove_bus_device(dev);
- 		pci_remove_bus_device(dev);
- 	}
-}
-EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
-
-/**
- * pcibios_add_pci_devices - adds new pci devices to bus
- *
- * This routine will find and fixup new pci devices under
- * the indicated bus. This routine presumes that there
- * might already be some devices under this bridge, so
- * it carefully tries to add only new devices.  (And that
- * is how this routine differs from other, similar pcibios
- * routines.)
- */
-void pcibios_add_pci_devices(struct pci_bus * bus)
-{
-	int slotno, num, mode, pass, max;
-	struct pci_dev *dev;
-	struct device_node *dn = pci_bus_to_OF_node(bus);
-
-	eeh_add_device_tree_early(dn);
-
-	mode = PCI_PROBE_NORMAL;
-	if (ppc_md.pci_probe_mode)
-		mode = ppc_md.pci_probe_mode(bus);
-
-	if (mode == PCI_PROBE_DEVTREE) {
-		/* use ofdt-based probe */
-		of_rescan_bus(dn, bus);
-	} else if (mode == PCI_PROBE_NORMAL) {
-		/* use legacy probe */
-		slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
-		num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
-		if (!num)
-			return;
-		pcibios_setup_bus_devices(bus);
-		max = bus->secondary;
-		for (pass=0; pass < 2; pass++)
-			list_for_each_entry(dev, &bus->devices, bus_list) {
-			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-			    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
-				max = pci_scan_bridge(bus, dev, max, pass);
-		}
-	}
-	pcibios_finish_adding_to_bus(bus);
-}
-EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
-
-struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
+struct pci_controller *init_phb_dynamic(struct device_node *dn)
 {
 	struct pci_controller *phb;
 
@@ -148,10 +78,13 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
 
 	pci_devs_phb_init_dynamic(phb);
 
+	/* Create EEH devices for the PHB */
+	eeh_dev_phb_init_dynamic(phb);
+
 	if (dn->child)
 		eeh_add_device_tree_early(dn);
 
-	scan_phb(phb);
+	pcibios_scan_phb(phb);
 	pcibios_finish_adding_to_bus(phb->bus);
 
 	return phb;
@@ -165,7 +98,7 @@ int remove_phb_dynamic(struct pci_controller *phb)
 	struct resource *res;
 	int rc, i;
 
-	pr_debug("PCI: Removing PHB %04x:%02x... \n",
+	pr_debug("PCI: Removing PHB %04x:%02x...\n",
 		 pci_domain_nr(b), b->number);
 
 	/* We cannot to remove a root bus that has children */
diff --git a/arch/powerpc/platforms/pseries/phyp_dump.c b/arch/powerpc/platforms/pseries/phyp_dump.c
deleted file mode 100644
index 15eb6107bcd..00000000000
--- a/arch/powerpc/platforms/pseries/phyp_dump.c
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
- * Hypervisor-assisted dump
- *
- * Linas Vepstas, Manish Ahuja 2008
- * Copyright 2008 IBM Corp.
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/init.h>
-#include <linux/kobject.h>
-#include <linux/mm.h>
-#include <linux/of.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-#include <linux/sysfs.h>
-
-#include <asm/page.h>
-#include <asm/phyp_dump.h>
-#include <asm/machdep.h>
-#include <asm/prom.h>
-#include <asm/rtas.h>
-
-/* Variables, used to communicate data between early boot and late boot */
-static struct phyp_dump phyp_dump_vars;
-struct phyp_dump *phyp_dump_info = &phyp_dump_vars;
-
-static int ibm_configure_kernel_dump;
-/* ------------------------------------------------- */
-/* RTAS interfaces to declare the dump regions */
-
-struct dump_section {
-	u32 dump_flags;
-	u16 source_type;
-	u16 error_flags;
-	u64 source_address;
-	u64 source_length;
-	u64 length_copied;
-	u64 destination_address;
-};
-
-struct phyp_dump_header {
-	u32 version;
-	u16 num_of_sections;
-	u16 status;
-
-	u32 first_offset_section;
-	u32 dump_disk_section;
-	u64 block_num_dd;
-	u64 num_of_blocks_dd;
-	u32 offset_dd;
-	u32 maxtime_to_auto;
-	/* No dump disk path string used */
-
-	struct dump_section cpu_data;
-	struct dump_section hpte_data;
-	struct dump_section kernel_data;
-};
-
-/* The dump header *must be* in low memory, so .bss it */
-static struct phyp_dump_header phdr;
-
-#define NUM_DUMP_SECTIONS	3
-#define DUMP_HEADER_VERSION	0x1
-#define DUMP_REQUEST_FLAG	0x1
-#define DUMP_SOURCE_CPU		0x0001
-#define DUMP_SOURCE_HPTE	0x0002
-#define DUMP_SOURCE_RMO		0x0011
-#define DUMP_ERROR_FLAG		0x2000
-#define DUMP_TRIGGERED		0x4000
-#define DUMP_PERFORMED		0x8000
-
-
-/**
- * init_dump_header() - initialize the header declaring a dump
- * Returns: length of dump save area.
- *
- * When the hypervisor saves crashed state, it needs to put
- * it somewhere. The dump header tells the hypervisor where
- * the data can be saved.
- */
-static unsigned long init_dump_header(struct phyp_dump_header *ph)
-{
-	unsigned long addr_offset = 0;
-
-	/* Set up the dump header */
-	ph->version = DUMP_HEADER_VERSION;
-	ph->num_of_sections = NUM_DUMP_SECTIONS;
-	ph->status = 0;
-
-	ph->first_offset_section =
-		(u32)offsetof(struct phyp_dump_header, cpu_data);
-	ph->dump_disk_section = 0;
-	ph->block_num_dd = 0;
-	ph->num_of_blocks_dd = 0;
-	ph->offset_dd = 0;
-
-	ph->maxtime_to_auto = 0; /* disabled */
-
-	/* The first two sections are mandatory */
-	ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG;
-	ph->cpu_data.source_type = DUMP_SOURCE_CPU;
-	ph->cpu_data.source_address = 0;
-	ph->cpu_data.source_length = phyp_dump_info->cpu_state_size;
-	ph->cpu_data.destination_address = addr_offset;
-	addr_offset += phyp_dump_info->cpu_state_size;
-
-	ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG;
-	ph->hpte_data.source_type = DUMP_SOURCE_HPTE;
-	ph->hpte_data.source_address = 0;
-	ph->hpte_data.source_length = phyp_dump_info->hpte_region_size;
-	ph->hpte_data.destination_address = addr_offset;
-	addr_offset += phyp_dump_info->hpte_region_size;
-
-	/* This section describes the low kernel region */
-	ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG;
-	ph->kernel_data.source_type = DUMP_SOURCE_RMO;
-	ph->kernel_data.source_address = PHYP_DUMP_RMR_START;
-	ph->kernel_data.source_length = PHYP_DUMP_RMR_END;
-	ph->kernel_data.destination_address = addr_offset;
-	addr_offset += ph->kernel_data.source_length;
-
-	return addr_offset;
-}
-
-static void print_dump_header(const struct phyp_dump_header *ph)
-{
-#ifdef DEBUG
-	if (ph == NULL)
-		return;
-
-	printk(KERN_INFO "dump header:\n");
-	/* setup some ph->sections required */
-	printk(KERN_INFO "version = %d\n", ph->version);
-	printk(KERN_INFO "Sections = %d\n", ph->num_of_sections);
-	printk(KERN_INFO "Status = 0x%x\n", ph->status);
-
-	/* No ph->disk, so all should be set to 0 */
-	printk(KERN_INFO "Offset to first section 0x%x\n",
-		ph->first_offset_section);
-	printk(KERN_INFO "dump disk sections should be zero\n");
-	printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section);
-	printk(KERN_INFO "block num = %lld\n", ph->block_num_dd);
-	printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd);
-	printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd);
-	printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto);
-
-	/*set cpu state and hpte states as well scratch pad area */
-	printk(KERN_INFO " CPU AREA \n");
-	printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags);
-	printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type);
-	printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags);
-	printk(KERN_INFO "cpu source_address =%llx\n",
-		ph->cpu_data.source_address);
-	printk(KERN_INFO "cpu source_length =%llx\n",
-		ph->cpu_data.source_length);
-	printk(KERN_INFO "cpu length_copied =%llx\n",
-		ph->cpu_data.length_copied);
-
-	printk(KERN_INFO " HPTE AREA \n");
-	printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags);
-	printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type);
-	printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags);
-	printk(KERN_INFO "HPTE source_address =%llx\n",
-		ph->hpte_data.source_address);
-	printk(KERN_INFO "HPTE source_length =%llx\n",
-		ph->hpte_data.source_length);
-	printk(KERN_INFO "HPTE length_copied =%llx\n",
-		ph->hpte_data.length_copied);
-
-	printk(KERN_INFO " SRSD AREA \n");
-	printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags);
-	printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type);
-	printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags);
-	printk(KERN_INFO "SRSD source_address =%llx\n",
-		ph->kernel_data.source_address);
-	printk(KERN_INFO "SRSD source_length =%llx\n",
-		ph->kernel_data.source_length);
-	printk(KERN_INFO "SRSD length_copied =%llx\n",
-		ph->kernel_data.length_copied);
-#endif
-}
-
-static ssize_t show_phyp_dump_active(struct kobject *kobj,
-			struct kobj_attribute *attr, char *buf)
-{
-
-	/* create filesystem entry so kdump is phyp-dump aware */
-	return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot);
-}
-
-static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600,
-					show_phyp_dump_active,
-					NULL);
-
-static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr)
-{
-	int rc;
-
-	/* Add addr value if not initialized before */
-	if (ph->cpu_data.destination_address == 0) {
-		ph->cpu_data.destination_address += addr;
-		ph->hpte_data.destination_address += addr;
-		ph->kernel_data.destination_address += addr;
-	}
-
-	/* ToDo Invalidate kdump and free memory range. */
-
-	do {
-		rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
-				1, ph, sizeof(struct phyp_dump_header));
-	} while (rtas_busy_delay(rc));
-
-	if (rc) {
-		printk(KERN_ERR "phyp-dump: unexpected error (%d) on "
-						"register\n", rc);
-		print_dump_header(ph);
-		return;
-	}
-
-	rc = sysfs_create_file(kernel_kobj, &pdl.attr);
-	if (rc)
-		printk(KERN_ERR "phyp-dump: unable to create sysfs"
-				" file (%d)\n", rc);
-}
-
-static
-void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr)
-{
-	int rc;
-
-	/* Add addr value if not initialized before */
-	if (ph->cpu_data.destination_address == 0) {
-		ph->cpu_data.destination_address += addr;
-		ph->hpte_data.destination_address += addr;
-		ph->kernel_data.destination_address += addr;
-	}
-
-	do {
-		rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
-				2, ph, sizeof(struct phyp_dump_header));
-	} while (rtas_busy_delay(rc));
-
-	if (rc) {
-		printk(KERN_ERR "phyp-dump: unexpected error (%d) "
-						"on invalidate\n", rc);
-		print_dump_header(ph);
-	}
-}
-
-/* ------------------------------------------------- */
-/**
- * release_memory_range -- release memory previously lmb_reserved
- * @start_pfn: starting physical frame number
- * @nr_pages: number of pages to free.
- *
- * This routine will release memory that had been previously
- * lmb_reserved in early boot. The released memory becomes
- * available for genreal use.
- */
-static void release_memory_range(unsigned long start_pfn,
-			unsigned long nr_pages)
-{
-	struct page *rpage;
-	unsigned long end_pfn;
-	long i;
-
-	end_pfn = start_pfn + nr_pages;
-
-	for (i = start_pfn; i <= end_pfn; i++) {
-		rpage = pfn_to_page(i);
-		if (PageReserved(rpage)) {
-			ClearPageReserved(rpage);
-			init_page_count(rpage);
-			__free_page(rpage);
-			totalram_pages++;
-		}
-	}
-}
-
-/**
- * track_freed_range -- Counts the range being freed.
- * Once the counter goes to zero, it re-registers dump for
- * future use.
- */
-static void
-track_freed_range(unsigned long addr, unsigned long length)
-{
-	static unsigned long scratch_area_size, reserved_area_size;
-
-	if (addr < phyp_dump_info->init_reserve_start)
-		return;
-
-	if ((addr >= phyp_dump_info->init_reserve_start) &&
-	    (addr <= phyp_dump_info->init_reserve_start +
-	     phyp_dump_info->init_reserve_size))
-		reserved_area_size += length;
-
-	if ((addr >= phyp_dump_info->reserved_scratch_addr) &&
-	    (addr <= phyp_dump_info->reserved_scratch_addr +
-	     phyp_dump_info->reserved_scratch_size))
-		scratch_area_size += length;
-
-	if ((reserved_area_size == phyp_dump_info->init_reserve_size) &&
-	    (scratch_area_size == phyp_dump_info->reserved_scratch_size)) {
-
-		invalidate_last_dump(&phdr,
-				phyp_dump_info->reserved_scratch_addr);
-		register_dump_area(&phdr,
-				phyp_dump_info->reserved_scratch_addr);
-	}
-}
-
-/* ------------------------------------------------- */
-/**
- * sysfs_release_region -- sysfs interface to release memory range.
- *
- * Usage:
- *   "echo <start addr> <length> > /sys/kernel/release_region"
- *
- * Example:
- *   "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
- *
- * will release 256MB starting at 1GB.
- */
-static ssize_t store_release_region(struct kobject *kobj,
-				struct kobj_attribute *attr,
-				const char *buf, size_t count)
-{
-	unsigned long start_addr, length, end_addr;
-	unsigned long start_pfn, nr_pages;
-	ssize_t ret;
-
-	ret = sscanf(buf, "%lx %lx", &start_addr, &length);
-	if (ret != 2)
-		return -EINVAL;
-
-	track_freed_range(start_addr, length);
-
-	/* Range-check - don't free any reserved memory that
-	 * wasn't reserved for phyp-dump */
-	if (start_addr < phyp_dump_info->init_reserve_start)
-		start_addr = phyp_dump_info->init_reserve_start;
-
-	end_addr = phyp_dump_info->init_reserve_start +
-			phyp_dump_info->init_reserve_size;
-	if (start_addr+length > end_addr)
-		length = end_addr - start_addr;
-
-	/* Release the region of memory assed in by user */
-	start_pfn = PFN_DOWN(start_addr);
-	nr_pages = PFN_DOWN(length);
-	release_memory_range(start_pfn, nr_pages);
-
-	return count;
-}
-
-static ssize_t show_release_region(struct kobject *kobj,
-			struct kobj_attribute *attr, char *buf)
-{
-	u64 second_addr_range;
-
-	/* total reserved size - start of scratch area */
-	second_addr_range = phyp_dump_info->init_reserve_size -
-				phyp_dump_info->reserved_scratch_size;
-	return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:"
-			    " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n",
-		phdr.cpu_data.destination_address,
-		phdr.cpu_data.length_copied,
-		phdr.hpte_data.destination_address,
-		phdr.hpte_data.length_copied,
-		phdr.kernel_data.destination_address,
-		phdr.kernel_data.length_copied,
-		phyp_dump_info->init_reserve_start,
-		second_addr_range);
-}
-
-static struct kobj_attribute rr = __ATTR(release_region, 0600,
-					show_release_region,
-					store_release_region);
-
-static int __init phyp_dump_setup(void)
-{
-	struct device_node *rtas;
-	const struct phyp_dump_header *dump_header = NULL;
-	unsigned long dump_area_start;
-	unsigned long dump_area_length;
-	int header_len = 0;
-	int rc;
-
-	/* If no memory was reserved in early boot, there is nothing to do */
-	if (phyp_dump_info->init_reserve_size == 0)
-		return 0;
-
-	/* Return if phyp dump not supported */
-	if (!phyp_dump_info->phyp_dump_configured)
-		return -ENOSYS;
-
-	/* Is there dump data waiting for us? If there isn't,
-	 * then register a new dump area, and release all of
-	 * the rest of the reserved ram.
-	 *
-	 * The /rtas/ibm,kernel-dump rtas node is present only
-	 * if there is dump data waiting for us.
-	 */
-	rtas = of_find_node_by_path("/rtas");
-	if (rtas) {
-		dump_header = of_get_property(rtas, "ibm,kernel-dump",
-						&header_len);
-		of_node_put(rtas);
-	}
-
-	ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump");
-
-	print_dump_header(dump_header);
-	dump_area_length = init_dump_header(&phdr);
-	/* align down */
-	dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK;
-
-	if (dump_header == NULL) {
-		register_dump_area(&phdr, dump_area_start);
-		return 0;
-	}
-
-	/* re-register the dump area, if old dump was invalid */
-	if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) {
-		invalidate_last_dump(&phdr, dump_area_start);
-		register_dump_area(&phdr, dump_area_start);
-		return 0;
-	}
-
-	if (dump_header) {
-		phyp_dump_info->reserved_scratch_addr =
-				dump_header->cpu_data.destination_address;
-		phyp_dump_info->reserved_scratch_size =
-				dump_header->cpu_data.source_length +
-				dump_header->hpte_data.source_length +
-				dump_header->kernel_data.source_length;
-	}
-
-	/* Should we create a dump_subsys, analogous to s390/ipl.c ? */
-	rc = sysfs_create_file(kernel_kobj, &rr.attr);
-	if (rc)
-		printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n",
-									rc);
-
-	/* ToDo: re-register the dump area, for next time. */
-	return 0;
-}
-machine_subsys_initcall(pseries, phyp_dump_setup);
-
-int __init early_init_dt_scan_phyp_dump(unsigned long node,
-		const char *uname, int depth, void *data)
-{
-	const unsigned int *sizes;
-
-	phyp_dump_info->phyp_dump_configured = 0;
-	phyp_dump_info->phyp_dump_is_active = 0;
-
-	if (depth != 1 || strcmp(uname, "rtas") != 0)
-		return 0;
-
-	if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL))
-		phyp_dump_info->phyp_dump_configured++;
-
-	if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL))
-		phyp_dump_info->phyp_dump_is_active++;
-
-	sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
-				    NULL);
-	if (!sizes)
-		return 0;
-
-	if (sizes[0] == 1)
-		phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]);
-
-	if (sizes[3] == 2)
-		phyp_dump_info->hpte_region_size =
-						*((unsigned long *)&sizes[4]);
-	return 1;
-}
-
-/* Look for phyp_dump= cmdline option */
-static int __init early_phyp_dump_enabled(char *p)
-{
-	phyp_dump_info->phyp_dump_at_boot = 1;
-
-        if (!p)
-                return 0;
-
-        if (strncmp(p, "1", 1) == 0)
-		phyp_dump_info->phyp_dump_at_boot = 1;
-        else if (strncmp(p, "0", 1) == 0)
-		phyp_dump_info->phyp_dump_at_boot = 0;
-
-        return 0;
-}
-early_param("phyp_dump", early_phyp_dump_enabled);
-
-/* Look for phyp_dump_reserve_size= cmdline option */
-static int __init early_phyp_dump_reserve_size(char *p)
-{
-        if (p)
-		phyp_dump_info->reserve_bootvar = memparse(p, &p);
-
-        return 0;
-}
-early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size);
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
deleted file mode 100644
index a24a6b2333b..00000000000
--- a/arch/powerpc/platforms/pseries/plpar_wrappers.h
+++ /dev/null
@@ -1,252 +0,0 @@
-#ifndef _PSERIES_PLPAR_WRAPPERS_H
-#define _PSERIES_PLPAR_WRAPPERS_H
-
-#include <asm/hvcall.h>
-#include <asm/page.h>
-
-static inline long poll_pending(void)
-{
-	return plpar_hcall_norets(H_POLL_PENDING);
-}
-
-static inline long cede_processor(void)
-{
-	return plpar_hcall_norets(H_CEDE);
-}
-
-static inline long vpa_call(unsigned long flags, unsigned long cpu,
-		unsigned long vpa)
-{
-	/* flags are in bits 16-18 (counting from most significant bit) */
-	flags = flags << (63 - 18);
-
-	return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
-}
-
-static inline long unregister_vpa(unsigned long cpu, unsigned long vpa)
-{
-	return vpa_call(0x5, cpu, vpa);
-}
-
-static inline long register_vpa(unsigned long cpu, unsigned long vpa)
-{
-	return vpa_call(0x1, cpu, vpa);
-}
-
-static inline long unregister_slb_shadow(unsigned long cpu, unsigned long vpa)
-{
-	return vpa_call(0x7, cpu, vpa);
-}
-
-static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
-{
-	return vpa_call(0x3, cpu, vpa);
-}
-
-static inline long unregister_dtl(unsigned long cpu, unsigned long vpa)
-{
-	return vpa_call(0x6, cpu, vpa);
-}
-
-static inline long register_dtl(unsigned long cpu, unsigned long vpa)
-{
-	return vpa_call(0x2, cpu, vpa);
-}
-
-static inline long plpar_page_set_loaned(unsigned long vpa)
-{
-	unsigned long cmo_page_sz = cmo_get_page_size();
-	long rc = 0;
-	int i;
-
-	for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
-		rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
-
-	for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
-		plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
-				   vpa + i - cmo_page_sz, 0);
-
-	return rc;
-}
-
-static inline long plpar_page_set_active(unsigned long vpa)
-{
-	unsigned long cmo_page_sz = cmo_get_page_size();
-	long rc = 0;
-	int i;
-
-	for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
-		rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
-
-	for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
-		plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
-				   vpa + i - cmo_page_sz, 0);
-
-	return rc;
-}
-
-extern void vpa_init(int cpu);
-
-static inline long plpar_pte_enter(unsigned long flags,
-		unsigned long hpte_group, unsigned long hpte_v,
-		unsigned long hpte_r, unsigned long *slot)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r);
-
-	*slot = retbuf[0];
-
-	return rc;
-}
-
-static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex,
-		unsigned long avpn, unsigned long *old_pteh_ret,
-		unsigned long *old_ptel_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn);
-
-	*old_pteh_ret = retbuf[0];
-	*old_ptel_ret = retbuf[1];
-
-	return rc;
-}
-
-/* plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw */
-static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex,
-		unsigned long avpn, unsigned long *old_pteh_ret,
-		unsigned long *old_ptel_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn);
-
-	*old_pteh_ret = retbuf[0];
-	*old_ptel_ret = retbuf[1];
-
-	return rc;
-}
-
-static inline long plpar_pte_read(unsigned long flags, unsigned long ptex,
-		unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall(H_READ, retbuf, flags, ptex);
-
-	*old_pteh_ret = retbuf[0];
-	*old_ptel_ret = retbuf[1];
-
-	return rc;
-}
-
-/* plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw */
-static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex,
-		unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex);
-
-	*old_pteh_ret = retbuf[0];
-	*old_ptel_ret = retbuf[1];
-
-	return rc;
-}
-
-static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
-		unsigned long avpn)
-{
-	return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
-}
-
-static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
-		unsigned long *tce_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba);
-
-	*tce_ret = retbuf[0];
-
-	return rc;
-}
-
-static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba,
-		unsigned long tceval)
-{
-	return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
-}
-
-static inline long plpar_tce_put_indirect(unsigned long liobn,
-		unsigned long ioba, unsigned long page, unsigned long count)
-{
-	return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
-}
-
-static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba,
-		unsigned long tceval, unsigned long count)
-{
-	return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
-}
-
-static inline long plpar_get_term_char(unsigned long termno,
-		unsigned long *len_ret, char *buf_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-	unsigned long *lbuf = (unsigned long *)buf_ret;	/* TODO: alignment? */
-
-	rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno);
-
-	*len_ret = retbuf[0];
-	lbuf[0] = retbuf[1];
-	lbuf[1] = retbuf[2];
-
-	return rc;
-}
-
-static inline long plpar_put_term_char(unsigned long termno, unsigned long len,
-		const char *buffer)
-{
-	unsigned long *lbuf = (unsigned long *)buffer;	/* TODO: alignment? */
-	return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0],
-			lbuf[1]);
-}
-
-static inline long plpar_eoi(unsigned long xirr)
-{
-	return plpar_hcall_norets(H_EOI, xirr);
-}
-
-static inline long plpar_cppr(unsigned long cppr)
-{
-	return plpar_hcall_norets(H_CPPR, cppr);
-}
-
-static inline long plpar_ipi(unsigned long servernum, unsigned long mfrr)
-{
-	return plpar_hcall_norets(H_IPI, servernum, mfrr);
-}
-
-static inline long plpar_xirr(unsigned long *xirr_ret)
-{
-	long rc;
-	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
-	rc = plpar_hcall(H_XIRR, retbuf);
-
-	*xirr_ret = retbuf[0];
-
-	return rc;
-}
-
-#endif /* _PSERIES_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 9e17c0d2a0c..361add62abf 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -10,7 +10,19 @@
 #ifndef _PSERIES_PSERIES_H
 #define _PSERIES_PSERIES_H
 
-extern void __init fw_feature_init(const char *hypertas, unsigned long len);
+#include <linux/interrupt.h>
+
+struct device_node;
+
+extern void request_event_sources_irqs(struct device_node *np,
+				       irq_handler_t handler, const char *name);
+
+#include <linux/of.h>
+
+extern void __init fw_hypertas_feature_init(const char *hypertas,
+					    unsigned long len);
+extern void __init fw_vec5_feature_init(const char *hypertas,
+					unsigned long len);
 
 struct pt_regs;
 
@@ -38,6 +50,20 @@ extern void pSeries_final_fixup(void);
 /* Poweron flag used for enabling auto ups restart */
 extern unsigned long rtas_poweron_auto;
 
-extern void find_udbg_vterm(void);
+/* Provided by HVC VIO */
+extern void hvc_vio_init_early(void);
+
+/* Dynamic logical Partitioning/Mobility */
+extern void dlpar_free_cc_nodes(struct device_node *);
+extern void dlpar_free_cc_property(struct property *);
+extern struct device_node *dlpar_configure_connector(u32, struct device_node *);
+extern int dlpar_attach_node(struct device_node *);
+extern int dlpar_detach_node(struct device_node *);
+
+/* PCI root bridge prepare function override for pseries */
+struct pci_host_bridge;
+int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
+
+unsigned long pseries_memory_block_size(void);
 
 #endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
new file mode 100644
index 00000000000..92767791f93
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -0,0 +1,290 @@
+/*
+ * POWER platform energy management driver
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This pseries platform device driver provides access to
+ * platform energy management capabilities.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "pseries_energy"
+
+/* Driver flags */
+
+static int sysfs_entries;
+
+/* Helper routines */
+
+/* Helper Routines to convert between drc_index to cpu numbers */
+
+static u32 cpu_to_drc_index(int cpu)
+{
+	struct device_node *dn = NULL;
+	const int *indexes;
+	int i;
+	int rc = 1;
+	u32 ret = 0;
+
+	dn = of_find_node_by_path("/cpus");
+	if (dn == NULL)
+		goto err;
+	indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+	if (indexes == NULL)
+		goto err_of_node_put;
+	/* Convert logical cpu number to core number */
+	i = cpu_core_index_of_thread(cpu);
+	/*
+	 * The first element indexes[0] is the number of drc_indexes
+	 * returned in the list.  Hence i+1 will get the drc_index
+	 * corresponding to core number i.
+	 */
+	WARN_ON(i > indexes[0]);
+	ret = indexes[i + 1];
+	rc = 0;
+
+err_of_node_put:
+	of_node_put(dn);
+err:
+	if (rc)
+		printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu);
+	return ret;
+}
+
+static int drc_index_to_cpu(u32 drc_index)
+{
+	struct device_node *dn = NULL;
+	const int *indexes;
+	int i, cpu = 0;
+	int rc = 1;
+
+	dn = of_find_node_by_path("/cpus");
+	if (dn == NULL)
+		goto err;
+	indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+	if (indexes == NULL)
+		goto err_of_node_put;
+	/*
+	 * First element in the array is the number of drc_indexes
+	 * returned.  Search through the list to find the matching
+	 * drc_index and get the core number
+	 */
+	for (i = 0; i < indexes[0]; i++) {
+		if (indexes[i + 1] == drc_index)
+			break;
+	}
+	/* Convert core number to logical cpu number */
+	cpu = cpu_first_thread_of_core(i);
+	rc = 0;
+
+err_of_node_put:
+	of_node_put(dn);
+err:
+	if (rc)
+		printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index);
+	return cpu;
+}
+
+/*
+ * pseries hypervisor call H_BEST_ENERGY provides hints to OS on
+ * preferred logical cpus to activate or deactivate for optimized
+ * energy consumption.
+ */
+
+#define FLAGS_MODE1	0x004E200000080E01UL
+#define FLAGS_MODE2	0x004E200000080401UL
+#define FLAGS_ACTIVATE  0x100
+
+static ssize_t get_best_energy_list(char *page, int activate)
+{
+	int rc, cnt, i, cpu;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	unsigned long flags = 0;
+	u32 *buf_page;
+	char *s = page;
+
+	buf_page = (u32 *) get_zeroed_page(GFP_KERNEL);
+	if (!buf_page)
+		return -ENOMEM;
+
+	flags = FLAGS_MODE1;
+	if (activate)
+		flags |= FLAGS_ACTIVATE;
+
+	rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page),
+				0, 0, 0, 0, 0, 0);
+	if (rc != H_SUCCESS) {
+		free_page((unsigned long) buf_page);
+		return -EINVAL;
+	}
+
+	cnt = retbuf[0];
+	for (i = 0; i < cnt; i++) {
+		cpu = drc_index_to_cpu(buf_page[2*i+1]);
+		if ((cpu_online(cpu) && !activate) ||
+		    (!cpu_online(cpu) && activate))
+			s += sprintf(s, "%d,", cpu);
+	}
+	if (s > page) { /* Something to show */
+		s--; /* Suppress last comma */
+		s += sprintf(s, "\n");
+	}
+
+	free_page((unsigned long) buf_page);
+	return s-page;
+}
+
+static ssize_t get_best_energy_data(struct device *dev,
+					char *page, int activate)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	unsigned long flags = 0;
+
+	flags = FLAGS_MODE2;
+	if (activate)
+		flags |= FLAGS_ACTIVATE;
+
+	rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags,
+				cpu_to_drc_index(dev->id),
+				0, 0, 0, 0, 0, 0, 0);
+
+	if (rc != H_SUCCESS)
+		return -EINVAL;
+
+	return sprintf(page, "%lu\n", retbuf[1] >> 32);
+}
+
+/* Wrapper functions */
+
+static ssize_t cpu_activate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_list(page, 1);
+}
+
+static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_list(page, 0);
+}
+
+static ssize_t percpu_activate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_data(dev, page, 1);
+}
+
+static ssize_t percpu_deactivate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return get_best_energy_data(dev, page, 0);
+}
+
+/*
+ * Create sysfs interface:
+ * /sys/devices/system/cpu/pseries_activate_hint_list
+ * /sys/devices/system/cpu/pseries_deactivate_hint_list
+ *	Comma separated list of cpus to activate or deactivate
+ * /sys/devices/system/cpu/cpuN/pseries_activate_hint
+ * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint
+ *	Per-cpu value of the hint
+ */
+
+struct device_attribute attr_cpu_activate_hint_list =
+		__ATTR(pseries_activate_hint_list, 0444,
+		cpu_activate_hint_list_show, NULL);
+
+struct device_attribute attr_cpu_deactivate_hint_list =
+		__ATTR(pseries_deactivate_hint_list, 0444,
+		cpu_deactivate_hint_list_show, NULL);
+
+struct device_attribute attr_percpu_activate_hint =
+		__ATTR(pseries_activate_hint, 0444,
+		percpu_activate_hint_show, NULL);
+
+struct device_attribute attr_percpu_deactivate_hint =
+		__ATTR(pseries_deactivate_hint, 0444,
+		percpu_deactivate_hint_show, NULL);
+
+static int __init pseries_energy_init(void)
+{
+	int cpu, err;
+	struct device *cpu_dev;
+
+	if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) {
+		printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
+		return 0;
+	}
+	/* Create the sysfs files */
+	err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_activate_hint_list);
+	if (!err)
+		err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_deactivate_hint_list);
+
+	if (err)
+		return err;
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		err = device_create_file(cpu_dev,
+				&attr_percpu_activate_hint);
+		if (err)
+			break;
+		err = device_create_file(cpu_dev,
+				&attr_percpu_deactivate_hint);
+		if (err)
+			break;
+	}
+
+	if (err)
+		return err;
+
+	sysfs_entries = 1; /* Removed entries on cleanup */
+	return 0;
+
+}
+
+static void __exit pseries_energy_cleanup(void)
+{
+	int cpu;
+	struct device *cpu_dev;
+
+	if (!sysfs_entries)
+		return;
+
+	/* Remove the sysfs files */
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
+
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		sysfs_remove_file(&cpu_dev->kobj,
+				&attr_percpu_activate_hint.attr);
+		sysfs_remove_file(&cpu_dev->kobj,
+				&attr_percpu_deactivate_hint.attr);
+	}
+}
+
+module_init(pseries_energy_init);
+module_exit(pseries_energy_cleanup);
+MODULE_DESCRIPTION("Driver for pSeries platform energy management");
+MODULE_AUTHOR("Vaidyanathan Srinivasan");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index d20b96e22c2..9c5778e6ed4 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -16,38 +16,15 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-/* Change Activity:
- * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support.
- * End Change Activity
- */
-
-#include <linux/errno.h>
-#include <linux/threads.h>
-#include <linux/kernel_stat.h>
-#include <linux/signal.h>
 #include <linux/sched.h>
-#include <linux/ioport.h>
 #include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
 #include <linux/irq.h>
-#include <linux/random.h>
-#include <linux/sysrq.h>
-#include <linux/bitops.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/irq.h>
-#include <asm/cache.h>
-#include <asm/prom.h>
-#include <asm/ptrace.h>
+#include <linux/of.h>
+#include <linux/fs.h>
+#include <linux/reboot.h>
+
 #include <asm/machdep.h>
 #include <asm/rtas.h>
-#include <asm/udbg.h>
 #include <asm/firmware.h>
 
 #include "pseries.h"
@@ -55,76 +32,18 @@
 static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
 static DEFINE_SPINLOCK(ras_log_buf_lock);
 
-static char mce_data_buf[RTAS_ERROR_LOG_MAX];
+static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_PER_CPU(__u64, mce_data_buf);
 
-static int ras_get_sensor_state_token;
 static int ras_check_exception_token;
 
 #define EPOW_SENSOR_TOKEN	9
 #define EPOW_SENSOR_INDEX	0
-#define RAS_VECTOR_OFFSET	0x500
 
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
 
 
-static void request_ras_irqs(struct device_node *np,
-			irq_handler_t handler,
-			const char *name)
-{
-	int i, index, count = 0;
-	struct of_irq oirq;
-	const u32 *opicprop;
-	unsigned int opicplen;
-	unsigned int virqs[16];
-
-	/* Check for obsolete "open-pic-interrupt" property. If present, then
-	 * map those interrupts using the default interrupt host and default
-	 * trigger
-	 */
-	opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
-	if (opicprop) {
-		opicplen /= sizeof(u32);
-		for (i = 0; i < opicplen; i++) {
-			if (count > 15)
-				break;
-			virqs[count] = irq_create_mapping(NULL, *(opicprop++));
-			if (virqs[count] == NO_IRQ)
-				printk(KERN_ERR "Unable to allocate interrupt "
-				       "number for %s\n", np->full_name);
-			else
-				count++;
-
-		}
-	}
-	/* Else use normal interrupt tree parsing */
-	else {
-		/* First try to do a proper OF tree parsing */
-		for (index = 0; of_irq_map_one(np, index, &oirq) == 0;
-		     index++) {
-			if (count > 15)
-				break;
-			virqs[count] = irq_create_of_mapping(oirq.controller,
-							    oirq.specifier,
-							    oirq.size);
-			if (virqs[count] == NO_IRQ)
-				printk(KERN_ERR "Unable to allocate interrupt "
-				       "number for %s\n", np->full_name);
-			else
-				count++;
-		}
-	}
-
-	/* Now request them */
-	for (i = 0; i < count; i++) {
-		if (request_irq(virqs[i], handler, 0, name, NULL)) {
-			printk(KERN_ERR "Unable to request interrupt %d for "
-			       "%s\n", virqs[i], np->full_name);
-			return;
-		}
-	}
-}
-
 /*
  * Initialize handlers for the set of interrupts caused by hardware errors
  * and power system events.
@@ -133,65 +52,161 @@ static int __init init_ras_IRQ(void)
 {
 	struct device_node *np;
 
-	ras_get_sensor_state_token = rtas_token("get-sensor-state");
 	ras_check_exception_token = rtas_token("check-exception");
 
 	/* Internal Errors */
 	np = of_find_node_by_path("/event-sources/internal-errors");
 	if (np != NULL) {
-		request_ras_irqs(np, ras_error_interrupt, "RAS_ERROR");
+		request_event_sources_irqs(np, ras_error_interrupt,
+					   "RAS_ERROR");
 		of_node_put(np);
 	}
 
 	/* EPOW Events */
 	np = of_find_node_by_path("/event-sources/epow-events");
 	if (np != NULL) {
-		request_ras_irqs(np, ras_epow_interrupt, "RAS_EPOW");
+		request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
 		of_node_put(np);
 	}
 
 	return 0;
 }
-__initcall(init_ras_IRQ);
+subsys_initcall(init_ras_IRQ);
 
-/*
- * Handle power subsystem events (EPOW).
- *
- * Presently we just log the event has occurred.  This should be fixed
- * to examine the type of power failure and take appropriate action where
- * the time horizon permits something useful to be done.
- */
+#define EPOW_SHUTDOWN_NORMAL				1
+#define EPOW_SHUTDOWN_ON_UPS				2
+#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS	3
+#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH	4
+
+static void handle_system_shutdown(char event_modifier)
+{
+	switch (event_modifier) {
+	case EPOW_SHUTDOWN_NORMAL:
+		pr_emerg("Firmware initiated power off");
+		orderly_poweroff(true);
+		break;
+
+	case EPOW_SHUTDOWN_ON_UPS:
+		pr_emerg("Loss of power reported by firmware, system is "
+			"running on UPS/battery");
+		break;
+
+	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
+		pr_emerg("Loss of system critical functions reported by "
+			"firmware");
+		pr_emerg("Check RTAS error log for details");
+		orderly_poweroff(true);
+		break;
+
+	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
+		pr_emerg("Ambient temperature too high reported by firmware");
+		pr_emerg("Check RTAS error log for details");
+		orderly_poweroff(true);
+		break;
+
+	default:
+		pr_err("Unknown power/cooling shutdown event (modifier %d)",
+			event_modifier);
+	}
+}
+
+struct epow_errorlog {
+	unsigned char sensor_value;
+	unsigned char event_modifier;
+	unsigned char extended_modifier;
+	unsigned char reserved;
+	unsigned char platform_reason;
+};
+
+#define EPOW_RESET			0
+#define EPOW_WARN_COOLING		1
+#define EPOW_WARN_POWER			2
+#define EPOW_SYSTEM_SHUTDOWN		3
+#define EPOW_SYSTEM_HALT		4
+#define EPOW_MAIN_ENCLOSURE		5
+#define EPOW_POWER_OFF			7
+
+void rtas_parse_epow_errlog(struct rtas_error_log *log)
+{
+	struct pseries_errorlog *pseries_log;
+	struct epow_errorlog *epow_log;
+	char action_code;
+	char modifier;
+
+	pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
+	if (pseries_log == NULL)
+		return;
+
+	epow_log = (struct epow_errorlog *)pseries_log->data;
+	action_code = epow_log->sensor_value & 0xF;	/* bottom 4 bits */
+	modifier = epow_log->event_modifier & 0xF;	/* bottom 4 bits */
+
+	switch (action_code) {
+	case EPOW_RESET:
+		pr_err("Non critical power or cooling issue cleared");
+		break;
+
+	case EPOW_WARN_COOLING:
+		pr_err("Non critical cooling issue reported by firmware");
+		pr_err("Check RTAS error log for details");
+		break;
+
+	case EPOW_WARN_POWER:
+		pr_err("Non critical power issue reported by firmware");
+		pr_err("Check RTAS error log for details");
+		break;
+
+	case EPOW_SYSTEM_SHUTDOWN:
+		handle_system_shutdown(epow_log->event_modifier);
+		break;
+
+	case EPOW_SYSTEM_HALT:
+		pr_emerg("Firmware initiated power off");
+		orderly_poweroff(true);
+		break;
+
+	case EPOW_MAIN_ENCLOSURE:
+	case EPOW_POWER_OFF:
+		pr_emerg("Critical power/cooling issue reported by firmware");
+		pr_emerg("Check RTAS error log for details");
+		pr_emerg("Immediate power off");
+		emergency_sync();
+		kernel_power_off();
+		break;
+
+	default:
+		pr_err("Unknown power/cooling event (action code %d)",
+			action_code);
+	}
+}
+
+/* Handle environmental and power warning (EPOW) interrupts. */
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 {
-	int status = 0xdeadbeef;
-	int state = 0;
+	int status;
+	int state;
 	int critical;
 
-	status = rtas_call(ras_get_sensor_state_token, 2, 2, &state,
-			   EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX);
+	status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
 
 	if (state > 3)
-		critical = 1;  /* Time Critical */
+		critical = 1;		/* Time Critical */
 	else
 		critical = 0;
 
 	spin_lock(&ras_log_buf_lock);
 
 	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-			   RAS_VECTOR_OFFSET,
-			   irq_map[irq].hwirq,
-			   RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
+			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
+			   virq_to_hw(irq),
+			   RTAS_EPOW_WARNING,
 			   critical, __pa(&ras_log_buf),
 				rtas_get_error_log_max());
 
-	udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n",
-		    *((unsigned long *)&ras_log_buf), status, state);
-	printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n",
-	       *((unsigned long *)&ras_log_buf), status, state);
-
-	/* format and print the extended information */
 	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
 
+	rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
+
 	spin_unlock(&ras_log_buf_lock);
 	return IRQ_HANDLED;
 }
@@ -207,21 +222,22 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
 {
 	struct rtas_error_log *rtas_elog;
-	int status = 0xdeadbeef;
+	int status;
 	int fatal;
 
 	spin_lock(&ras_log_buf_lock);
 
 	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-			   RAS_VECTOR_OFFSET,
-			   irq_map[irq].hwirq,
-			   RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
+			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
+			   virq_to_hw(irq),
+			   RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
 			   __pa(&ras_log_buf),
 				rtas_get_error_log_max());
 
 	rtas_elog = (struct rtas_error_log *)ras_log_buf;
 
-	if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
+	if (status == 0 &&
+	    rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
 		fatal = 1;
 	else
 		fatal = 0;
@@ -230,36 +246,37 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
 	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
 
 	if (fatal) {
-		udbg_printf("Fatal HW Error <0x%lx 0x%x>\n",
-			    *((unsigned long *)&ras_log_buf), status);
-		printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n",
-		       *((unsigned long *)&ras_log_buf), status);
-
-#ifndef DEBUG_RTAS_POWER_OFF
-		/* Don't actually power off when debugging so we can test
-		 * without actually failing while injecting errors.
-		 * Error data will not be logged to syslog.
-		 */
-		ppc_md.power_off();
-#endif
+		pr_emerg("Fatal hardware error reported by firmware");
+		pr_emerg("Check RTAS error log for details");
+		pr_emerg("Immediate power off");
+		emergency_sync();
+		kernel_power_off();
 	} else {
-		udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n",
-			    *((unsigned long *)&ras_log_buf), status);
-		printk(KERN_WARNING
-		       "Warning: Recoverable hardware error <0x%lx 0x%x>\n",
-		       *((unsigned long *)&ras_log_buf), status);
+		pr_err("Recoverable hardware error reported by firmware");
 	}
 
 	spin_unlock(&ras_log_buf_lock);
 	return IRQ_HANDLED;
 }
 
-/* Get the error information for errors coming through the
+/*
+ * Some versions of FWNMI place the buffer inside the 4kB page starting at
+ * 0x7000. Other versions place it inside the rtas buffer. We check both.
+ */
+#define VALID_FWNMI_BUFFER(A) \
+	((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
+	(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
+
+/*
+ * Get the error information for errors coming through the
  * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
  * the actual r3 if possible, and a ptr to the error log entry
  * will be returned if found.
  *
- * The mce_data_buf does not have any locks or protection around it,
+ * If the RTAS error is not of the extended type, then we put it in a per
+ * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
+ *
+ * The global_mce_data_buf does not have any locks or protection around it,
  * if a second machine check comes in, or a system reset is done
  * before we have logged the error, then we will get corruption in the
  * error log.  This is preferable over holding off on calling
@@ -268,20 +285,35 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
  */
 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
 {
-	unsigned long errdata = regs->gpr[3];
-	struct rtas_error_log *errhdr = NULL;
 	unsigned long *savep;
+	struct rtas_error_log *h, *errhdr = NULL;
+
+	/* Mask top two bits */
+	regs->gpr[3] &= ~(0x3UL << 62);
+
+	if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
+		printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+		return NULL;
+	}
+
+	savep = __va(regs->gpr[3]);
+	regs->gpr[3] = savep[0];	/* restore original r3 */
 
-	if ((errdata >= 0x7000 && errdata < 0x7fff0) ||
-	    (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {
-		savep = __va(errdata);
-		regs->gpr[3] = savep[0];	/* restore original r3 */
-		memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
-		memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);
-		errhdr = (struct rtas_error_log *)mce_data_buf;
+	/* If it isn't an extended log we can use the per cpu 64bit buffer */
+	h = (struct rtas_error_log *)&savep[1];
+	if (!rtas_error_extended(h)) {
+		memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
+		errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
 	} else {
-		printk("FWNMI: corrupt r3\n");
+		int len, error_log_length;
+
+		error_log_length = 8 + rtas_error_extended_log_length(h);
+		len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
+		memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+		memcpy(global_mce_data_buf, h, len);
+		errhdr = (struct rtas_error_log *)global_mce_data_buf;
 	}
+
 	return errhdr;
 }
 
@@ -293,7 +325,7 @@ static void fwnmi_release_errinfo(void)
 {
 	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
 	if (ret != 0)
-		printk("FWNMI: nmi-interlock failed: %d\n", ret);
+		printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
 }
 
 int pSeries_system_reset_exception(struct pt_regs *regs)
@@ -317,31 +349,44 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
  * Return 1 if corrected (or delivered a signal).
  * Return 0 if there is nothing we can do.
  */
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
 {
-	int nonfatal = 0;
+	int recovered = 0;
+	int disposition = rtas_error_disposition(err);
+
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		recovered = 0;
 
-	if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+	} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
 		/* Platform corrected itself */
-		nonfatal = 1;
-	} else if ((regs->msr & MSR_RI) &&
-		   user_mode(regs) &&
-		   err->severity == RTAS_SEVERITY_ERROR_SYNC &&
-		   err->disposition == RTAS_DISP_NOT_RECOVERED &&
-		   err->target == RTAS_TARGET_MEMORY &&
-		   err->type == RTAS_TYPE_ECC_UNCORR &&
-		   !(current->pid == 0 || is_global_init(current))) {
-		/* Kill off a user process with an ECC error */
-		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
-		       current->pid);
-		/* XXX something better for ECC error? */
-		_exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
-		nonfatal = 1;
+		recovered = 1;
+
+	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		printk(KERN_ERR "MCE: limited recovery, system may "
+		       "be degraded\n");
+		recovered = 1;
+
+	} else if (user_mode(regs) && !is_global_init(current) &&
+		   rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
+
+		/*
+		 * If we received a synchronous error when in userspace
+		 * kill the task. Firmware may report details of the fail
+		 * asynchronously, so we can't rely on the target and type
+		 * fields being valid here.
+		 */
+		printk(KERN_ERR "MCE: uncorrectable error, killing task "
+		       "%s:%d\n", current->comm, current->pid);
+
+		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+		recovered = 1;
 	}
 
-	log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);
+	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
 
-	return nonfatal;
+	return recovered;
 }
 
 /*
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index b6f1b137d42..1c0a60d9886 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -12,56 +12,15 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/kref.h>
 #include <linux/notifier.h>
 #include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/of.h>
 
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/uaccess.h>
-#include <asm/pSeries_reconfig.h>
-
-
-
-/*
- * Routines for "runtime" addition and removal of device tree nodes.
- */
-#ifdef CONFIG_PROC_DEVICETREE
-/*
- * Add a node to /proc/device-tree.
- */
-static void add_node_proc_entries(struct device_node *np)
-{
-	struct proc_dir_entry *ent;
-
-	ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
-	if (ent)
-		proc_device_tree_add_node(np, ent);
-}
-
-static void remove_node_proc_entries(struct device_node *np)
-{
-	struct property *pp = np->properties;
-	struct device_node *parent = np->parent;
-
-	while (pp) {
-		remove_proc_entry(pp->name, np->pde);
-		pp = pp->next;
-	}
-	if (np->pde)
-		remove_proc_entry(np->pde->name, parent->pde);
-}
-#else /* !CONFIG_PROC_DEVICETREE */
-static void add_node_proc_entries(struct device_node *np)
-{
-	return;
-}
-
-static void remove_node_proc_entries(struct device_node *np)
-{
-	return;
-}
-#endif /* CONFIG_PROC_DEVICETREE */
+#include <asm/mmu.h>
 
 /**
  *	derive_parent - basically like dirname(1)
@@ -95,18 +54,6 @@ static struct device_node *derive_parent(const char *path)
 	return parent;
 }
 
-static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
-
-int pSeries_reconfig_notifier_register(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
-}
-
-void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
-{
-	blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
-}
-
 static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
 {
 	struct device_node *np;
@@ -116,15 +63,13 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist
 	if (!np)
 		goto out_err;
 
-	np->full_name = kmalloc(strlen(path) + 1, GFP_KERNEL);
+	np->full_name = kstrdup(path, GFP_KERNEL);
 	if (!np->full_name)
 		goto out_err;
 
-	strcpy(np->full_name, path);
-
 	np->properties = proplist;
 	of_node_set_flag(np, OF_DYNAMIC);
-	kref_init(&np->kref);
+	of_node_init(np);
 
 	np->parent = derive_parent(path);
 	if (IS_ERR(np->parent)) {
@@ -132,18 +77,12 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist
 		goto out_err;
 	}
 
-	err = blocking_notifier_call_chain(&pSeries_reconfig_chain,
-				  PSERIES_RECONFIG_ADD, np);
-	if (err == NOTIFY_BAD) {
+	err = of_attach_node(np);
+	if (err) {
 		printk(KERN_ERR "Failed to add device node %s\n", path);
-		err = -ENOMEM; /* For now, safe to assume kmalloc failure */
 		goto out_err;
 	}
 
-	of_attach_node(np);
-
-	add_node_proc_entries(np);
-
 	of_node_put(np->parent);
 
 	return 0;
@@ -171,19 +110,14 @@ static int pSeries_reconfig_remove_node(struct device_node *np)
 		return -EBUSY;
 	}
 
-	remove_node_proc_entries(np);
-
-	blocking_notifier_call_chain(&pSeries_reconfig_chain,
-			    PSERIES_RECONFIG_REMOVE, np);
 	of_detach_node(np);
-
 	of_node_put(parent);
 	of_node_put(np); /* Must decrement the refcount */
 	return 0;
 }
 
 /*
- * /proc/ppc64/ofdt - yucky binary interface for adding and removing
+ * /proc/powerpc/ofdt - yucky binary interface for adding and removing
  * OF device nodes.  Should be deprecated as soon as we get an
  * in-kernel wrapper for the RTAS ibm,configure-connector call.
  */
@@ -274,12 +208,11 @@ static struct property *new_property(const char *name, const int length,
 	if (!new)
 		return NULL;
 
-	if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL)))
+	if (!(new->name = kstrdup(name, GFP_KERNEL)))
 		goto cleanup;
 	if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
 		goto cleanup;
 
-	strcpy(new->name, name);
 	memcpy(new->value, value, length);
 	*(((char *)new->value) + length) = 0;
 	new->length = length;
@@ -391,7 +324,7 @@ static int do_add_property(char *buf, size_t bufsize)
 	if (!prop)
 		return -ENOMEM;
 
-	prom_add_property(np, prop);
+	of_add_property(np, prop);
 
 	return 0;
 }
@@ -415,7 +348,7 @@ static int do_remove_property(char *buf, size_t bufsize)
 
 	prop = of_find_property(np, buf, NULL);
 
-	return prom_remove_property(np, prop);
+	return of_remove_property(np, prop);
 }
 
 static int do_update_property(char *buf, size_t bufsize)
@@ -423,8 +356,8 @@ static int do_update_property(char *buf, size_t bufsize)
 	struct device_node *np;
 	unsigned char *value;
 	char *name, *end, *next_prop;
-	int rc, length;
-	struct property *newprop, *oldprop;
+	int length;
+	struct property *newprop;
 	buf = parse_node(buf, bufsize, &np);
 	end = buf + bufsize;
 
@@ -435,46 +368,17 @@ static int do_update_property(char *buf, size_t bufsize)
 	if (!next_prop)
 		return -EINVAL;
 
+	if (!strlen(name))
+		return -ENODEV;
+
 	newprop = new_property(name, length, value, NULL);
 	if (!newprop)
 		return -ENOMEM;
 
-	oldprop = of_find_property(np, name,NULL);
-	if (!oldprop)
-		return -ENODEV;
+	if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size"))
+		slb_set_size(*(int *)value);
 
-	rc = prom_update_property(np, newprop, oldprop);
-	if (rc)
-		return rc;
-
-	/* For memory under the ibm,dynamic-reconfiguration-memory node
-	 * of the device tree, adding and removing memory is just an update
-	 * to the ibm,dynamic-memory property instead of adding/removing a
-	 * memory node in the device tree.  For these cases we still need to
-	 * involve the notifier chain.
-	 */
-	if (!strcmp(name, "ibm,dynamic-memory")) {
-		int action;
-
-		next_prop = parse_next_property(next_prop, end, &name,
-						&length, &value);
-		if (!next_prop)
-			return -EINVAL;
-
-		if (!strcmp(name, "add"))
-			action = PSERIES_DRCONF_MEM_ADD;
-		else
-			action = PSERIES_DRCONF_MEM_REMOVE;
-
-		rc = blocking_notifier_call_chain(&pSeries_reconfig_chain,
-						  action, value);
-		if (rc == NOTIFY_BAD) {
-			rc = prom_update_property(np, oldprop, newprop);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
+	return of_update_property(np, newprop);
 }
 
 /**
@@ -533,10 +437,11 @@ out:
 }
 
 static const struct file_operations ofdt_fops = {
-	.write = ofdt_write
+	.write = ofdt_write,
+	.llseek = noop_llseek,
 };
 
-/* create /proc/ppc64/ofdt write-only by root */
+/* create /proc/powerpc/ofdt write-only by root */
 static int proc_ppc64_create_ofdt(void)
 {
 	struct proc_dir_entry *ent;
@@ -544,9 +449,9 @@ static int proc_ppc64_create_ofdt(void)
 	if (!machine_is(pseries))
 		return 0;
 
-	ent = proc_create("ppc64/ofdt", S_IWUSR, NULL, &ofdt_fops);
+	ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
 	if (ent)
-		ent->size = 0;
+		proc_set_size(ent, 0);
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c
new file mode 100644
index 00000000000..72a102758d4
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rng.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt)	"pseries-rng: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <asm/archrandom.h>
+#include <asm/machdep.h>
+#include <asm/plpar_wrappers.h>
+
+
+static int pseries_get_random_long(unsigned long *v)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) {
+		*v = retbuf[0];
+		return 1;
+	}
+
+	return 0;
+}
+
+static __init int rng_init(void)
+{
+	struct device_node *dn;
+
+	dn = of_find_compatible_node(NULL, NULL, "ibm,random");
+	if (!dn)
+		return -ENODEV;
+
+	pr_info("Registering arch random hook.\n");
+
+	ppc_md.get_random_long = pseries_get_random_long;
+
+	return 0;
+}
+subsys_initcall(rng_init);
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
deleted file mode 100644
index afad9f5ac0a..00000000000
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ /dev/null
@@ -1,519 +0,0 @@
-/*
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Communication to userspace based on kernel/printk.c
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/poll.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/rtas.h>
-#include <asm/prom.h>
-#include <asm/nvram.h>
-#include <asm/atomic.h>
-#include <asm/machdep.h>
-
-
-static DEFINE_SPINLOCK(rtasd_log_lock);
-
-static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
-
-static char *rtas_log_buf;
-static unsigned long rtas_log_start;
-static unsigned long rtas_log_size;
-
-static int surveillance_timeout = -1;
-static unsigned int rtas_error_log_max;
-static unsigned int rtas_error_log_buffer_max;
-
-/* RTAS service tokens */
-static unsigned int event_scan;
-static unsigned int rtas_event_scan_rate;
-
-static int full_rtas_msgs = 0;
-
-/* Stop logging to nvram after first fatal error */
-static int logging_enabled; /* Until we initialize everything,
-                             * make sure we don't try logging
-                             * anything */
-static int error_log_cnt;
-
-/*
- * Since we use 32 bit RTAS, the physical address of this must be below
- * 4G or else bad things happen. Allocate this in the kernel data and
- * make it big enough.
- */
-static unsigned char logdata[RTAS_ERROR_LOG_MAX];
-
-static char *rtas_type[] = {
-	"Unknown", "Retry", "TCE Error", "Internal Device Failure",
-	"Timeout", "Data Parity", "Address Parity", "Cache Parity",
-	"Address Invalid", "ECC Uncorrected", "ECC Corrupted",
-};
-
-static char *rtas_event_type(int type)
-{
-	if ((type > 0) && (type < 11))
-		return rtas_type[type];
-
-	switch (type) {
-		case RTAS_TYPE_EPOW:
-			return "EPOW";
-		case RTAS_TYPE_PLATFORM:
-			return "Platform Error";
-		case RTAS_TYPE_IO:
-			return "I/O Event";
-		case RTAS_TYPE_INFO:
-			return "Platform Information Event";
-		case RTAS_TYPE_DEALLOC:
-			return "Resource Deallocation Event";
-		case RTAS_TYPE_DUMP:
-			return "Dump Notification Event";
-	}
-
-	return rtas_type[0];
-}
-
-/* To see this info, grep RTAS /var/log/messages and each entry
- * will be collected together with obvious begin/end.
- * There will be a unique identifier on the begin and end lines.
- * This will persist across reboots.
- *
- * format of error logs returned from RTAS:
- * bytes	(size)	: contents
- * --------------------------------------------------------
- * 0-7		(8)	: rtas_error_log
- * 8-47		(40)	: extended info
- * 48-51	(4)	: vendor id
- * 52-1023 (vendor specific) : location code and debug data
- */
-static void printk_log_rtas(char *buf, int len)
-{
-
-	int i,j,n = 0;
-	int perline = 16;
-	char buffer[64];
-	char * str = "RTAS event";
-
-	if (full_rtas_msgs) {
-		printk(RTAS_DEBUG "%d -------- %s begin --------\n",
-		       error_log_cnt, str);
-
-		/*
-		 * Print perline bytes on each line, each line will start
-		 * with RTAS and a changing number, so syslogd will
-		 * print lines that are otherwise the same.  Separate every
-		 * 4 bytes with a space.
-		 */
-		for (i = 0; i < len; i++) {
-			j = i % perline;
-			if (j == 0) {
-				memset(buffer, 0, sizeof(buffer));
-				n = sprintf(buffer, "RTAS %d:", i/perline);
-			}
-
-			if ((i % 4) == 0)
-				n += sprintf(buffer+n, " ");
-
-			n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
-
-			if (j == (perline-1))
-				printk(KERN_DEBUG "%s\n", buffer);
-		}
-		if ((i % perline) != 0)
-			printk(KERN_DEBUG "%s\n", buffer);
-
-		printk(RTAS_DEBUG "%d -------- %s end ----------\n",
-		       error_log_cnt, str);
-	} else {
-		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
-
-		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
-		       error_log_cnt, rtas_event_type(errlog->type),
-		       errlog->severity);
-	}
-}
-
-static int log_rtas_len(char * buf)
-{
-	int len;
-	struct rtas_error_log *err;
-
-	/* rtas fixed header */
-	len = 8;
-	err = (struct rtas_error_log *)buf;
-	if (err->extended_log_length) {
-
-		/* extended header */
-		len += err->extended_log_length;
-	}
-
-	if (rtas_error_log_max == 0)
-		rtas_error_log_max = rtas_get_error_log_max();
-
-	if (len > rtas_error_log_max)
-		len = rtas_error_log_max;
-
-	return len;
-}
-
-/*
- * First write to nvram, if fatal error, that is the only
- * place we log the info.  The error will be picked up
- * on the next reboot by rtasd.  If not fatal, run the
- * method for the type of error.  Currently, only RTAS
- * errors have methods implemented, but in the future
- * there might be a need to store data in nvram before a
- * call to panic().
- *
- * XXX We write to nvram periodically, to indicate error has
- * been written and sync'd, but there is a possibility
- * that if we don't shutdown correctly, a duplicate error
- * record will be created on next reboot.
- */
-void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
-{
-	unsigned long offset;
-	unsigned long s;
-	int len = 0;
-
-	pr_debug("rtasd: logging event\n");
-	if (buf == NULL)
-		return;
-
-	spin_lock_irqsave(&rtasd_log_lock, s);
-
-	/* get length and increase count */
-	switch (err_type & ERR_TYPE_MASK) {
-	case ERR_TYPE_RTAS_LOG:
-		len = log_rtas_len(buf);
-		if (!(err_type & ERR_FLAG_BOOT))
-			error_log_cnt++;
-		break;
-	case ERR_TYPE_KERNEL_PANIC:
-	default:
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		return;
-	}
-
-	/* Write error to NVRAM */
-	if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
-		nvram_write_error_log(buf, len, err_type, error_log_cnt);
-
-	/*
-	 * rtas errors can occur during boot, and we do want to capture
-	 * those somewhere, even if nvram isn't ready (why not?), and even
-	 * if rtasd isn't ready. Put them into the boot log, at least.
-	 */
-	if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
-		printk_log_rtas(buf, len);
-
-	/* Check to see if we need to or have stopped logging */
-	if (fatal || !logging_enabled) {
-		logging_enabled = 0;
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		return;
-	}
-
-	/* call type specific method for error */
-	switch (err_type & ERR_TYPE_MASK) {
-	case ERR_TYPE_RTAS_LOG:
-		offset = rtas_error_log_buffer_max *
-			((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
-
-		/* First copy over sequence number */
-		memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
-
-		/* Second copy over error log data */
-		offset += sizeof(int);
-		memcpy(&rtas_log_buf[offset], buf, len);
-
-		if (rtas_log_size < LOG_NUMBER)
-			rtas_log_size += 1;
-		else
-			rtas_log_start += 1;
-
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		wake_up_interruptible(&rtas_log_wait);
-		break;
-	case ERR_TYPE_KERNEL_PANIC:
-	default:
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		return;
-	}
-
-}
-
-
-static int rtas_log_open(struct inode * inode, struct file * file)
-{
-	return 0;
-}
-
-static int rtas_log_release(struct inode * inode, struct file * file)
-{
-	return 0;
-}
-
-/* This will check if all events are logged, if they are then, we
- * know that we can safely clear the events in NVRAM.
- * Next we'll sit and wait for something else to log.
- */
-static ssize_t rtas_log_read(struct file * file, char __user * buf,
-			 size_t count, loff_t *ppos)
-{
-	int error;
-	char *tmp;
-	unsigned long s;
-	unsigned long offset;
-
-	if (!buf || count < rtas_error_log_buffer_max)
-		return -EINVAL;
-
-	count = rtas_error_log_buffer_max;
-
-	if (!access_ok(VERIFY_WRITE, buf, count))
-		return -EFAULT;
-
-	tmp = kmalloc(count, GFP_KERNEL);
-	if (!tmp)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&rtasd_log_lock, s);
-	/* if it's 0, then we know we got the last one (the one in NVRAM) */
-	while (rtas_log_size == 0) {
-		if (file->f_flags & O_NONBLOCK) {
-			spin_unlock_irqrestore(&rtasd_log_lock, s);
-			error = -EAGAIN;
-			goto out;
-		}
-
-		if (!logging_enabled) {
-			spin_unlock_irqrestore(&rtasd_log_lock, s);
-			error = -ENODATA;
-			goto out;
-		}
-		nvram_clear_error_log();
-
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
-		if (error)
-			goto out;
-		spin_lock_irqsave(&rtasd_log_lock, s);
-	}
-
-	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
-	memcpy(tmp, &rtas_log_buf[offset], count);
-
-	rtas_log_start += 1;
-	rtas_log_size -= 1;
-	spin_unlock_irqrestore(&rtasd_log_lock, s);
-
-	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
-out:
-	kfree(tmp);
-	return error;
-}
-
-static unsigned int rtas_log_poll(struct file *file, poll_table * wait)
-{
-	poll_wait(file, &rtas_log_wait, wait);
-	if (rtas_log_size)
-		return POLLIN | POLLRDNORM;
-	return 0;
-}
-
-static const struct file_operations proc_rtas_log_operations = {
-	.read =		rtas_log_read,
-	.poll =		rtas_log_poll,
-	.open =		rtas_log_open,
-	.release =	rtas_log_release,
-};
-
-static int enable_surveillance(int timeout)
-{
-	int error;
-
-	error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
-
-	if (error == 0)
-		return 0;
-
-	if (error == -EINVAL) {
-		printk(KERN_DEBUG "rtasd: surveillance not supported\n");
-		return 0;
-	}
-
-	printk(KERN_ERR "rtasd: could not update surveillance\n");
-	return -1;
-}
-
-static void do_event_scan(void)
-{
-	int error;
-	do {
-		memset(logdata, 0, rtas_error_log_max);
-		error = rtas_call(event_scan, 4, 1, NULL,
-				  RTAS_EVENT_SCAN_ALL_EVENTS, 0,
-				  __pa(logdata), rtas_error_log_max);
-		if (error == -1) {
-			printk(KERN_ERR "event-scan failed\n");
-			break;
-		}
-
-		if (error == 0)
-			pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
-
-	} while(error == 0);
-}
-
-static void do_event_scan_all_cpus(long delay)
-{
-	int cpu;
-
-	get_online_cpus();
-	cpu = first_cpu(cpu_online_map);
-	for (;;) {
-		set_cpus_allowed(current, cpumask_of_cpu(cpu));
-		do_event_scan();
-		set_cpus_allowed(current, CPU_MASK_ALL);
-
-		/* Drop hotplug lock, and sleep for the specified delay */
-		put_online_cpus();
-		msleep_interruptible(delay);
-		get_online_cpus();
-
-		cpu = next_cpu(cpu, cpu_online_map);
-		if (cpu == NR_CPUS)
-			break;
-	}
-	put_online_cpus();
-}
-
-static int rtasd(void *unused)
-{
-	unsigned int err_type;
-	int rc;
-
-	daemonize("rtasd");
-
-	printk(KERN_DEBUG "RTAS daemon started\n");
-	pr_debug("rtasd: will sleep for %d milliseconds\n",
-		 (30000 / rtas_event_scan_rate));
-
-	/* See if we have any error stored in NVRAM */
-	memset(logdata, 0, rtas_error_log_max);
-	rc = nvram_read_error_log(logdata, rtas_error_log_max,
-	                          &err_type, &error_log_cnt);
-	/* We can use rtas_log_buf now */
-	logging_enabled = 1;
-
-	if (!rc) {
-		if (err_type != ERR_FLAG_ALREADY_LOGGED) {
-			pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
-		}
-	}
-
-	/* First pass. */
-	do_event_scan_all_cpus(1000);
-
-	if (surveillance_timeout != -1) {
-		pr_debug("rtasd: enabling surveillance\n");
-		enable_surveillance(surveillance_timeout);
-		pr_debug("rtasd: surveillance enabled\n");
-	}
-
-	/* Delay should be at least one second since some
-	 * machines have problems if we call event-scan too
-	 * quickly. */
-	for (;;)
-		do_event_scan_all_cpus(30000/rtas_event_scan_rate);
-
-	return -EINVAL;
-}
-
-static int __init rtas_init(void)
-{
-	struct proc_dir_entry *entry;
-
-	if (!machine_is(pseries))
-		return 0;
-
-	/* No RTAS */
-	event_scan = rtas_token("event-scan");
-	if (event_scan == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_DEBUG "rtasd: no event-scan on system\n");
-		return -ENODEV;
-	}
-
-	rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
-	if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
-		return -ENODEV;
-	}
-
-	/* Make room for the sequence number */
-	rtas_error_log_max = rtas_get_error_log_max();
-	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
-
-	rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
-	if (!rtas_log_buf) {
-		printk(KERN_ERR "rtasd: no memory\n");
-		return -ENOMEM;
-	}
-
-	entry = proc_create("ppc64/rtas/error_log", S_IRUSR, NULL,
-			    &proc_rtas_log_operations);
-	if (!entry)
-		printk(KERN_ERR "Failed to create error_log proc entry\n");
-
-	if (kernel_thread(rtasd, NULL, CLONE_FS) < 0)
-		printk(KERN_ERR "Failed to start RTAS daemon\n");
-
-	return 0;
-}
-
-static int __init surveillance_setup(char *str)
-{
-	int i;
-
-	if (get_option(&str,&i)) {
-		if (i >= 0 && i <= 255)
-			surveillance_timeout = i;
-	}
-
-	return 1;
-}
-
-static int __init rtasmsgs_setup(char *str)
-{
-	if (strcmp(str, "on") == 0)
-		full_rtas_msgs = 1;
-	else if (strcmp(str, "off") == 0)
-		full_rtas_msgs = 0;
-
-	return 1;
-}
-__initcall(rtas_init);
-__setup("surveillance=", surveillance_setup);
-__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
index 417eca79df6..b502ab61aaf 100644
--- a/arch/powerpc/platforms/pseries/scanlog.c
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -13,7 +13,7 @@
  * of this data using this driver.  A dump exists if the device-tree
  * /chosen/ibm,scan-log-data property exists.
  *
- * This driver exports /proc/ppc64/scan-log-dump which can be read.
+ * This driver exports /proc/powerpc/scan-log-dump which can be read.
  * The driver supports only sequential reads.
  *
  * The driver looks at a write to the driver for the single word "reset".
@@ -26,6 +26,7 @@
 #include <linux/proc_fs.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/rtas.h>
 #include <asm/prom.h>
@@ -40,21 +41,16 @@
 
 
 static unsigned int ibm_scan_log_dump;			/* RTAS token */
-static struct proc_dir_entry *proc_ppc64_scan_log_dump;	/* The proc file */
+static unsigned int *scanlog_buffer;			/* The data buffer */
 
 static ssize_t scanlog_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
-        struct inode * inode = file->f_path.dentry->d_inode;
-	struct proc_dir_entry *dp;
-	unsigned int *data;
+	unsigned int *data = scanlog_buffer;
 	int status;
 	unsigned long len, off;
 	unsigned int wait_time;
 
-        dp = PDE(inode);
- 	data = (unsigned int *)dp->data;
-
 	if (count > RTAS_DATA_BUF_SIZE)
 		count = RTAS_DATA_BUF_SIZE;
 
@@ -138,8 +134,7 @@ static ssize_t scanlog_write(struct file * file, const char __user * buf,
 
 static int scanlog_open(struct inode * inode, struct file * file)
 {
-	struct proc_dir_entry *dp = PDE(inode);
-	unsigned int *data = (unsigned int *)dp->data;
+	unsigned int *data = scanlog_buffer;
 
 	if (data[0] != 0) {
 		/* This imperfect test stops a second copy of the
@@ -155,11 +150,9 @@ static int scanlog_open(struct inode * inode, struct file * file)
 
 static int scanlog_release(struct inode * inode, struct file * file)
 {
-	struct proc_dir_entry *dp = PDE(inode);
-	unsigned int *data = (unsigned int *)dp->data;
+	unsigned int *data = scanlog_buffer;
 
 	data[0] = 0;
-
 	return 0;
 }
 
@@ -169,12 +162,12 @@ const struct file_operations scanlog_fops = {
 	.write		= scanlog_write,
 	.open		= scanlog_open,
 	.release	= scanlog_release,
+	.llseek		= noop_llseek,
 };
 
 static int __init scanlog_init(void)
 {
 	struct proc_dir_entry *ent;
-	void *data;
 	int err = -ENOMEM;
 
 	ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
@@ -182,29 +175,24 @@ static int __init scanlog_init(void)
 		return -ENODEV;
 
 	/* Ideally we could allocate a buffer < 4G */
-	data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
-	if (!data)
+	scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!scanlog_buffer)
 		goto err;
 
-	ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
-			       &scanlog_fops, data);
+	ent = proc_create("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
+			  &scanlog_fops);
 	if (!ent)
 		goto err;
-
-	proc_ppc64_scan_log_dump = ent;
-
 	return 0;
 err:
-	kfree(data);
+	kfree(scanlog_buffer);
 	return err;
 }
 
 static void __exit scanlog_cleanup(void)
 {
-	if (proc_ppc64_scan_log_dump) {
-		kfree(proc_ppc64_scan_log_dump->data);
-		remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent);
-	}
+	remove_proc_entry("powerpc/rtas/scan-log-dump", NULL);
+	kfree(scanlog_buffer);
 }
 
 module_init(scanlog_init);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index ec341707e41..f2f40e64658 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -23,7 +23,6 @@
 #include <linux/mm.h>
 #include <linux/stddef.h>
 #include <linux/unistd.h>
-#include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/tty.h>
 #include <linux/major.h>
@@ -35,11 +34,13 @@
 #include <linux/pci.h>
 #include <linux/utsname.h>
 #include <linux/adb.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/delay.h>
 #include <linux/irq.h>
 #include <linux/seq_file.h>
 #include <linux/root_dev.h>
+#include <linux/of.h>
+#include <linux/kexec.h>
 
 #include <asm/mmu.h>
 #include <asm/processor.h>
@@ -54,29 +55,27 @@
 #include <asm/irq.h>
 #include <asm/time.h>
 #include <asm/nvram.h>
-#include "xics.h"
 #include <asm/pmc.h>
 #include <asm/mpic.h>
+#include <asm/xics.h>
 #include <asm/ppc-pci.h>
 #include <asm/i8259.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
 #include <asm/firmware.h>
 #include <asm/eeh.h>
+#include <asm/reg.h>
+#include <asm/plpar_wrappers.h>
 
-#include "plpar_wrappers.h"
 #include "pseries.h"
 
 int CMO_PrPSP = -1;
 int CMO_SecPSP = -1;
-unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT);
+unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K);
 EXPORT_SYMBOL(CMO_PageSize);
 
 int fwnmi_active;  /* TRUE if an FWNMI handler is present */
 
-static void pseries_shared_idle_sleep(void);
-static void pseries_dedicated_idle_sleep(void);
-
 static struct device_node *pSeries_mpic_node;
 
 static void pSeries_show_cpuinfo(struct seq_file *m)
@@ -114,10 +113,13 @@ static void __init fwnmi_init(void)
 
 static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
 {
+	struct irq_chip *chip = irq_desc_get_chip(desc);
 	unsigned int cascade_irq = i8259_irq();
+
 	if (cascade_irq != NO_IRQ)
 		generic_handle_irq(cascade_irq);
-	desc->chip->eoi(irq);
+
+	chip->irq_eoi(&desc->irq_data);
 }
 
 static void __init pseries_setup_i8259_cascade(void)
@@ -166,7 +168,7 @@ static void __init pseries_setup_i8259_cascade(void)
 		printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
 	i8259_init(found, intack);
 	of_node_put(found);
-	set_irq_chained_handler(cascade, pseries_8259_cascade);
+	irq_set_chained_handler(cascade, pseries_8259_cascade);
 }
 
 static void __init pseries_mpic_init_IRQ(void)
@@ -180,7 +182,7 @@ static void __init pseries_mpic_init_IRQ(void)
 	np = of_find_node_by_path("/");
 	naddr = of_n_addr_cells(np);
 	opprop = of_get_property(np, "platform-open-pic", &opplen);
-	if (opprop != 0) {
+	if (opprop != NULL) {
 		openpic_addr = of_read_number(opprop, naddr);
 		printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
 	}
@@ -190,9 +192,7 @@ static void __init pseries_mpic_init_IRQ(void)
 
 	/* Setup the openpic driver */
 	mpic = mpic_alloc(pSeries_mpic_node, openpic_addr,
-			  MPIC_PRIMARY,
-			  16, 250, /* isu size, irq count */
-			  " MPIC     ");
+			MPIC_NO_RESET, 16, 0, " MPIC     ");
 	BUG_ON(mpic == NULL);
 
 	/* Add ISUs */
@@ -202,6 +202,9 @@ static void __init pseries_mpic_init_IRQ(void)
 		mpic_assign_isu(mpic, n, isuaddr);
 	}
 
+	/* Setup top-level get_irq */
+	ppc_md.get_irq = mpic_get_irq;
+
 	/* All ISUs are setup, complete initialization */
 	mpic_init(mpic);
 
@@ -211,7 +214,7 @@ static void __init pseries_mpic_init_IRQ(void)
 
 static void __init pseries_xics_init_IRQ(void)
 {
-	xics_init_IRQ();
+	xics_init();
 	pseries_setup_i8259_cascade();
 }
 
@@ -222,10 +225,6 @@ static void pseries_lpar_enable_pmcs(void)
 	set = 1UL << 63;
 	reset = 0;
 	plpar_hcall_norets(H_PERFMON, set, reset);
-
-	/* instruct hypervisor to maintain PMCs */
-	if (firmware_has_feature(FW_FEATURE_SPLPAR))
-		get_lppaca()->pmcregs_in_use = 1;
 }
 
 static void __init pseries_discover_pic(void)
@@ -239,7 +238,6 @@ static void __init pseries_discover_pic(void)
 		if (strstr(typep, "open-pic")) {
 			pSeries_mpic_node = of_node_get(np);
 			ppc_md.init_IRQ       = pseries_mpic_init_IRQ;
-			ppc_md.get_irq        = mpic_get_irq;
 			setup_kexec_cpu_down_mpic();
 			smp_init_pseries_mpic();
 			return;
@@ -254,8 +252,219 @@ static void __init pseries_discover_pic(void)
 	       " interrupt-controller\n");
 }
 
+static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	struct device_node *np = node;
+	struct pci_dn *pci = NULL;
+	int err = NOTIFY_OK;
+
+	switch (action) {
+	case OF_RECONFIG_ATTACH_NODE:
+		pci = np->parent->data;
+		if (pci) {
+			update_dn_pci_info(np, pci->phb);
+
+			/* Create EEH device for the OF node */
+			eeh_dev_init(np, pci->phb);
+		}
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block pci_dn_reconfig_nb = {
+	.notifier_call = pci_dn_reconfig_notifier,
+};
+
+struct kmem_cache *dtl_cache;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Allocate space for the dispatch trace log for all possible cpus
+ * and register the buffers with the hypervisor.  This is used for
+ * computing time stolen by the hypervisor.
+ */
+static int alloc_dispatch_logs(void)
+{
+	int cpu, ret;
+	struct paca_struct *pp;
+	struct dtl_entry *dtl;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return 0;
+
+	if (!dtl_cache)
+		return 0;
+
+	for_each_possible_cpu(cpu) {
+		pp = &paca[cpu];
+		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
+		if (!dtl) {
+			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
+				cpu);
+			pr_warn("Stolen time statistics will be unreliable\n");
+			break;
+		}
+
+		pp->dtl_ridx = 0;
+		pp->dispatch_log = dtl;
+		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
+		pp->dtl_curr = dtl;
+	}
+
+	/* Register the DTL for the current (boot) cpu */
+	dtl = get_paca()->dispatch_log;
+	get_paca()->dtl_ridx = 0;
+	get_paca()->dtl_curr = dtl;
+	get_paca()->lppaca_ptr->dtl_idx = 0;
+
+	/* hypervisor reads buffer length from this field */
+	dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
+	ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
+	if (ret)
+		pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
+		       "with %d\n", smp_processor_id(),
+		       hard_smp_processor_id(), ret);
+	get_paca()->lppaca_ptr->dtl_enable_mask = 2;
+
+	return 0;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+static inline int alloc_dispatch_logs(void)
+{
+	return 0;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+static int alloc_dispatch_log_kmem_cache(void)
+{
+	dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
+						DISPATCH_LOG_BYTES, 0, NULL);
+	if (!dtl_cache) {
+		pr_warn("Failed to create dispatch trace log buffer cache\n");
+		pr_warn("Stolen time statistics will be unreliable\n");
+		return 0;
+	}
+
+	return alloc_dispatch_logs();
+}
+early_initcall(alloc_dispatch_log_kmem_cache);
+
+static void pseries_lpar_idle(void)
+{
+	/*
+	 * Default handler to go into low thread priority and possibly
+	 * low power mode by cedeing processor to hypervisor
+	 */
+
+	/* Indicate to hypervisor that we are idle. */
+	get_lppaca()->idle = 1;
+
+	/*
+	 * Yield the processor to the hypervisor.  We return if
+	 * an external interrupt occurs (which are driven prior
+	 * to returning here) or if a prod occurs from another
+	 * processor. When returning here, external interrupts
+	 * are enabled.
+	 */
+	cede_processor();
+
+	get_lppaca()->idle = 0;
+}
+
+/*
+ * Enable relocation on during exceptions. This has partition wide scope and
+ * may take a while to complete, if it takes longer than one second we will
+ * just give up rather than wasting any more time on this - if that turns out
+ * to ever be a problem in practice we can move this into a kernel thread to
+ * finish off the process later in boot.
+ */
+long pSeries_enable_reloc_on_exc(void)
+{
+	long rc;
+	unsigned int delay, total_delay = 0;
+
+	while (1) {
+		rc = enable_reloc_on_exceptions();
+		if (!H_IS_LONG_BUSY(rc))
+			return rc;
+
+		delay = get_longbusy_msecs(rc);
+		total_delay += delay;
+		if (total_delay > 1000) {
+			pr_warn("Warning: Giving up waiting to enable "
+				"relocation on exceptions (%u msec)!\n",
+				total_delay);
+			return rc;
+		}
+
+		mdelay(delay);
+	}
+}
+EXPORT_SYMBOL(pSeries_enable_reloc_on_exc);
+
+long pSeries_disable_reloc_on_exc(void)
+{
+	long rc;
+
+	while (1) {
+		rc = disable_reloc_on_exceptions();
+		if (!H_IS_LONG_BUSY(rc))
+			return rc;
+		mdelay(get_longbusy_msecs(rc));
+	}
+}
+EXPORT_SYMBOL(pSeries_disable_reloc_on_exc);
+
+#ifdef CONFIG_KEXEC
+static void pSeries_machine_kexec(struct kimage *image)
+{
+	long rc;
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		rc = pSeries_disable_reloc_on_exc();
+		if (rc != H_SUCCESS)
+			pr_warning("Warning: Failed to disable relocation on "
+				   "exceptions: %ld\n", rc);
+	}
+
+	default_machine_kexec(image);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+long pseries_big_endian_exceptions(void)
+{
+	long rc;
+
+	while (1) {
+		rc = enable_big_endian_exceptions();
+		if (!H_IS_LONG_BUSY(rc))
+			return rc;
+		mdelay(get_longbusy_msecs(rc));
+	}
+}
+
+static long pseries_little_endian_exceptions(void)
+{
+	long rc;
+
+	while (1) {
+		rc = enable_little_endian_exceptions();
+		if (!H_IS_LONG_BUSY(rc))
+			return rc;
+		mdelay(get_longbusy_msecs(rc));
+	}
+}
+#endif
+
 static void __init pSeries_setup_arch(void)
 {
+	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
 	/* Discover PIC type and setup ppc_md accordingly */
 	pseries_discover_pic();
 
@@ -268,58 +477,87 @@ static void __init pSeries_setup_arch(void)
 
 	fwnmi_init();
 
+	/* By default, only probe PCI (can be overriden by rtas_pci) */
+	pci_add_flags(PCI_PROBE_ONLY);
+
 	/* Find and initialize PCI host bridges */
 	init_pci_config_tokens();
 	find_and_init_phbs();
-	eeh_init();
+	of_reconfig_notifier_register(&pci_dn_reconfig_nb);
 
 	pSeries_nvram_init();
 
-	/* Choose an idle loop */
-	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+	if (firmware_has_feature(FW_FEATURE_LPAR)) {
 		vpa_init(boot_cpuid);
-		if (get_lppaca()->shared_proc) {
-			printk(KERN_DEBUG "Using shared processor idle loop\n");
-			ppc_md.power_save = pseries_shared_idle_sleep;
-		} else {
-			printk(KERN_DEBUG "Using dedicated idle loop\n");
-			ppc_md.power_save = pseries_dedicated_idle_sleep;
-		}
+		ppc_md.power_save = pseries_lpar_idle;
+		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
 	} else {
-		printk(KERN_DEBUG "Using default idle loop\n");
+		/* No special idle routine */
+		ppc_md.enable_pmcs = power4_enable_pmcs;
 	}
 
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
-	else
-		ppc_md.enable_pmcs = power4_enable_pmcs;
+	ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		long rc;
+		if ((rc = pSeries_enable_reloc_on_exc()) != H_SUCCESS) {
+			pr_warn("Unable to enable relocation on exceptions: "
+				"%ld\n", rc);
+		}
+	}
 }
 
 static int __init pSeries_init_panel(void)
 {
 	/* Manually leave the kernel version on the panel. */
+#ifdef __BIG_ENDIAN__
 	ppc_md.progress("Linux ppc64\n", 0);
+#else
+	ppc_md.progress("Linux ppc64le\n", 0);
+#endif
 	ppc_md.progress(init_utsname()->version, 0);
 
 	return 0;
 }
-arch_initcall(pSeries_init_panel);
+machine_arch_initcall(pseries, pSeries_init_panel);
 
-static int pseries_set_dabr(unsigned long dabr)
+static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx)
 {
 	return plpar_hcall_norets(H_SET_DABR, dabr);
 }
 
-static int pseries_set_xdabr(unsigned long dabr)
+static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx)
 {
-	/* We want to catch accesses from kernel and userspace */
-	return plpar_hcall_norets(H_SET_XDABR, dabr,
-			H_DABRX_KERNEL | H_DABRX_USER);
+	/* Have to set at least one bit in the DABRX according to PAPR */
+	if (dabrx == 0 && dabr == 0)
+		dabrx = DABRX_USER;
+	/* PAPR says we can only set kernel and user bits */
+	dabrx &= DABRX_KERNEL | DABRX_USER;
+
+	return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx);
+}
+
+static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx)
+{
+	/* PAPR says we can't set HYP */
+	dawrx &= ~DAWRX_HYP;
+
+	return  plapr_set_watchpoint0(dawr, dawrx);
 }
 
 #define CMO_CHARACTERISTICS_TOKEN 44
 #define CMO_MAXLENGTH 1026
 
+void pSeries_coalesce_init(void)
+{
+	struct hvcall_mpp_x_data mpp_x_data;
+
+	if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data))
+		powerpc_firmware_features |= FW_FEATURE_XCMO;
+	else
+		powerpc_firmware_features &= ~FW_FEATURE_XCMO;
+}
+
 /**
  * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
  * handle that here. (Stolen from parse_system_parameter_string)
@@ -328,7 +566,7 @@ void pSeries_cmo_feature_init(void)
 {
 	char *ptr, *key, *value, *end;
 	int call_status;
-	int page_order = IOMMU_PAGE_SHIFT;
+	int page_order = IOMMU_PAGE_SHIFT_4K;
 
 	pr_debug(" -> fw_cmo_feature_init()\n");
 	spin_lock(&rtas_data_buf_lock);
@@ -389,6 +627,7 @@ void pSeries_cmo_feature_init(void)
 		pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
 		         CMO_SecPSP);
 		powerpc_firmware_features |= FW_FEATURE_CMO;
+		pSeries_coalesce_init();
 	} else
 		pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
 		         CMO_SecPSP);
@@ -403,13 +642,17 @@ static void __init pSeries_init_early(void)
 {
 	pr_debug(" -> pSeries_init_early()\n");
 
+#ifdef CONFIG_HVC_CONSOLE
 	if (firmware_has_feature(FW_FEATURE_LPAR))
-		find_udbg_vterm();
-
-	if (firmware_has_feature(FW_FEATURE_DABR))
-		ppc_md.set_dabr = pseries_set_dabr;
-	else if (firmware_has_feature(FW_FEATURE_XDABR))
+		hvc_vio_init_early();
+#endif
+	if (firmware_has_feature(FW_FEATURE_XDABR))
 		ppc_md.set_dabr = pseries_set_xdabr;
+	else if (firmware_has_feature(FW_FEATURE_DABR))
+		ppc_md.set_dabr = pseries_set_dabr;
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE))
+		ppc_md.set_dawr = pseries_set_dawr;
 
 	pSeries_cmo_feature_init();
 	iommu_init_early_pSeries();
@@ -421,31 +664,45 @@ static void __init pSeries_init_early(void)
  * Called very early, MMU is off, device-tree isn't unflattened
  */
 
-static int __init pSeries_probe_hypertas(unsigned long node,
-					 const char *uname, int depth,
-					 void *data)
+static int __init pseries_probe_fw_features(unsigned long node,
+					    const char *uname, int depth,
+					    void *data)
 {
-	const char *hypertas;
-	unsigned long len;
+	const char *prop;
+	int len;
+	static int hypertas_found;
+	static int vec5_found;
 
-	if (depth != 1 ||
-	    (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0))
+	if (depth != 1)
 		return 0;
 
-	hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len);
-	if (!hypertas)
-		return 1;
+	if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
+		prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
+					   &len);
+		if (prop) {
+			powerpc_firmware_features |= FW_FEATURE_LPAR;
+			fw_hypertas_feature_init(prop, len);
+		}
+
+		hypertas_found = 1;
+	}
 
-	powerpc_firmware_features |= FW_FEATURE_LPAR;
-	fw_feature_init(hypertas, len);
+	if (!strcmp(uname, "chosen")) {
+		prop = of_get_flat_dt_prop(node, "ibm,architecture-vec-5",
+					   &len);
+		if (prop)
+			fw_vec5_feature_init(prop, len);
 
-	return 1;
+		vec5_found = 1;
+	}
+
+	return hypertas_found && vec5_found;
 }
 
 static int __init pSeries_probe(void)
 {
 	unsigned long root = of_get_flat_dt_root();
- 	char *dtype = of_get_flat_dt_prop(root, "device_type", NULL);
+	const char *dtype = of_get_flat_dt_prop(root, "device_type", NULL);
 
  	if (dtype == NULL)
  		return 0;
@@ -462,7 +719,23 @@ static int __init pSeries_probe(void)
 	pr_debug("pSeries detected, looking for LPAR capability...\n");
 
 	/* Now try to figure out if we are running on LPAR */
-	of_scan_flat_dt(pSeries_probe_hypertas, NULL);
+	of_scan_flat_dt(pseries_probe_fw_features, NULL);
+
+#ifdef __LITTLE_ENDIAN__
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		long rc;
+		/*
+		 * Tell the hypervisor that we want our exceptions to
+		 * be taken in little endian mode. If this fails we don't
+		 * want to use BUG() because it will trigger an exception.
+		 */
+		rc = pseries_little_endian_exceptions();
+		if (rc) {
+			ppc_md.progress("H_SET_MODE LE exception fail", 0);
+			panic("Could not enable little endian exceptions");
+		}
+	}
+#endif
 
 	if (firmware_has_feature(FW_FEATURE_LPAR))
 		hpte_init_lpar();
@@ -475,80 +748,6 @@ static int __init pSeries_probe(void)
 	return 1;
 }
 
-
-DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
-
-static void pseries_dedicated_idle_sleep(void)
-{ 
-	unsigned int cpu = smp_processor_id();
-	unsigned long start_snooze;
-	unsigned long in_purr, out_purr;
-
-	/*
-	 * Indicate to the HV that we are idle. Now would be
-	 * a good time to find other work to dispatch.
-	 */
-	get_lppaca()->idle = 1;
-	get_lppaca()->donate_dedicated_cpu = 1;
-	in_purr = mfspr(SPRN_PURR);
-
-	/*
-	 * We come in with interrupts disabled, and need_resched()
-	 * has been checked recently.  If we should poll for a little
-	 * while, do so.
-	 */
-	if (__get_cpu_var(smt_snooze_delay)) {
-		start_snooze = get_tb() +
-			__get_cpu_var(smt_snooze_delay) * tb_ticks_per_usec;
-		local_irq_enable();
-		set_thread_flag(TIF_POLLING_NRFLAG);
-
-		while (get_tb() < start_snooze) {
-			if (need_resched() || cpu_is_offline(cpu))
-				goto out;
-			ppc64_runlatch_off();
-			HMT_low();
-			HMT_very_low();
-		}
-
-		HMT_medium();
-		clear_thread_flag(TIF_POLLING_NRFLAG);
-		smp_mb();
-		local_irq_disable();
-		if (need_resched() || cpu_is_offline(cpu))
-			goto out;
-	}
-
-	cede_processor();
-
-out:
-	HMT_medium();
-	out_purr = mfspr(SPRN_PURR);
-	get_lppaca()->wait_state_cycles += out_purr - in_purr;
-	get_lppaca()->donate_dedicated_cpu = 0;
-	get_lppaca()->idle = 0;
-}
-
-static void pseries_shared_idle_sleep(void)
-{
-	/*
-	 * Indicate to the HV that we are idle. Now would be
-	 * a good time to find other work to dispatch.
-	 */
-	get_lppaca()->idle = 1;
-
-	/*
-	 * Yield the processor to the hypervisor.  We return if
-	 * an external interrupt occurs (which are driven prior
-	 * to returning here) or if a prod occurs from another
-	 * processor. When returning here, external interrupts
-	 * are enabled.
-	 */
-	cede_processor();
-
-	get_lppaca()->idle = 0;
-}
-
 static int pSeries_pci_probe_mode(struct pci_bus *bus)
 {
 	if (firmware_has_feature(FW_FEATURE_LPAR))
@@ -608,4 +807,10 @@ define_machine(pseries) {
 	.progress		= rtas_progress,
 	.system_reset_exception = pSeries_system_reset_exception,
 	.machine_check_exception = pSeries_machine_check_exception,
+#ifdef CONFIG_KEXEC
+	.machine_kexec          = pSeries_machine_kexec,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+	.memory_block_size	= pseries_memory_block_size,
+#endif
 };
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 1a231c389ba..a3555b10c1a 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -14,7 +14,6 @@
 
 
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
@@ -23,11 +22,11 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 
 #include <asm/ptrace.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/irq.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -35,29 +34,54 @@
 #include <asm/prom.h>
 #include <asm/smp.h>
 #include <asm/paca.h>
-#include <asm/time.h>
 #include <asm/machdep.h>
 #include <asm/cputable.h>
 #include <asm/firmware.h>
-#include <asm/system.h>
 #include <asm/rtas.h>
-#include <asm/pSeries_reconfig.h>
 #include <asm/mpic.h>
 #include <asm/vdso_datapage.h>
 #include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/code-patching.h>
 
-#include "plpar_wrappers.h"
 #include "pseries.h"
-#include "xics.h"
+#include "offline_states.h"
 
 
 /*
  * The Primary thread of each non-boot processor was started from the OF client
  * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop.
  */
-static cpumask_t of_spin_map;
+static cpumask_var_t of_spin_mask;
 
-extern void generic_secondary_smp_init(unsigned long);
+/*
+ * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here
+ */
+static void  (*xics_cause_ipi)(int cpu, unsigned long data);
+
+/* Query where a cpu is now.  Return codes #defined in plpar_wrappers.h */
+int smp_query_cpu_stopped(unsigned int pcpu)
+{
+	int cpu_status, status;
+	int qcss_tok = rtas_token("query-cpu-stopped-state");
+
+	if (qcss_tok == RTAS_UNKNOWN_SERVICE) {
+		printk_once(KERN_INFO
+			"Firmware doesn't support query-cpu-stopped-state\n");
+		return QCSS_HARDWARE_ERROR;
+	}
+
+	status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+	if (status != 0) {
+		printk(KERN_ERR
+		       "RTAS query-cpu-stopped-state failed: %i\n", status);
+		return status;
+	}
+
+	return cpu_status;
+}
 
 /**
  * smp_startup_cpu() - start the given cpu
@@ -70,23 +94,32 @@ extern void generic_secondary_smp_init(unsigned long);
  *	0	- failure
  *	1	- success
  */
-static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+static inline int smp_startup_cpu(unsigned int lcpu)
 {
 	int status;
-	unsigned long start_here = __pa((u32)*((unsigned long *)
-					       generic_secondary_smp_init));
+	unsigned long start_here =
+			__pa(ppc_function_entry(generic_secondary_smp_init));
 	unsigned int pcpu;
 	int start_cpu;
 
-	if (cpu_isset(lcpu, of_spin_map))
+	if (cpumask_test_cpu(lcpu, of_spin_mask))
 		/* Already started by OF and sitting in spin loop */
 		return 1;
 
 	pcpu = get_hard_smp_processor_id(lcpu);
 
+	/* Check to see if the CPU out of FW already for kexec */
+	if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){
+		cpumask_set_cpu(lcpu, of_spin_mask);
+		return 1;
+	}
+
 	/* Fixup atomic count: it exited inside IRQ handler. */
 	task_thread_info(paca[lcpu].__current)->preempt_count	= 0;
-
+#ifdef CONFIG_HOTPLUG_CPU
+	if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
+		goto out;
+#endif
 	/* 
 	 * If the RTAS start-cpu token does not exist then presume the
 	 * cpu is already spinning.
@@ -101,54 +134,35 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu)
 		return 0;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+out:
+#endif
 	return 1;
 }
 
-#ifdef CONFIG_XICS
-static void __devinit smp_xics_setup_cpu(int cpu)
+static void smp_xics_setup_cpu(int cpu)
 {
 	if (cpu != boot_cpuid)
 		xics_setup_cpu();
+	if (cpu_has_feature(CPU_FTR_DBELL))
+		doorbell_setup_this_cpu();
 
 	if (firmware_has_feature(FW_FEATURE_SPLPAR))
 		vpa_init(cpu);
 
-	cpu_clear(cpu, of_spin_map);
-
-}
-#endif /* CONFIG_XICS */
-
-static DEFINE_SPINLOCK(timebase_lock);
-static unsigned long timebase = 0;
-
-static void __devinit pSeries_give_timebase(void)
-{
-	spin_lock(&timebase_lock);
-	rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL);
-	timebase = get_tb();
-	spin_unlock(&timebase_lock);
-
-	while (timebase)
-		barrier();
-	rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL);
-}
-
-static void __devinit pSeries_take_timebase(void)
-{
-	while (!timebase)
-		barrier();
-	spin_lock(&timebase_lock);
-	set_tb(timebase >> 32, timebase & 0xffffffff);
-	timebase = 0;
-	spin_unlock(&timebase_lock);
+	cpumask_clear_cpu(cpu, of_spin_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
+	set_default_offline_state(cpu);
+#endif
 }
 
-static void __devinit smp_pSeries_kick_cpu(int nr)
+static int smp_pSeries_kick_cpu(int nr)
 {
 	BUG_ON(nr < 0 || nr >= NR_CPUS);
 
 	if (!smp_startup_cpu(nr))
-		return;
+		return -ENOENT;
 
 	/*
 	 * The processor is currently spinning, waiting for the
@@ -156,37 +170,60 @@ static void __devinit smp_pSeries_kick_cpu(int nr)
 	 * the processor will continue on to secondary_start
 	 */
 	paca[nr].cpu_start = 1;
+#ifdef CONFIG_HOTPLUG_CPU
+	set_preferred_offline_state(nr, CPU_STATE_ONLINE);
+
+	if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
+		long rc;
+		unsigned long hcpuid;
+
+		hcpuid = get_hard_smp_processor_id(nr);
+		rc = plpar_hcall_norets(H_PROD, hcpuid);
+		if (rc != H_SUCCESS)
+			printk(KERN_ERR "Error: Prod to wake up processor %d "
+						"Ret= %ld\n", nr, rc);
+	}
+#endif
+
+	return 0;
 }
 
-static int smp_pSeries_cpu_bootable(unsigned int nr)
+/* Only used on systems that support multiple IPI mechanisms */
+static void pSeries_cause_ipi_mux(int cpu, unsigned long data)
 {
-	/* Special case - we inhibit secondary thread startup
-	 * during boot if the user requests it.
-	 */
-	if (system_state < SYSTEM_RUNNING &&
-	    cpu_has_feature(CPU_FTR_SMT) &&
-	    !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
-		return 0;
+	if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id())))
+		doorbell_cause_ipi(cpu, data);
+	else
+		xics_cause_ipi(cpu, data);
+}
 
-	return 1;
+static __init int pSeries_smp_probe(void)
+{
+	int ret = xics_smp_probe();
+
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		xics_cause_ipi = smp_ops->cause_ipi;
+		smp_ops->cause_ipi = pSeries_cause_ipi_mux;
+	}
+
+	return ret;
 }
-#ifdef CONFIG_MPIC
+
 static struct smp_ops_t pSeries_mpic_smp_ops = {
 	.message_pass	= smp_mpic_message_pass,
 	.probe		= smp_mpic_probe,
 	.kick_cpu	= smp_pSeries_kick_cpu,
 	.setup_cpu	= smp_mpic_setup_cpu,
 };
-#endif
-#ifdef CONFIG_XICS
+
 static struct smp_ops_t pSeries_xics_smp_ops = {
-	.message_pass	= smp_xics_message_pass,
-	.probe		= smp_xics_probe,
+	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
+	.cause_ipi	= NULL,	/* Filled at runtime by pSeries_smp_probe() */
+	.probe		= pSeries_smp_probe,
 	.kick_cpu	= smp_pSeries_kick_cpu,
 	.setup_cpu	= smp_xics_setup_cpu,
-	.cpu_bootable	= smp_pSeries_cpu_bootable,
+	.cpu_bootable	= smp_generic_cpu_bootable,
 };
-#endif
 
 /* This is called very early */
 static void __init smp_init_pseries(void)
@@ -195,35 +232,41 @@ static void __init smp_init_pseries(void)
 
 	pr_debug(" -> smp_init_pSeries()\n");
 
-	/* Mark threads which are still spinning in hold loops. */
-	if (cpu_has_feature(CPU_FTR_SMT)) {
-		for_each_present_cpu(i) { 
-			if (cpu_thread_in_core(i) == 0)
-				cpu_set(i, of_spin_map);
-		}
-	} else {
-		of_spin_map = cpu_present_map;
-	}
+	alloc_bootmem_cpumask_var(&of_spin_mask);
 
-	cpu_clear(boot_cpuid, of_spin_map);
+	/*
+	 * Mark threads which are still spinning in hold loops
+	 *
+	 * We know prom_init will not have started them if RTAS supports
+	 * query-cpu-stopped-state.
+	 */
+	if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) {
+		if (cpu_has_feature(CPU_FTR_SMT)) {
+			for_each_present_cpu(i) {
+				if (cpu_thread_in_core(i) == 0)
+					cpumask_set_cpu(i, of_spin_mask);
+			}
+		} else
+			cpumask_copy(of_spin_mask, cpu_present_mask);
+
+		cpumask_clear_cpu(boot_cpuid, of_spin_mask);
+	}
 
 	/* Non-lpar has additional take/give timebase */
 	if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
-		smp_ops->give_timebase = pSeries_give_timebase;
-		smp_ops->take_timebase = pSeries_take_timebase;
+		smp_ops->give_timebase = rtas_give_timebase;
+		smp_ops->take_timebase = rtas_take_timebase;
 	}
 
 	pr_debug(" <- smp_init_pSeries()\n");
 }
 
-#ifdef CONFIG_MPIC
 void __init smp_init_pseries_mpic(void)
 {
 	smp_ops = &pSeries_mpic_smp_ops;
 
 	smp_init_pseries();
 }
-#endif
 
 void __init smp_init_pseries_xics(void)
 {
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
new file mode 100644
index 00000000000..b87b97849d4
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -0,0 +1,284 @@
+/*
+  * Copyright (C) 2010 Brian King IBM Corporation
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+  */
+
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/suspend.h>
+#include <linux/stat.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/rtas.h>
+#include <asm/topology.h>
+#include "../../kernel/cacheinfo.h"
+
+static u64 stream_id;
+static struct device suspend_dev;
+static DECLARE_COMPLETION(suspend_work);
+static struct rtas_suspend_me_data suspend_data;
+static atomic_t suspending;
+
+/**
+ * pseries_suspend_begin - First phase of hibernation
+ *
+ * Check to ensure we are in a valid state to hibernate
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_begin(suspend_state_t state)
+{
+	long vasi_state, rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	/* Make sure the state is valid */
+	rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
+
+	vasi_state = retbuf[0];
+
+	if (rc) {
+		pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc);
+		return rc;
+	} else if (vasi_state == H_VASI_ENABLED) {
+		return -EAGAIN;
+	} else if (vasi_state != H_VASI_SUSPENDING) {
+		pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
+		       vasi_state);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * pseries_suspend_cpu - Suspend a single CPU
+ *
+ * Makes the H_JOIN call to suspend the CPU
+ *
+ **/
+static int pseries_suspend_cpu(void)
+{
+	if (atomic_read(&suspending))
+		return rtas_suspend_cpu(&suspend_data);
+	return 0;
+}
+
+/**
+ * pseries_suspend_enable_irqs
+ *
+ * Post suspend configuration updates
+ *
+ **/
+static void pseries_suspend_enable_irqs(void)
+{
+	/*
+	 * Update configuration which can be modified based on device tree
+	 * changes during resume.
+	 */
+	cacheinfo_cpu_offline(smp_processor_id());
+	post_mobility_fixup();
+	cacheinfo_cpu_online(smp_processor_id());
+}
+
+/**
+ * pseries_suspend_enter - Final phase of hibernation
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_enter(suspend_state_t state)
+{
+	int rc = rtas_suspend_last_cpu(&suspend_data);
+
+	atomic_set(&suspending, 0);
+	atomic_set(&suspend_data.done, 1);
+	return rc;
+}
+
+/**
+ * pseries_prepare_late - Prepare to suspend all other CPUs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_prepare_late(void)
+{
+	atomic_set(&suspending, 1);
+	atomic_set(&suspend_data.working, 0);
+	atomic_set(&suspend_data.done, 0);
+	atomic_set(&suspend_data.error, 0);
+	suspend_data.complete = &suspend_work;
+	reinit_completion(&suspend_work);
+	return 0;
+}
+
+/**
+ * store_hibernate - Initiate partition hibernation
+ * @dev:		subsys root device
+ * @attr:		device attribute struct
+ * @buf:		buffer
+ * @count:		buffer size
+ *
+ * Write the stream ID received from the HMC to this file
+ * to trigger hibernating the partition
+ *
+ * Return value:
+ * 	number of bytes printed to buffer / other on failure
+ **/
+static ssize_t store_hibernate(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	cpumask_var_t offline_mask;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+		return -ENOMEM;
+
+	stream_id = simple_strtoul(buf, NULL, 16);
+
+	do {
+		rc = pseries_suspend_begin(PM_SUSPEND_MEM);
+		if (rc == -EAGAIN)
+			ssleep(1);
+	} while (rc == -EAGAIN);
+
+	if (!rc) {
+		/* All present CPUs must be online */
+		cpumask_andnot(offline_mask, cpu_present_mask,
+				cpu_online_mask);
+		rc = rtas_online_cpus_mask(offline_mask);
+		if (rc) {
+			pr_err("%s: Could not bring present CPUs online.\n",
+					__func__);
+			goto out;
+		}
+
+		stop_topology_update();
+		rc = pm_suspend(PM_SUSPEND_MEM);
+		start_topology_update();
+
+		/* Take down CPUs not online prior to suspend */
+		if (!rtas_offline_cpus_mask(offline_mask))
+			pr_warn("%s: Could not restore CPUs to offline "
+					"state.\n", __func__);
+	}
+
+	stream_id = 0;
+
+	if (!rc)
+		rc = count;
+out:
+	free_cpumask_var(offline_mask);
+	return rc;
+}
+
+#define USER_DT_UPDATE	0
+#define KERN_DT_UPDATE	1
+
+/**
+ * show_hibernate - Report device tree update responsibilty
+ * @dev:		subsys root device
+ * @attr:		device attribute struct
+ * @buf:		buffer
+ *
+ * Report whether a device tree update is performed by the kernel after a
+ * resume, or if drmgr must coordinate the update from user space.
+ *
+ * Return value:
+ *	0 if drmgr is to initiate update, and 1 otherwise
+ **/
+static ssize_t show_hibernate(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "%d\n", KERN_DT_UPDATE);
+}
+
+static DEVICE_ATTR(hibernate, S_IWUSR | S_IRUGO,
+		   show_hibernate, store_hibernate);
+
+static struct bus_type suspend_subsys = {
+	.name = "power",
+	.dev_name = "power",
+};
+
+static const struct platform_suspend_ops pseries_suspend_ops = {
+	.valid		= suspend_valid_only_mem,
+	.begin		= pseries_suspend_begin,
+	.prepare_late	= pseries_prepare_late,
+	.enter		= pseries_suspend_enter,
+};
+
+/**
+ * pseries_suspend_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_sysfs_register(struct device *dev)
+{
+	int rc;
+
+	if ((rc = subsys_system_register(&suspend_subsys, NULL)))
+		return rc;
+
+	dev->id = 0;
+	dev->bus = &suspend_subsys;
+
+	if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate)))
+		goto subsys_unregister;
+
+	return 0;
+
+subsys_unregister:
+	bus_unregister(&suspend_subsys);
+	return rc;
+}
+
+/**
+ * pseries_suspend_init - initcall for pSeries suspend
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int __init pseries_suspend_init(void)
+{
+	int rc;
+
+	if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR))
+		return 0;
+
+	suspend_data.token = rtas_token("ibm,suspend-me");
+	if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
+		return 0;
+
+	if ((rc = pseries_suspend_sysfs_register(&suspend_dev)))
+		return rc;
+
+	ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
+	ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs;
+	suspend_set_ops(&pseries_suspend_ops);
+	return 0;
+}
+
+__initcall(pseries_suspend_init);
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
deleted file mode 100644
index be3581a8c29..00000000000
--- a/arch/powerpc/platforms/pseries/xics.c
+++ /dev/null
@@ -1,881 +0,0 @@
-/*
- * arch/powerpc/platforms/pseries/xics.c
- *
- * Copyright 2000 IBM Corporation.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <linux/types.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/irq.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/radix-tree.h>
-#include <linux/cpu.h>
-#include <linux/of.h>
-
-#include <asm/firmware.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/smp.h>
-#include <asm/rtas.h>
-#include <asm/hvcall.h>
-#include <asm/machdep.h>
-
-#include "xics.h"
-#include "plpar_wrappers.h"
-
-static struct irq_host *xics_host;
-
-#define XICS_IPI		2
-#define XICS_IRQ_SPURIOUS	0
-
-/* Want a priority other than 0.  Various HW issues require this. */
-#define	DEFAULT_PRIORITY	5
-
-/*
- * Mark IPIs as higher priority so we can take them inside interrupts that
- * arent marked IRQF_DISABLED
- */
-#define IPI_PRIORITY		4
-
-static unsigned int default_server = 0xFF;
-static unsigned int default_distrib_server = 0;
-static unsigned int interrupt_server_size = 8;
-
-/* RTAS service tokens */
-static int ibm_get_xive;
-static int ibm_set_xive;
-static int ibm_int_on;
-static int ibm_int_off;
-
-
-/* Direct hardware low level accessors */
-
-/* The part of the interrupt presentation layer that we care about */
-struct xics_ipl {
-	union {
-		u32 word;
-		u8 bytes[4];
-	} xirr_poll;
-	union {
-		u32 word;
-		u8 bytes[4];
-	} xirr;
-	u32 dummy;
-	union {
-		u32 word;
-		u8 bytes[4];
-	} qirr;
-};
-
-static struct xics_ipl __iomem *xics_per_cpu[NR_CPUS];
-
-static inline unsigned int direct_xirr_info_get(void)
-{
-	int cpu = smp_processor_id();
-
-	return in_be32(&xics_per_cpu[cpu]->xirr.word);
-}
-
-static inline void direct_xirr_info_set(unsigned int value)
-{
-	int cpu = smp_processor_id();
-
-	out_be32(&xics_per_cpu[cpu]->xirr.word, value);
-}
-
-static inline void direct_cppr_info(u8 value)
-{
-	int cpu = smp_processor_id();
-
-	out_8(&xics_per_cpu[cpu]->xirr.bytes[0], value);
-}
-
-static inline void direct_qirr_info(int n_cpu, u8 value)
-{
-	out_8(&xics_per_cpu[n_cpu]->qirr.bytes[0], value);
-}
-
-
-/* LPAR low level accessors */
-
-static inline unsigned int lpar_xirr_info_get(void)
-{
-	unsigned long lpar_rc;
-	unsigned long return_value;
-
-	lpar_rc = plpar_xirr(&return_value);
-	if (lpar_rc != H_SUCCESS)
-		panic(" bad return code xirr - rc = %lx \n", lpar_rc);
-	return (unsigned int)return_value;
-}
-
-static inline void lpar_xirr_info_set(unsigned int value)
-{
-	unsigned long lpar_rc;
-
-	lpar_rc = plpar_eoi(value);
-	if (lpar_rc != H_SUCCESS)
-		panic("bad return code EOI - rc = %ld, value=%x\n", lpar_rc,
-		      value);
-}
-
-static inline void lpar_cppr_info(u8 value)
-{
-	unsigned long lpar_rc;
-
-	lpar_rc = plpar_cppr(value);
-	if (lpar_rc != H_SUCCESS)
-		panic("bad return code cppr - rc = %lx\n", lpar_rc);
-}
-
-static inline void lpar_qirr_info(int n_cpu , u8 value)
-{
-	unsigned long lpar_rc;
-
-	lpar_rc = plpar_ipi(get_hard_smp_processor_id(n_cpu), value);
-	if (lpar_rc != H_SUCCESS)
-		panic("bad return code qirr - rc = %lx\n", lpar_rc);
-}
-
-
-/* Interface to generic irq subsystem */
-
-#ifdef CONFIG_SMP
-static int get_irq_server(unsigned int virq, unsigned int strict_check)
-{
-	int server;
-	/* For the moment only implement delivery to all cpus or one cpu */
-	cpumask_t cpumask;
-	cpumask_t tmp = CPU_MASK_NONE;
-
-	cpumask_copy(&cpumask, irq_desc[virq].affinity);
-	if (!distribute_irqs)
-		return default_server;
-
-	if (!cpus_equal(cpumask, CPU_MASK_ALL)) {
-		cpus_and(tmp, cpu_online_map, cpumask);
-
-		server = first_cpu(tmp);
-
-		if (server < NR_CPUS)
-			return get_hard_smp_processor_id(server);
-
-		if (strict_check)
-			return -1;
-	}
-
-	if (cpus_equal(cpu_online_map, cpu_present_map))
-		return default_distrib_server;
-
-	return default_server;
-}
-#else
-static int get_irq_server(unsigned int virq, unsigned int strict_check)
-{
-	return default_server;
-}
-#endif
-
-static void xics_unmask_irq(unsigned int virq)
-{
-	unsigned int irq;
-	int call_status;
-	int server;
-
-	pr_debug("xics: unmask virq %d\n", virq);
-
-	irq = (unsigned int)irq_map[virq].hwirq;
-	pr_debug(" -> map to hwirq 0x%x\n", irq);
-	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-		return;
-
-	server = get_irq_server(virq, 0);
-
-	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server,
-				DEFAULT_PRIORITY);
-	if (call_status != 0) {
-		printk(KERN_ERR
-			"%s: ibm_set_xive irq %u server %x returned %d\n",
-			__func__, irq, server, call_status);
-		return;
-	}
-
-	/* Now unmask the interrupt (often a no-op) */
-	call_status = rtas_call(ibm_int_on, 1, 1, NULL, irq);
-	if (call_status != 0) {
-		printk(KERN_ERR "%s: ibm_int_on irq=%u returned %d\n",
-			__func__, irq, call_status);
-		return;
-	}
-}
-
-static unsigned int xics_startup(unsigned int virq)
-{
-	/* unmask it */
-	xics_unmask_irq(virq);
-	return 0;
-}
-
-static void xics_mask_real_irq(unsigned int irq)
-{
-	int call_status;
-
-	if (irq == XICS_IPI)
-		return;
-
-	call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq);
-	if (call_status != 0) {
-		printk(KERN_ERR "%s: ibm_int_off irq=%u returned %d\n",
-			__func__, irq, call_status);
-		return;
-	}
-
-	/* Have to set XIVE to 0xff to be able to remove a slot */
-	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq,
-				default_server, 0xff);
-	if (call_status != 0) {
-		printk(KERN_ERR "%s: ibm_set_xive(0xff) irq=%u returned %d\n",
-			__func__, irq, call_status);
-		return;
-	}
-}
-
-static void xics_mask_irq(unsigned int virq)
-{
-	unsigned int irq;
-
-	pr_debug("xics: mask virq %d\n", virq);
-
-	irq = (unsigned int)irq_map[virq].hwirq;
-	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-		return;
-	xics_mask_real_irq(irq);
-}
-
-static void xics_mask_unknown_vec(unsigned int vec)
-{
-	printk(KERN_ERR "Interrupt %u (real) is invalid, disabling it.\n", vec);
-	xics_mask_real_irq(vec);
-}
-
-static inline unsigned int xics_xirr_vector(unsigned int xirr)
-{
-	/*
-	 * The top byte is the old cppr, to be restored on EOI.
-	 * The remaining 24 bits are the vector.
-	 */
-	return xirr & 0x00ffffff;
-}
-
-static unsigned int xics_get_irq_direct(void)
-{
-	unsigned int xirr = direct_xirr_info_get();
-	unsigned int vec = xics_xirr_vector(xirr);
-	unsigned int irq;
-
-	if (vec == XICS_IRQ_SPURIOUS)
-		return NO_IRQ;
-
-	irq = irq_radix_revmap_lookup(xics_host, vec);
-	if (likely(irq != NO_IRQ))
-		return irq;
-
-	/* We don't have a linux mapping, so have rtas mask it. */
-	xics_mask_unknown_vec(vec);
-
-	/* We might learn about it later, so EOI it */
-	direct_xirr_info_set(xirr);
-	return NO_IRQ;
-}
-
-static unsigned int xics_get_irq_lpar(void)
-{
-	unsigned int xirr = lpar_xirr_info_get();
-	unsigned int vec = xics_xirr_vector(xirr);
-	unsigned int irq;
-
-	if (vec == XICS_IRQ_SPURIOUS)
-		return NO_IRQ;
-
-	irq = irq_radix_revmap_lookup(xics_host, vec);
-	if (likely(irq != NO_IRQ))
-		return irq;
-
-	/* We don't have a linux mapping, so have RTAS mask it. */
-	xics_mask_unknown_vec(vec);
-
-	/* We might learn about it later, so EOI it */
-	lpar_xirr_info_set(xirr);
-	return NO_IRQ;
-}
-
-static void xics_eoi_direct(unsigned int virq)
-{
-	unsigned int irq = (unsigned int)irq_map[virq].hwirq;
-
-	iosync();
-	direct_xirr_info_set((0xff << 24) | irq);
-}
-
-static void xics_eoi_lpar(unsigned int virq)
-{
-	unsigned int irq = (unsigned int)irq_map[virq].hwirq;
-
-	iosync();
-	lpar_xirr_info_set((0xff << 24) | irq);
-}
-
-static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
-{
-	unsigned int irq;
-	int status;
-	int xics_status[2];
-	int irq_server;
-
-	irq = (unsigned int)irq_map[virq].hwirq;
-	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-		return -1;
-
-	status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
-
-	if (status) {
-		printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
-			__func__, irq, status);
-		return -1;
-	}
-
-	/*
-	 * For the moment only implement delivery to all cpus or one cpu.
-	 * Get current irq_server for the given irq
-	 */
-	irq_server = get_irq_server(virq, 1);
-	if (irq_server == -1) {
-		char cpulist[128];
-		cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask);
-		printk(KERN_WARNING
-			"%s: No online cpus in the mask %s for irq %d\n",
-			__func__, cpulist, virq);
-		return -1;
-	}
-
-	status = rtas_call(ibm_set_xive, 3, 1, NULL,
-				irq, irq_server, xics_status[1]);
-
-	if (status) {
-		printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
-			__func__, irq, status);
-		return -1;
-	}
-
-	return 0;
-}
-
-static struct irq_chip xics_pic_direct = {
-	.typename = " XICS     ",
-	.startup = xics_startup,
-	.mask = xics_mask_irq,
-	.unmask = xics_unmask_irq,
-	.eoi = xics_eoi_direct,
-	.set_affinity = xics_set_affinity
-};
-
-static struct irq_chip xics_pic_lpar = {
-	.typename = " XICS     ",
-	.startup = xics_startup,
-	.mask = xics_mask_irq,
-	.unmask = xics_unmask_irq,
-	.eoi = xics_eoi_lpar,
-	.set_affinity = xics_set_affinity
-};
-
-
-/* Interface to arch irq controller subsystem layer */
-
-/* Points to the irq_chip we're actually using */
-static struct irq_chip *xics_irq_chip;
-
-static int xics_host_match(struct irq_host *h, struct device_node *node)
-{
-	/* IBM machines have interrupt parents of various funky types for things
-	 * like vdevices, events, etc... The trick we use here is to match
-	 * everything here except the legacy 8259 which is compatible "chrp,iic"
-	 */
-	return !of_device_is_compatible(node, "chrp,iic");
-}
-
-static int xics_host_map(struct irq_host *h, unsigned int virq,
-			 irq_hw_number_t hw)
-{
-	pr_debug("xics: map virq %d, hwirq 0x%lx\n", virq, hw);
-
-	/* Insert the interrupt mapping into the radix tree for fast lookup */
-	irq_radix_revmap_insert(xics_host, virq, hw);
-
-	get_irq_desc(virq)->status |= IRQ_LEVEL;
-	set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq);
-	return 0;
-}
-
-static int xics_host_xlate(struct irq_host *h, struct device_node *ct,
-			   u32 *intspec, unsigned int intsize,
-			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
-
-{
-	/* Current xics implementation translates everything
-	 * to level. It is not technically right for MSIs but this
-	 * is irrelevant at this point. We might get smarter in the future
-	 */
-	*out_hwirq = intspec[0];
-	*out_flags = IRQ_TYPE_LEVEL_LOW;
-
-	return 0;
-}
-
-static struct irq_host_ops xics_host_ops = {
-	.match = xics_host_match,
-	.map = xics_host_map,
-	.xlate = xics_host_xlate,
-};
-
-static void __init xics_init_host(void)
-{
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		xics_irq_chip = &xics_pic_lpar;
-	else
-		xics_irq_chip = &xics_pic_direct;
-
-	xics_host = irq_alloc_host(NULL, IRQ_HOST_MAP_TREE, 0, &xics_host_ops,
-				   XICS_IRQ_SPURIOUS);
-	BUG_ON(xics_host == NULL);
-	irq_set_default_host(xics_host);
-}
-
-
-/* Inter-processor interrupt support */
-
-#ifdef CONFIG_SMP
-/*
- * XICS only has a single IPI, so encode the messages per CPU
- */
-struct xics_ipi_struct {
-        unsigned long value;
-	} ____cacheline_aligned;
-
-static struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned;
-
-static inline void smp_xics_do_message(int cpu, int msg)
-{
-	set_bit(msg, &xics_ipi_message[cpu].value);
-	mb();
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		lpar_qirr_info(cpu, IPI_PRIORITY);
-	else
-		direct_qirr_info(cpu, IPI_PRIORITY);
-}
-
-void smp_xics_message_pass(int target, int msg)
-{
-	unsigned int i;
-
-	if (target < NR_CPUS) {
-		smp_xics_do_message(target, msg);
-	} else {
-		for_each_online_cpu(i) {
-			if (target == MSG_ALL_BUT_SELF
-			    && i == smp_processor_id())
-				continue;
-			smp_xics_do_message(i, msg);
-		}
-	}
-}
-
-static irqreturn_t xics_ipi_dispatch(int cpu)
-{
-	WARN_ON(cpu_is_offline(cpu));
-
-	mb();	/* order mmio clearing qirr */
-	while (xics_ipi_message[cpu].value) {
-		if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION,
-				       &xics_ipi_message[cpu].value)) {
-			smp_message_recv(PPC_MSG_CALL_FUNCTION);
-		}
-		if (test_and_clear_bit(PPC_MSG_RESCHEDULE,
-				       &xics_ipi_message[cpu].value)) {
-			smp_message_recv(PPC_MSG_RESCHEDULE);
-		}
-		if (test_and_clear_bit(PPC_MSG_CALL_FUNC_SINGLE,
-				       &xics_ipi_message[cpu].value)) {
-			smp_message_recv(PPC_MSG_CALL_FUNC_SINGLE);
-		}
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
-		if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK,
-				       &xics_ipi_message[cpu].value)) {
-			smp_message_recv(PPC_MSG_DEBUGGER_BREAK);
-		}
-#endif
-	}
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t xics_ipi_action_direct(int irq, void *dev_id)
-{
-	int cpu = smp_processor_id();
-
-	direct_qirr_info(cpu, 0xff);
-
-	return xics_ipi_dispatch(cpu);
-}
-
-static irqreturn_t xics_ipi_action_lpar(int irq, void *dev_id)
-{
-	int cpu = smp_processor_id();
-
-	lpar_qirr_info(cpu, 0xff);
-
-	return xics_ipi_dispatch(cpu);
-}
-
-static void xics_request_ipi(void)
-{
-	unsigned int ipi;
-	int rc;
-
-	ipi = irq_create_mapping(xics_host, XICS_IPI);
-	BUG_ON(ipi == NO_IRQ);
-
-	/*
-	 * IPIs are marked IRQF_DISABLED as they must run with irqs
-	 * disabled
-	 */
-	set_irq_handler(ipi, handle_percpu_irq);
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		rc = request_irq(ipi, xics_ipi_action_lpar,
-				IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL);
-	else
-		rc = request_irq(ipi, xics_ipi_action_direct,
-				IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL);
-	BUG_ON(rc);
-}
-
-int __init smp_xics_probe(void)
-{
-	xics_request_ipi();
-
-	return cpus_weight(cpu_possible_map);
-}
-
-#endif /* CONFIG_SMP */
-
-
-/* Initialization */
-
-static void xics_update_irq_servers(void)
-{
-	int i, j;
-	struct device_node *np;
-	u32 ilen;
-	const u32 *ireg;
-	u32 hcpuid;
-
-	/* Find the server numbers for the boot cpu. */
-	np = of_get_cpu_node(boot_cpuid, NULL);
-	BUG_ON(!np);
-
-	ireg = of_get_property(np, "ibm,ppc-interrupt-gserver#s", &ilen);
-	if (!ireg) {
-		of_node_put(np);
-		return;
-	}
-
-	i = ilen / sizeof(int);
-	hcpuid = get_hard_smp_processor_id(boot_cpuid);
-
-	/* Global interrupt distribution server is specified in the last
-	 * entry of "ibm,ppc-interrupt-gserver#s" property. Get the last
-	 * entry fom this property for current boot cpu id and use it as
-	 * default distribution server
-	 */
-	for (j = 0; j < i; j += 2) {
-		if (ireg[j] == hcpuid) {
-			default_server = hcpuid;
-			default_distrib_server = ireg[j+1];
-		}
-	}
-
-	of_node_put(np);
-}
-
-static void __init xics_map_one_cpu(int hw_id, unsigned long addr,
-				     unsigned long size)
-{
-	int i;
-
-	/* This may look gross but it's good enough for now, we don't quite
-	 * have a hard -> linux processor id matching.
-	 */
-	for_each_possible_cpu(i) {
-		if (!cpu_present(i))
-			continue;
-		if (hw_id == get_hard_smp_processor_id(i)) {
-			xics_per_cpu[i] = ioremap(addr, size);
-			return;
-		}
-	}
-}
-
-static void __init xics_init_one_node(struct device_node *np,
-				      unsigned int *indx)
-{
-	unsigned int ilen;
-	const u32 *ireg;
-
-	/* This code does the theorically broken assumption that the interrupt
-	 * server numbers are the same as the hard CPU numbers.
-	 * This happens to be the case so far but we are playing with fire...
-	 * should be fixed one of these days. -BenH.
-	 */
-	ireg = of_get_property(np, "ibm,interrupt-server-ranges", NULL);
-
-	/* Do that ever happen ? we'll know soon enough... but even good'old
-	 * f80 does have that property ..
-	 */
-	WARN_ON(ireg == NULL);
-	if (ireg) {
-		/*
-		 * set node starting index for this node
-		 */
-		*indx = *ireg;
-	}
-	ireg = of_get_property(np, "reg", &ilen);
-	if (!ireg)
-		panic("xics_init_IRQ: can't find interrupt reg property");
-
-	while (ilen >= (4 * sizeof(u32))) {
-		unsigned long addr, size;
-
-		/* XXX Use proper OF parsing code here !!! */
-		addr = (unsigned long)*ireg++ << 32;
-		ilen -= sizeof(u32);
-		addr |= *ireg++;
-		ilen -= sizeof(u32);
-		size = (unsigned long)*ireg++ << 32;
-		ilen -= sizeof(u32);
-		size |= *ireg++;
-		ilen -= sizeof(u32);
-		xics_map_one_cpu(*indx, addr, size);
-		(*indx)++;
-	}
-}
-
-void __init xics_init_IRQ(void)
-{
-	struct device_node *np;
-	u32 indx = 0;
-	int found = 0;
-	const u32 *isize;
-
-	ppc64_boot_msg(0x20, "XICS Init");
-
-	ibm_get_xive = rtas_token("ibm,get-xive");
-	ibm_set_xive = rtas_token("ibm,set-xive");
-	ibm_int_on  = rtas_token("ibm,int-on");
-	ibm_int_off = rtas_token("ibm,int-off");
-
-	for_each_node_by_type(np, "PowerPC-External-Interrupt-Presentation") {
-		found = 1;
-		if (firmware_has_feature(FW_FEATURE_LPAR)) {
-			of_node_put(np);
-			break;
-			}
-		xics_init_one_node(np, &indx);
-	}
-	if (found == 0)
-		return;
-
-	/* get the bit size of server numbers */
-	found = 0;
-
-	for_each_compatible_node(np, NULL, "ibm,ppc-xics") {
-		isize = of_get_property(np, "ibm,interrupt-server#-size", NULL);
-
-		if (!isize)
-			continue;
-
-		if (!found) {
-			interrupt_server_size = *isize;
-			found = 1;
-		} else if (*isize != interrupt_server_size) {
-			printk(KERN_WARNING "XICS: "
-			       "mismatched ibm,interrupt-server#-size\n");
-			interrupt_server_size = max(*isize,
-						    interrupt_server_size);
-		}
-	}
-
-	xics_update_irq_servers();
-	xics_init_host();
-
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		ppc_md.get_irq = xics_get_irq_lpar;
-	else
-		ppc_md.get_irq = xics_get_irq_direct;
-
-	xics_setup_cpu();
-
-	ppc64_boot_msg(0x21, "XICS Done");
-}
-
-/* Cpu startup, shutdown, and hotplug */
-
-static void xics_set_cpu_priority(unsigned char cppr)
-{
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		lpar_cppr_info(cppr);
-	else
-		direct_cppr_info(cppr);
-	iosync();
-}
-
-/* Have the calling processor join or leave the specified global queue */
-static void xics_set_cpu_giq(unsigned int gserver, unsigned int join)
-{
-	int index;
-	int status;
-
-	if (!rtas_indicator_present(GLOBAL_INTERRUPT_QUEUE, NULL))
-		return;
-
-	index = (1UL << interrupt_server_size) - 1 - gserver;
-
-	status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, index, join);
-
-	WARN(status < 0, "set-indicator(%d, %d, %u) returned %d\n",
-	     GLOBAL_INTERRUPT_QUEUE, index, join, status);
-}
-
-void xics_setup_cpu(void)
-{
-	xics_set_cpu_priority(0xff);
-
-	xics_set_cpu_giq(default_distrib_server, 1);
-}
-
-void xics_teardown_cpu(void)
-{
-	int cpu = smp_processor_id();
-
-	xics_set_cpu_priority(0);
-
-	/* Clear any pending IPI request */
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		lpar_qirr_info(cpu, 0xff);
-	else
-		direct_qirr_info(cpu, 0xff);
-}
-
-void xics_kexec_teardown_cpu(int secondary)
-{
-	xics_teardown_cpu();
-
-	/*
-	 * we take the ipi irq but and never return so we
-	 * need to EOI the IPI, but want to leave our priority 0
-	 *
-	 * should we check all the other interrupts too?
-	 * should we be flagging idle loop instead?
-	 * or creating some task to be scheduled?
-	 */
-
-	if (firmware_has_feature(FW_FEATURE_LPAR))
-		lpar_xirr_info_set((0x00 << 24) | XICS_IPI);
-	else
-		direct_xirr_info_set((0x00 << 24) | XICS_IPI);
-
-	/*
-	 * Some machines need to have at least one cpu in the GIQ,
-	 * so leave the master cpu in the group.
-	 */
-	if (secondary)
-		xics_set_cpu_giq(default_distrib_server, 0);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Interrupts are disabled. */
-void xics_migrate_irqs_away(void)
-{
-	int cpu = smp_processor_id(), hw_cpu = hard_smp_processor_id();
-	unsigned int irq, virq;
-
-	/* If we used to be the default server, move to the new "boot_cpuid" */
-	if (hw_cpu == default_server)
-		xics_update_irq_servers();
-
-	/* Reject any interrupt that was queued to us... */
-	xics_set_cpu_priority(0);
-
-	/* Remove ourselves from the global interrupt queue */
-	xics_set_cpu_giq(default_distrib_server, 0);
-
-	/* Allow IPIs again... */
-	xics_set_cpu_priority(DEFAULT_PRIORITY);
-
-	for_each_irq(virq) {
-		struct irq_desc *desc;
-		int xics_status[2];
-		int status;
-		unsigned long flags;
-
-		/* We cant set affinity on ISA interrupts */
-		if (virq < NUM_ISA_INTERRUPTS)
-			continue;
-		if (irq_map[virq].host != xics_host)
-			continue;
-		irq = (unsigned int)irq_map[virq].hwirq;
-		/* We need to get IPIs still. */
-		if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-			continue;
-		desc = get_irq_desc(virq);
-
-		/* We only need to migrate enabled IRQS */
-		if (desc == NULL || desc->chip == NULL
-		    || desc->action == NULL
-		    || desc->chip->set_affinity == NULL)
-			continue;
-
-		spin_lock_irqsave(&desc->lock, flags);
-
-		status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
-		if (status) {
-			printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
-					__func__, irq, status);
-			goto unlock;
-		}
-
-		/*
-		 * We only support delivery to all cpus or to one cpu.
-		 * The irq has to be migrated only in the single cpu
-		 * case.
-		 */
-		if (xics_status[0] != hw_cpu)
-			goto unlock;
-
-		printk(KERN_WARNING "IRQ %u affinity broken off cpu %u\n",
-		       virq, cpu);
-
-		/* Reset affinity to all cpus */
-		cpumask_setall(irq_desc[virq].affinity);
-		desc->chip->set_affinity(virq, cpu_all_mask);
-unlock:
-		spin_unlock_irqrestore(&desc->lock, flags);
-	}
-}
-#endif
diff --git a/arch/powerpc/platforms/pseries/xics.h b/arch/powerpc/platforms/pseries/xics.h
deleted file mode 100644
index d1d5a83039a..00000000000
--- a/arch/powerpc/platforms/pseries/xics.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * arch/powerpc/platforms/pseries/xics.h
- *
- * Copyright 2000 IBM Corporation.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#ifndef _POWERPC_KERNEL_XICS_H
-#define _POWERPC_KERNEL_XICS_H
-
-extern void xics_init_IRQ(void);
-extern void xics_setup_cpu(void);
-extern void xics_teardown_cpu(void);
-extern void xics_kexec_teardown_cpu(int secondary);
-extern void xics_migrate_irqs_away(void);
-extern int smp_xics_probe(void);
-extern void smp_xics_message_pass(int target, int msg);
-
-#endif /* _POWERPC_KERNEL_XICS_H */