Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

Conflicts: drivers/ata/libata-scsi.c include/linux/libata.h Futher merge of Linus's head and compilation fixups. Signed-Off-By: David Howells <dhowells@redhat.com>
author: David Howells <dhowells@redhat.com> 2006-12-05 17:01:28 +0000
committer: David Howells <dhowells@warthog.cambridge.redhat.com> 2006-12-05 17:01:28 +0000
commit: 9db73724453a9350e1c22dbe732d427e2939a5c9 (patch)
tree: 15e3ead6413ae97398a54292acc199bee0864d42 /arch/powerpc/platforms/cell
parent: 4c1ac1b49122b805adfa4efc620592f68dccf5db (diff)
parent: e62438630ca37539c8cc1553710bbfaa3cf960a7 (diff)
28 files changed, 3916 insertions, 1077 deletions
diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index 3e430b489bb..06a85b70433 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -20,4 +20,18 @@ config CBE_RAS
 	bool "RAS features for bare metal Cell BE"
 	default y
 
+config CBE_THERM
+	tristate "CBE thermal support"
+	default m
+	depends on CBE_RAS
+
+config CBE_CPUFREQ
+	tristate "CBE frequency scaling"
+	depends on CBE_RAS && CPU_FREQ
+	default m
+	help
+	  This adds the cpufreq driver for Cell BE processors.
+	  For details, take a look at <file:Documentation/cpu-freq/>.
+	  If you don't have such processor, say N
+
 endmenu
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index c89cdd67383..f90e8337796 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -1,7 +1,11 @@
 obj-$(CONFIG_PPC_CELL_NATIVE)		+= interrupt.o iommu.o setup.o \
-					   cbe_regs.o spider-pic.o pervasive.o
+					   cbe_regs.o spider-pic.o \
+					   pervasive.o pmu.o io-workarounds.o
 obj-$(CONFIG_CBE_RAS)			+= ras.o
 
+obj-$(CONFIG_CBE_THERM)			+= cbe_thermal.o
+obj-$(CONFIG_CBE_CPUFREQ)		+= cbe_cpufreq.o
+
 ifeq ($(CONFIG_SMP),y)
 obj-$(CONFIG_PPC_CELL_NATIVE)		+= smp.o
 endif
@@ -11,5 +15,6 @@ spufs-modular-$(CONFIG_SPU_FS)		+= spu_syscalls.o
 spu-priv1-$(CONFIG_PPC_CELL_NATIVE)	+= spu_priv1_mmio.o
 
 obj-$(CONFIG_SPU_BASE)			+= spu_callbacks.o spu_base.o \
+					   spu_coredump.o \
 					   $(spufs-modular-m) \
 					   $(spu-priv1-y) spufs/
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
new file mode 100644
index 00000000000..a3850fd1e94
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -0,0 +1,248 @@
+/*
+ * cpufreq driver for the cell processor
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/timer.h>
+
+#include <asm/hw_irq.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/prom.h>
+#include <asm/time.h>
+
+#include "cbe_regs.h"
+
+static DEFINE_MUTEX(cbe_switch_mutex);
+
+
+/* the CBE supports an 8 step frequency scaling */
+static struct cpufreq_frequency_table cbe_freqs[] = {
+	{1,	0},
+	{2,	0},
+	{3,	0},
+	{4,	0},
+	{5,	0},
+	{6,	0},
+	{8,	0},
+	{10,	0},
+	{0,	CPUFREQ_TABLE_END},
+};
+
+/* to write to MIC register */
+static u64 MIC_Slow_Fast_Timer_table[] = {
+	[0 ... 7] = 0x007fc00000000000ull,
+};
+
+/* more values for the MIC */
+static u64 MIC_Slow_Next_Timer_table[] = {
+	0x0000240000000000ull,
+	0x0000268000000000ull,
+	0x000029C000000000ull,
+	0x00002D0000000000ull,
+	0x0000300000000000ull,
+	0x0000334000000000ull,
+	0x000039C000000000ull,
+	0x00003FC000000000ull,
+};
+
+/*
+ * hardware specific functions
+ */
+
+static int get_pmode(int cpu)
+{
+	int ret;
+	struct cbe_pmd_regs __iomem *pmd_regs;
+
+	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
+	ret = in_be64(&pmd_regs->pmsr) & 0x07;
+
+	return ret;
+}
+
+static int set_pmode(int cpu, unsigned int pmode)
+{
+	struct cbe_pmd_regs __iomem *pmd_regs;
+	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
+	u64 flags;
+	u64 value;
+
+	local_irq_save(flags);
+
+	mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu);
+	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
+
+	pr_debug("pm register is mapped at %p\n", &pmd_regs->pmcr);
+	pr_debug("mic register is mapped at %p\n", &mic_tm_regs->slow_fast_timer_0);
+
+	out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]);
+
+	out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]);
+
+	value = in_be64(&pmd_regs->pmcr);
+	/* set bits to zero */
+	value &= 0xFFFFFFFFFFFFFFF8ull;
+	/* set bits to next pmode */
+	value |= pmode;
+
+	out_be64(&pmd_regs->pmcr, value);
+
+	/* wait until new pmode appears in status register */
+	value = in_be64(&pmd_regs->pmsr) & 0x07;
+	while(value != pmode) {
+		cpu_relax();
+		value = in_be64(&pmd_regs->pmsr) & 0x07;
+	}
+
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+/*
+ * cpufreq functions
+ */
+
+static int cbe_cpufreq_cpu_init (struct cpufreq_policy *policy)
+{
+	u32 *max_freq;
+	int i, cur_pmode;
+	struct device_node *cpu;
+
+	cpu = of_get_cpu_node(policy->cpu, NULL);
+
+	if(!cpu)
+		return -ENODEV;
+
+	pr_debug("init cpufreq on CPU %d\n", policy->cpu);
+
+	max_freq = (u32*) get_property(cpu, "clock-frequency", NULL);
+
+	if(!max_freq)
+		return -EINVAL;
+
+	// we need the freq in kHz
+	*max_freq /= 1000;
+
+	pr_debug("max clock-frequency is at %u kHz\n", *max_freq);
+	pr_debug("initializing frequency table\n");
+
+	// initialize frequency table
+	for (i=0; cbe_freqs[i].frequency!=CPUFREQ_TABLE_END; i++) {
+		cbe_freqs[i].frequency = *max_freq / cbe_freqs[i].index;
+		pr_debug("%d: %d\n", i, cbe_freqs[i].frequency);
+	}
+
+	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
+	/* if DEBUG is enabled set_pmode() measures the correct latency of a transition */
+	policy->cpuinfo.transition_latency = 25000;
+
+	cur_pmode = get_pmode(policy->cpu);
+	pr_debug("current pmode is at %d\n",cur_pmode);
+
+	policy->cur = cbe_freqs[cur_pmode].frequency;
+
+#ifdef CONFIG_SMP
+	policy->cpus = cpu_sibling_map[policy->cpu];
+#endif
+
+	cpufreq_frequency_table_get_attr (cbe_freqs, policy->cpu);
+
+	/* this ensures that policy->cpuinfo_min and policy->cpuinfo_max are set correctly */
+	return cpufreq_frequency_table_cpuinfo (policy, cbe_freqs);
+}
+
+static int cbe_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	return 0;
+}
+
+static int cbe_cpufreq_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, cbe_freqs);
+}
+
+
+static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq,
+			    unsigned int relation)
+{
+	int rc;
+	struct cpufreq_freqs freqs;
+	int cbe_pmode_new;
+
+	cpufreq_frequency_table_target(policy,
+				       cbe_freqs,
+				       target_freq,
+				       relation,
+				       &cbe_pmode_new);
+
+	freqs.old = policy->cur;
+	freqs.new = cbe_freqs[cbe_pmode_new].frequency;
+	freqs.cpu = policy->cpu;
+
+	mutex_lock (&cbe_switch_mutex);
+	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+
+	pr_debug("setting frequency for cpu %d to %d kHz, 1/%d of max frequency\n",
+		 policy->cpu,
+		 cbe_freqs[cbe_pmode_new].frequency,
+		 cbe_freqs[cbe_pmode_new].index);
+
+	rc = set_pmode(policy->cpu, cbe_pmode_new);
+	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	mutex_unlock(&cbe_switch_mutex);
+
+	return rc;
+}
+
+static struct cpufreq_driver cbe_cpufreq_driver = {
+	.verify		= cbe_cpufreq_verify,
+	.target		= cbe_cpufreq_target,
+	.init		= cbe_cpufreq_cpu_init,
+	.exit		= cbe_cpufreq_cpu_exit,
+	.name		= "cbe-cpufreq",
+	.owner		= THIS_MODULE,
+	.flags		= CPUFREQ_CONST_LOOPS,
+};
+
+/*
+ * module init and destoy
+ */
+
+static int __init cbe_cpufreq_init(void)
+{
+	return cpufreq_register_driver(&cbe_cpufreq_driver);
+}
+
+static void __exit cbe_cpufreq_exit(void)
+{
+	cpufreq_unregister_driver(&cbe_cpufreq_driver);
+}
+
+module_init(cbe_cpufreq_init);
+module_exit(cbe_cpufreq_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");
diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c
index 2f194ba2989..9a0ee62691d 100644
--- a/arch/powerpc/platforms/cell/cbe_regs.c
+++ b/arch/powerpc/platforms/cell/cbe_regs.c
@@ -8,6 +8,7 @@
 
 #include <linux/percpu.h>
 #include <linux/types.h>
+#include <linux/module.h>
 
 #include <asm/io.h>
 #include <asm/pgtable.h>
@@ -16,8 +17,6 @@
 
 #include "cbe_regs.h"
 
-#define MAX_CBE		2
-
 /*
  * Current implementation uses "cpu" nodes. We build our own mapping
  * array of cpu numbers to cpu nodes locally for now to allow interrupt
@@ -30,6 +29,8 @@ static struct cbe_regs_map
 	struct device_node *cpu_node;
 	struct cbe_pmd_regs __iomem *pmd_regs;
 	struct cbe_iic_regs __iomem *iic_regs;
+	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
+	struct cbe_pmd_shadow_regs pmd_shadow_regs;
 } cbe_regs_maps[MAX_CBE];
 static int cbe_regs_map_count;
 
@@ -42,6 +43,19 @@ static struct cbe_thread_map
 static struct cbe_regs_map *cbe_find_map(struct device_node *np)
 {
 	int i;
+	struct device_node *tmp_np;
+
+	if (strcasecmp(np->type, "spe") == 0) {
+		if (np->data == NULL) {
+			/* walk up path until cpu node was found */
+			tmp_np = np->parent;
+			while (tmp_np != NULL && strcasecmp(tmp_np->type, "cpu") != 0)
+				tmp_np = tmp_np->parent;
+
+			np->data = cbe_find_map(tmp_np);
+		}
+		return np->data;
+	}
 
 	for (i = 0; i < cbe_regs_map_count; i++)
 		if (cbe_regs_maps[i].cpu_node == np)
@@ -56,6 +70,7 @@ struct cbe_pmd_regs __iomem *cbe_get_pmd_regs(struct device_node *np)
 		return NULL;
 	return map->pmd_regs;
 }
+EXPORT_SYMBOL_GPL(cbe_get_pmd_regs);
 
 struct cbe_pmd_regs __iomem *cbe_get_cpu_pmd_regs(int cpu)
 {
@@ -64,7 +79,23 @@ struct cbe_pmd_regs __iomem *cbe_get_cpu_pmd_regs(int cpu)
 		return NULL;
 	return map->pmd_regs;
 }
+EXPORT_SYMBOL_GPL(cbe_get_cpu_pmd_regs);
 
+struct cbe_pmd_shadow_regs *cbe_get_pmd_shadow_regs(struct device_node *np)
+{
+	struct cbe_regs_map *map = cbe_find_map(np);
+	if (map == NULL)
+		return NULL;
+	return &map->pmd_shadow_regs;
+}
+
+struct cbe_pmd_shadow_regs *cbe_get_cpu_pmd_shadow_regs(int cpu)
+{
+	struct cbe_regs_map *map = cbe_thread_map[cpu].regs;
+	if (map == NULL)
+		return NULL;
+	return &map->pmd_shadow_regs;
+}
 
 struct cbe_iic_regs __iomem *cbe_get_iic_regs(struct device_node *np)
 {
@@ -73,6 +104,7 @@ struct cbe_iic_regs __iomem *cbe_get_iic_regs(struct device_node *np)
 		return NULL;
 	return map->iic_regs;
 }
+
 struct cbe_iic_regs __iomem *cbe_get_cpu_iic_regs(int cpu)
 {
 	struct cbe_regs_map *map = cbe_thread_map[cpu].regs;
@@ -81,6 +113,36 @@ struct cbe_iic_regs __iomem *cbe_get_cpu_iic_regs(int cpu)
 	return map->iic_regs;
 }
 
+struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np)
+{
+	struct cbe_regs_map *map = cbe_find_map(np);
+	if (map == NULL)
+		return NULL;
+	return map->mic_tm_regs;
+}
+
+struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu)
+{
+	struct cbe_regs_map *map = cbe_thread_map[cpu].regs;
+	if (map == NULL)
+		return NULL;
+	return map->mic_tm_regs;
+}
+EXPORT_SYMBOL_GPL(cbe_get_cpu_mic_tm_regs);
+
+/* FIXME
+ * This is little more than a stub at the moment.  It should be
+ * fleshed out so that it works for both SMT and non-SMT, no
+ * matter if the passed cpu is odd or even.
+ * For SMT enabled, returns 0 for even-numbered cpu; otherwise 1.
+ * For SMT disabled, returns 0 for all cpus.
+ */
+u32 cbe_get_hw_thread_id(int cpu)
+{
+	return (cpu & 1);
+}
+EXPORT_SYMBOL_GPL(cbe_get_hw_thread_id);
+
 void __init cbe_regs_init(void)
 {
 	int i;
@@ -119,6 +181,11 @@ void __init cbe_regs_init(void)
 		prop = get_property(cpu, "iic", NULL);
 		if (prop != NULL)
 			map->iic_regs = ioremap(prop->address, prop->len);
+
+		prop = (struct address_prop *)get_property(cpu, "mic-tm",
+							   NULL);
+		if (prop != NULL)
+			map->mic_tm_regs = ioremap(prop->address, prop->len);
 	}
 }
 
diff --git a/arch/powerpc/platforms/cell/cbe_regs.h b/arch/powerpc/platforms/cell/cbe_regs.h
index e76e4a6af5b..440a7ecc66e 100644
--- a/arch/powerpc/platforms/cell/cbe_regs.h
+++ b/arch/powerpc/platforms/cell/cbe_regs.h
@@ -4,12 +4,19 @@
  * This file is intended to hold the various register definitions for CBE
  * on-chip system devices (memory controller, IO controller, etc...)
  *
+ * (C) Copyright IBM Corporation 2001,2006
+ *
+ * Authors: Maximino Aguilar (maguilar@us.ibm.com)
+ *          David J. Erb (djerb@us.ibm.com)
+ *
  * (c) 2006 Benjamin Herrenschmidt <benh@kernel.crashing.org>, IBM Corp.
  */
 
 #ifndef CBE_REGS_H
 #define CBE_REGS_H
 
+#include <asm/cell-pmu.h>
+
 /*
  *
  * Some HID register definitions
@@ -22,6 +29,7 @@
 #define HID0_CBE_THERM_INT_EN	0x0000000400000000ul
 #define HID0_CBE_SYSERR_INT_EN	0x0000000200000000ul
 
+#define MAX_CBE		2
 
 /*
  *
@@ -29,51 +37,124 @@
  *
  */
 
+union spe_reg {
+	u64 val;
+	u8 spe[8];
+};
+
+union ppe_spe_reg {
+	u64 val;
+	struct {
+		u32 ppe;
+		u32 spe;
+	};
+};
+
+
 struct cbe_pmd_regs {
-	u8 pad_0x0000_0x0800[0x0800 - 0x0000];			/* 0x0000 */
+	/* Debug Bus Control */
+	u64	pad_0x0000;					/* 0x0000 */
+
+	u64	group_control;					/* 0x0008 */
+
+	u8	pad_0x0010_0x00a8 [0x00a8 - 0x0010];		/* 0x0010 */
+
+	u64	debug_bus_control;				/* 0x00a8 */
+
+	u8	pad_0x00b0_0x0100 [0x0100 - 0x00b0];		/* 0x00b0 */
+
+	u64	trace_aux_data;					/* 0x0100 */
+	u64	trace_buffer_0_63;				/* 0x0108 */
+	u64	trace_buffer_64_127;				/* 0x0110 */
+	u64	trace_address;					/* 0x0118 */
+	u64	ext_tr_timer;					/* 0x0120 */
+
+	u8	pad_0x0128_0x0400 [0x0400 - 0x0128];		/* 0x0128 */
+
+	/* Performance Monitor */
+	u64	pm_status;					/* 0x0400 */
+	u64	pm_control;					/* 0x0408 */
+	u64	pm_interval;					/* 0x0410 */
+	u64	pm_ctr[4];					/* 0x0418 */
+	u64	pm_start_stop;					/* 0x0438 */
+	u64	pm07_control[8];				/* 0x0440 */
+
+	u8	pad_0x0480_0x0800 [0x0800 - 0x0480];		/* 0x0480 */
 
 	/* Thermal Sensor Registers */
-	u64  ts_ctsr1;						/* 0x0800 */
-	u64  ts_ctsr2;						/* 0x0808 */
-	u64  ts_mtsr1;						/* 0x0810 */
-	u64  ts_mtsr2;						/* 0x0818 */
-	u64  ts_itr1;						/* 0x0820 */
-	u64  ts_itr2;						/* 0x0828 */
-	u64  ts_gitr;						/* 0x0830 */
-	u64  ts_isr;						/* 0x0838 */
-	u64  ts_imr;						/* 0x0840 */
-	u64  tm_cr1;						/* 0x0848 */
-	u64  tm_cr2;						/* 0x0850 */
-	u64  tm_simr;						/* 0x0858 */
-	u64  tm_tpr;						/* 0x0860 */
-	u64  tm_str1;						/* 0x0868 */
-	u64  tm_str2;						/* 0x0870 */
-	u64  tm_tsr;						/* 0x0878 */
+	union	spe_reg	ts_ctsr1;				/* 0x0800 */
+	u64	ts_ctsr2;					/* 0x0808 */
+	union	spe_reg	ts_mtsr1;				/* 0x0810 */
+	u64	ts_mtsr2;					/* 0x0818 */
+	union	spe_reg	ts_itr1;				/* 0x0820 */
+	u64	ts_itr2;					/* 0x0828 */
+	u64	ts_gitr;					/* 0x0830 */
+	u64	ts_isr;						/* 0x0838 */
+	u64	ts_imr;						/* 0x0840 */
+	union	spe_reg	tm_cr1;					/* 0x0848 */
+	u64	tm_cr2;						/* 0x0850 */
+	u64	tm_simr;					/* 0x0858 */
+	union	ppe_spe_reg tm_tpr;				/* 0x0860 */
+	union	spe_reg	tm_str1;				/* 0x0868 */
+	u64	tm_str2;					/* 0x0870 */
+	union	ppe_spe_reg tm_tsr;				/* 0x0878 */
 
 	/* Power Management */
-	u64  pm_control;					/* 0x0880 */
-#define CBE_PMD_PAUSE_ZERO_CONTROL		0x10000
-	u64  pm_status;						/* 0x0888 */
+	u64	pmcr;						/* 0x0880 */
+#define CBE_PMD_PAUSE_ZERO_CONTROL	0x10000
+	u64	pmsr;						/* 0x0888 */
 
 	/* Time Base Register */
-	u64  tbr;						/* 0x0890 */
+	u64	tbr;						/* 0x0890 */
 
-	u8   pad_0x0898_0x0c00 [0x0c00 - 0x0898];		/* 0x0898 */
+	u8	pad_0x0898_0x0c00 [0x0c00 - 0x0898];		/* 0x0898 */
 
 	/* Fault Isolation Registers */
-	u64  checkstop_fir;					/* 0x0c00 */
-	u64  recoverable_fir;
-	u64  spec_att_mchk_fir;
-	u64  fir_mode_reg;
-	u64  fir_enable_mask;
+	u64	checkstop_fir;					/* 0x0c00 */
+	u64	recoverable_fir;				/* 0x0c08 */
+	u64	spec_att_mchk_fir;				/* 0x0c10 */
+	u64	fir_mode_reg;					/* 0x0c18 */
+	u64	fir_enable_mask;				/* 0x0c20 */
 
-	u8   pad_0x0c28_0x1000 [0x1000 - 0x0c28];		/* 0x0c28 */
+	u8	pad_0x0c28_0x1000 [0x1000 - 0x0c28];		/* 0x0c28 */
 };
 
 extern struct cbe_pmd_regs __iomem *cbe_get_pmd_regs(struct device_node *np);
 extern struct cbe_pmd_regs __iomem *cbe_get_cpu_pmd_regs(int cpu);
 
 /*
+ * PMU shadow registers
+ *
+ * Many of the registers in the performance monitoring unit are write-only,
+ * so we need to save a copy of what we write to those registers.
+ *
+ * The actual data counters are read/write. However, writing to the counters
+ * only takes effect if the PMU is enabled. Otherwise the value is stored in
+ * a hardware latch until the next time the PMU is enabled. So we save a copy
+ * of the counter values if we need to read them back while the PMU is
+ * disabled. The counter_value_in_latch field is a bitmap indicating which
+ * counters currently have a value waiting to be written.
+ */
+
+struct cbe_pmd_shadow_regs {
+	u32 group_control;
+	u32 debug_bus_control;
+	u32 trace_address;
+	u32 ext_tr_timer;
+	u32 pm_status;
+	u32 pm_control;
+	u32 pm_interval;
+	u32 pm_start_stop;
+	u32 pm07_control[NR_CTRS];
+
+	u32 pm_ctr[NR_PHYS_CTRS];
+	u32 counter_value_in_latch;
+};
+
+extern struct cbe_pmd_shadow_regs *cbe_get_pmd_shadow_regs(struct device_node *np);
+extern struct cbe_pmd_shadow_regs *cbe_get_cpu_pmd_shadow_regs(int cpu);
+
+/*
  *
  * IIC unit register definitions
  *
@@ -102,18 +183,28 @@ struct cbe_iic_regs {
 
 	/* IIC interrupt registers */
 	struct	cbe_iic_thread_regs thread[2];			/* 0x0400 */
-	u64     iic_ir;						/* 0x0440 */
-	u64     iic_is;						/* 0x0448 */
+
+	u64	iic_ir;						/* 0x0440 */
+#define CBE_IIC_IR_PRIO(x)      (((x) & 0xf) << 12)
+#define CBE_IIC_IR_DEST_NODE(x) (((x) & 0xf) << 4)
+#define CBE_IIC_IR_DEST_UNIT(x) ((x) & 0xf)
+#define CBE_IIC_IR_IOC_0        0x0
+#define CBE_IIC_IR_IOC_1S       0xb
+#define CBE_IIC_IR_PT_0         0xe
+#define CBE_IIC_IR_PT_1         0xf
+
+	u64	iic_is;						/* 0x0448 */
+#define CBE_IIC_IS_PMI		0x2
 
 	u8	pad_0x0450_0x0500[0x0500 - 0x0450];		/* 0x0450 */
 
 	/* IOC FIR */
 	u64	ioc_fir_reset;					/* 0x0500 */
-	u64	ioc_fir_set;
-	u64	ioc_checkstop_enable;
-	u64	ioc_fir_error_mask;
-	u64	ioc_syserr_enable;
-	u64	ioc_fir;
+	u64	ioc_fir_set;					/* 0x0508 */
+	u64	ioc_checkstop_enable;				/* 0x0510 */
+	u64	ioc_fir_error_mask;				/* 0x0518 */
+	u64	ioc_syserr_enable;				/* 0x0520 */
+	u64	ioc_fir;					/* 0x0528 */
 
 	u8	pad_0x0530_0x1000[0x1000 - 0x0530];		/* 0x0530 */
 };
@@ -122,6 +213,48 @@ extern struct cbe_iic_regs __iomem *cbe_get_iic_regs(struct device_node *np);
 extern struct cbe_iic_regs __iomem *cbe_get_cpu_iic_regs(int cpu);
 
 
+struct cbe_mic_tm_regs {
+	u8	pad_0x0000_0x0040[0x0040 - 0x0000];		/* 0x0000 */
+
+	u64	mic_ctl_cnfg2;					/* 0x0040 */
+#define CBE_MIC_ENABLE_AUX_TRC		0x8000000000000000LL
+#define CBE_MIC_DISABLE_PWR_SAV_2	0x0200000000000000LL
+#define CBE_MIC_DISABLE_AUX_TRC_WRAP	0x0100000000000000LL
+#define CBE_MIC_ENABLE_AUX_TRC_INT	0x0080000000000000LL
+
+	u64	pad_0x0048;					/* 0x0048 */
+
+	u64	mic_aux_trc_base;				/* 0x0050 */
+	u64	mic_aux_trc_max_addr;				/* 0x0058 */
+	u64	mic_aux_trc_cur_addr;				/* 0x0060 */
+	u64	mic_aux_trc_grf_addr;				/* 0x0068 */
+	u64	mic_aux_trc_grf_data;				/* 0x0070 */
+
+	u64	pad_0x0078;					/* 0x0078 */
+
+	u64	mic_ctl_cnfg_0;					/* 0x0080 */
+#define CBE_MIC_DISABLE_PWR_SAV_0	0x8000000000000000LL
+
+	u64	pad_0x0088;					/* 0x0088 */
+
+	u64	slow_fast_timer_0;				/* 0x0090 */
+	u64	slow_next_timer_0;				/* 0x0098 */
+
+	u8	pad_0x00a0_0x01c0[0x01c0 - 0x0a0];		/* 0x00a0 */
+
+	u64	mic_ctl_cnfg_1;					/* 0x01c0 */
+#define CBE_MIC_DISABLE_PWR_SAV_1	0x8000000000000000LL
+	u64	pad_0x01c8;					/* 0x01c8 */
+
+	u64	slow_fast_timer_1;				/* 0x01d0 */
+	u64	slow_next_timer_1;				/* 0x01d8 */
+
+	u8	pad_0x01e0_0x1000[0x1000 - 0x01e0];		/* 0x01e0 */
+};
+
+extern struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np);
+extern struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu);
+
 /* Init this module early */
 extern void cbe_regs_init(void);
 
diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
new file mode 100644
index 00000000000..616a0a3fd0e
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -0,0 +1,226 @@
+/*
+ * thermal support for the cell processor
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <asm/spu.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+
+#include "cbe_regs.h"
+#include "spu_priv1_mmio.h"
+
+static struct cbe_pmd_regs __iomem *get_pmd_regs(struct sys_device *sysdev)
+{
+	struct spu *spu;
+
+	spu = container_of(sysdev, struct spu, sysdev);
+
+	return cbe_get_pmd_regs(spu_devnode(spu));
+}
+
+/* returns the value for a given spu in a given register */
+static u8 spu_read_register_value(struct sys_device *sysdev, union spe_reg __iomem *reg)
+{
+	unsigned int *id;
+	union spe_reg value;
+	struct spu *spu;
+
+	/* getting the id from the reg attribute will not work on future device-tree layouts
+	 * in future we should store the id to the spu struct and use it here */
+	spu = container_of(sysdev, struct spu, sysdev);
+	id = (unsigned int *)get_property(spu_devnode(spu), "reg", NULL);
+	value.val = in_be64(&reg->val);
+
+	return value.spe[*id];
+}
+
+static ssize_t spu_show_temp(struct sys_device *sysdev, char *buf)
+{
+	int value;
+	struct cbe_pmd_regs __iomem *pmd_regs;
+
+	pmd_regs = get_pmd_regs(sysdev);
+
+	value = spu_read_register_value(sysdev, &pmd_regs->ts_ctsr1);
+	/* clear all other bits */
+	value &= 0x3F;
+	/* temp is stored in steps of 2 degrees */
+	value *= 2;
+	/* base temp is 65 degrees */
+	value += 65;
+
+	return sprintf(buf, "%d\n", (int) value);
+}
+
+static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
+{
+	struct cbe_pmd_regs __iomem *pmd_regs;
+	u64 value;
+
+	pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
+	value = in_be64(&pmd_regs->ts_ctsr2);
+
+	/* access the corresponding byte */
+	value >>= pos;
+	/* clear all other bits */
+	value &= 0x3F;
+	/* temp is stored in steps of 2 degrees */
+	value *= 2;
+	/* base temp is 65 degrees */
+	value += 65;
+
+	return sprintf(buf, "%d\n", (int) value);
+}
+
+
+/* shows the temperature of the DTS on the PPE,
+ * located near the linear thermal sensor */
+static ssize_t ppe_show_temp0(struct sys_device *sysdev, char *buf)
+{
+	return ppe_show_temp(sysdev, buf, 32);
+}
+
+/* shows the temperature of the second DTS on the PPE */
+static ssize_t ppe_show_temp1(struct sys_device *sysdev, char *buf)
+{
+	return ppe_show_temp(sysdev, buf, 0);
+}
+
+static struct sysdev_attribute attr_spu_temperature = {
+	.attr = {.name = "temperature", .mode = 0400 },
+	.show = spu_show_temp,
+};
+
+static struct attribute *spu_attributes[] = {
+	&attr_spu_temperature.attr,
+};
+
+static struct attribute_group spu_attribute_group = {
+	.name	= "thermal",
+	.attrs	= spu_attributes,
+};
+
+static struct sysdev_attribute attr_ppe_temperature0 = {
+	.attr = {.name = "temperature0", .mode = 0400 },
+	.show = ppe_show_temp0,
+};
+
+static struct sysdev_attribute attr_ppe_temperature1 = {
+	.attr = {.name = "temperature1", .mode = 0400 },
+	.show = ppe_show_temp1,
+};
+
+static struct attribute *ppe_attributes[] = {
+	&attr_ppe_temperature0.attr,
+	&attr_ppe_temperature1.attr,
+};
+
+static struct attribute_group ppe_attribute_group = {
+	.name	= "thermal",
+	.attrs	= ppe_attributes,
+};
+
+/*
+ * initialize throttling with default values
+ */
+static void __init init_default_values(void)
+{
+	int cpu;
+	struct cbe_pmd_regs __iomem *pmd_regs;
+	struct sys_device *sysdev;
+	union ppe_spe_reg tpr;
+	union spe_reg str1;
+	u64 str2;
+	union spe_reg cr1;
+	u64 cr2;
+
+	/* TPR defaults */
+	/* ppe
+	 *	1F - no full stop
+	 *	08 - dynamic throttling starts if over 80 degrees
+	 *	03 - dynamic throttling ceases if below 70 degrees */
+	tpr.ppe = 0x1F0803;
+	/* spe
+	 *	10 - full stopped when over 96 degrees
+	 *	08 - dynamic throttling starts if over 80 degrees
+	 *	03 - dynamic throttling ceases if below 70 degrees
+	 */
+	tpr.spe = 0x100803;
+
+	/* STR defaults */
+	/* str1
+	 *	10 - stop 16 of 32 cycles
+	 */
+	str1.val = 0x1010101010101010ull;
+	/* str2
+	 *	10 - stop 16 of 32 cycles
+	 */
+	str2 = 0x10;
+
+	/* CR defaults */
+	/* cr1
+	 *	4 - normal operation
+	 */
+	cr1.val = 0x0404040404040404ull;
+	/* cr2
+	 *	4 - normal operation
+	 */
+	cr2 = 0x04;
+
+	for_each_possible_cpu (cpu) {
+		pr_debug("processing cpu %d\n", cpu);
+		sysdev = get_cpu_sysdev(cpu);
+		pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
+
+		out_be64(&pmd_regs->tm_str2, str2);
+		out_be64(&pmd_regs->tm_str1.val, str1.val);
+		out_be64(&pmd_regs->tm_tpr.val, tpr.val);
+		out_be64(&pmd_regs->tm_cr1.val, cr1.val);
+		out_be64(&pmd_regs->tm_cr2, cr2);
+	}
+}
+
+
+static int __init thermal_init(void)
+{
+	init_default_values();
+
+	spu_add_sysdev_attr_group(&spu_attribute_group);
+	cpu_add_sysdev_attr_group(&ppe_attribute_group);
+
+	return 0;
+}
+module_init(thermal_init);
+
+static void __exit thermal_exit(void)
+{
+	spu_remove_sysdev_attr_group(&spu_attribute_group);
+	cpu_remove_sysdev_attr_group(&ppe_attribute_group);
+}
+module_exit(thermal_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");
+
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index a914c12b406..6666d037eb4 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -396,3 +396,19 @@ void __init iic_init_IRQ(void)
 	/* Enable on current CPU */
 	iic_setup_cpu();
 }
+
+void iic_set_interrupt_routing(int cpu, int thread, int priority)
+{
+	struct cbe_iic_regs __iomem *iic_regs = cbe_get_cpu_iic_regs(cpu);
+	u64 iic_ir = 0;
+	int node = cpu >> 1;
+
+	/* Set which node and thread will handle the next interrupt */
+	iic_ir |= CBE_IIC_IR_PRIO(priority) |
+		  CBE_IIC_IR_DEST_NODE(node);
+	if (thread == 0)
+		iic_ir |= CBE_IIC_IR_DEST_UNIT(CBE_IIC_IR_PT_0);
+	else
+		iic_ir |= CBE_IIC_IR_DEST_UNIT(CBE_IIC_IR_PT_1);
+	out_be64(&iic_regs->iic_ir, iic_ir);
+}
diff --git a/arch/powerpc/platforms/cell/interrupt.h b/arch/powerpc/platforms/cell/interrupt.h
index 9ba1d3c17b4..942dc39d604 100644
--- a/arch/powerpc/platforms/cell/interrupt.h
+++ b/arch/powerpc/platforms/cell/interrupt.h
@@ -83,5 +83,7 @@ extern u8 iic_get_target_id(int cpu);
 
 extern void spider_init_IRQ(void);
 
+extern void iic_set_interrupt_routing(int cpu, int thread, int priority);
+
 #endif
 #endif /* ASM_CELL_PIC_H */
diff --git a/arch/powerpc/platforms/cell/io-workarounds.c b/arch/powerpc/platforms/cell/io-workarounds.c
new file mode 100644
index 00000000000..580d4259591
--- /dev/null
+++ b/arch/powerpc/platforms/cell/io-workarounds.c
@@ -0,0 +1,346 @@
+/*
+ *  Copyright (C) 2006 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *		       IBM, Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+#define SPIDER_PCI_REG_BASE		0xd000
+#define SPIDER_PCI_VCI_CNTL_STAT	0x0110
+#define SPIDER_PCI_DUMMY_READ		0x0810
+#define SPIDER_PCI_DUMMY_READ_BASE	0x0814
+
+/* Undefine that to re-enable bogus prefetch
+ *
+ * Without that workaround, the chip will do bogus prefetch past
+ * page boundary from system memory. This setting will disable that,
+ * though the documentation is unclear as to the consequences of doing
+ * so, either purely performances, or possible misbehaviour... It's not
+ * clear wether the chip can handle unaligned accesses at all without
+ * prefetching enabled.
+ *
+ * For now, things appear to be behaving properly with that prefetching
+ * disabled and IDE, possibly because IDE isn't doing any unaligned
+ * access.
+ */
+#define SPIDER_DISABLE_PREFETCH
+
+#define MAX_SPIDERS	2
+
+static struct spider_pci_bus {
+	void __iomem	*regs;
+	unsigned long	mmio_start;
+	unsigned long	mmio_end;
+	unsigned long	pio_vstart;
+	unsigned long	pio_vend;
+} spider_pci_busses[MAX_SPIDERS];
+static int spider_pci_count;
+
+static struct spider_pci_bus *spider_pci_find(unsigned long vaddr,
+					      unsigned long paddr)
+{
+	int i;
+
+	for (i = 0; i < spider_pci_count; i++) {
+		struct spider_pci_bus *bus = &spider_pci_busses[i];
+		if (paddr && paddr >= bus->mmio_start && paddr < bus->mmio_end)
+			return bus;
+		if (vaddr && vaddr >= bus->pio_vstart && vaddr < bus->pio_vend)
+			return bus;
+	}
+	return NULL;
+}
+
+static void spider_io_flush(const volatile void __iomem *addr)
+{
+	struct spider_pci_bus *bus;
+	int token;
+
+	/* Get platform token (set by ioremap) from address */
+	token = PCI_GET_ADDR_TOKEN(addr);
+
+	/* Fast path if we have a non-0 token, it indicates which bus we
+	 * are on.
+	 *
+	 * If the token is 0, that means either the the ioremap was done
+	 * before we initialized this layer, or it's a PIO operation. We
+	 * fallback to a low path in this case. Hopefully, internal devices
+	 * which are ioremap'ed early should use in_XX/out_XX functions
+	 * instead of the PCI ones and thus not suffer from the slowdown.
+	 *
+	 * Also note that currently, the workaround will not work for areas
+	 * that are not mapped with PTEs (bolted in the hash table). This
+	 * is the case for ioremaps done very early at boot (before
+	 * mem_init_done) and includes the mapping of the ISA IO space.
+	 *
+	 * Fortunately, none of the affected devices is expected to do DMA
+	 * and thus there should be no problem in practice.
+	 *
+	 * In order to improve performances, we only do the PTE search for
+	 * addresses falling in the PHB IO space area. That means it will
+	 * not work for hotplug'ed PHBs but those don't exist with Spider.
+	 */
+	if (token && token <= spider_pci_count)
+		bus = &spider_pci_busses[token - 1];
+	else {
+		unsigned long vaddr, paddr;
+		pte_t *ptep;
+
+		/* Fixup physical address */
+		vaddr = (unsigned long)PCI_FIX_ADDR(addr);
+
+		/* Check if it's in allowed range for  PIO */
+		if (vaddr < PHBS_IO_BASE || vaddr >= IMALLOC_BASE)
+			return;
+
+		/* Try to find a PTE. If not, clear the paddr, we'll do
+		 * a vaddr only lookup (PIO only)
+		 */
+		ptep = find_linux_pte(init_mm.pgd, vaddr);
+		if (ptep == NULL)
+			paddr = 0;
+		else
+			paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+
+		bus = spider_pci_find(vaddr, paddr);
+		if (bus == NULL)
+			return;
+	}
+
+	/* Now do the workaround
+	 */
+	(void)in_be32(bus->regs + SPIDER_PCI_DUMMY_READ);
+}
+
+static u8 spider_readb(const volatile void __iomem *addr)
+{
+	u8 val = __do_readb(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static u16 spider_readw(const volatile void __iomem *addr)
+{
+	u16 val = __do_readw(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static u32 spider_readl(const volatile void __iomem *addr)
+{
+	u32 val = __do_readl(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static u64 spider_readq(const volatile void __iomem *addr)
+{
+	u64 val = __do_readq(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static u16 spider_readw_be(const volatile void __iomem *addr)
+{
+	u16 val = __do_readw_be(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static u32 spider_readl_be(const volatile void __iomem *addr)
+{
+	u32 val = __do_readl_be(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static u64 spider_readq_be(const volatile void __iomem *addr)
+{
+	u64 val = __do_readq_be(addr);
+	spider_io_flush(addr);
+	return val;
+}
+
+static void spider_readsb(const volatile void __iomem *addr, void *buf,
+			  unsigned long count)
+{
+	__do_readsb(addr, buf, count);
+	spider_io_flush(addr);
+}
+
+static void spider_readsw(const volatile void __iomem *addr, void *buf,
+			  unsigned long count)
+{
+	__do_readsw(addr, buf, count);
+	spider_io_flush(addr);
+}
+
+static void spider_readsl(const volatile void __iomem *addr, void *buf,
+			  unsigned long count)
+{
+	__do_readsl(addr, buf, count);
+	spider_io_flush(addr);
+}
+
+static void spider_memcpy_fromio(void *dest, const volatile void __iomem *src,
+				 unsigned long n)
+{
+	__do_memcpy_fromio(dest, src, n);
+	spider_io_flush(src);
+}
+
+
+static void __iomem * spider_ioremap(unsigned long addr, unsigned long size,
+				     unsigned long flags)
+{
+	struct spider_pci_bus *bus;
+	void __iomem *res = __ioremap(addr, size, flags);
+	int busno;
+
+	pr_debug("spider_ioremap(0x%lx, 0x%lx, 0x%lx) -> 0x%p\n",
+		 addr, size, flags, res);
+
+	bus = spider_pci_find(0, addr);
+	if (bus != NULL) {
+		busno = bus - spider_pci_busses;
+		pr_debug(" found bus %d, setting token\n", busno);
+		PCI_SET_ADDR_TOKEN(res, busno + 1);
+	}
+	pr_debug(" result=0x%p\n", res);
+
+	return res;
+}
+
+static void __init spider_pci_setup_chip(struct spider_pci_bus *bus)
+{
+#ifdef SPIDER_DISABLE_PREFETCH
+	u32 val = in_be32(bus->regs + SPIDER_PCI_VCI_CNTL_STAT);
+	pr_debug(" PVCI_Control_Status was 0x%08x\n", val);
+	out_be32(bus->regs + SPIDER_PCI_VCI_CNTL_STAT, val | 0x8);
+#endif
+
+	/* Configure the dummy address for the workaround */
+	out_be32(bus->regs + SPIDER_PCI_DUMMY_READ_BASE, 0x80000000);
+}
+
+static void __init spider_pci_add_one(struct pci_controller *phb)
+{
+	struct spider_pci_bus *bus = &spider_pci_busses[spider_pci_count];
+	struct device_node *np = phb->arch_data;
+	struct resource rsrc;
+	void __iomem *regs;
+
+	if (spider_pci_count >= MAX_SPIDERS) {
+		printk(KERN_ERR "Too many spider bridges, workarounds"
+		       " disabled for %s\n", np->full_name);
+		return;
+	}
+
+	/* Get the registers for the beast */
+	if (of_address_to_resource(np, 0, &rsrc)) {
+		printk(KERN_ERR "Failed to get registers for spider %s"
+		       " workarounds disabled\n", np->full_name);
+		return;
+	}
+
+	/* Mask out some useless bits in there to get to the base of the
+	 * spider chip
+	 */
+	rsrc.start &= ~0xfffffffful;
+
+	/* Map them */
+	regs = ioremap(rsrc.start + SPIDER_PCI_REG_BASE, 0x1000);
+	if (regs == NULL) {
+		printk(KERN_ERR "Failed to map registers for spider %s"
+		       " workarounds disabled\n", np->full_name);
+		return;
+	}
+
+	spider_pci_count++;
+
+	/* We assume spiders only have one MMIO resource */
+	bus->mmio_start = phb->mem_resources[0].start;
+	bus->mmio_end = phb->mem_resources[0].end + 1;
+
+	bus->pio_vstart = (unsigned long)phb->io_base_virt;
+	bus->pio_vend = bus->pio_vstart + phb->pci_io_size;
+
+	bus->regs = regs;
+
+	printk(KERN_INFO "PCI: Spider MMIO workaround for %s\n",np->full_name);
+
+	pr_debug(" mmio (P) = 0x%016lx..0x%016lx\n",
+		 bus->mmio_start, bus->mmio_end);
+	pr_debug("  pio (V) = 0x%016lx..0x%016lx\n",
+		 bus->pio_vstart, bus->pio_vend);
+	pr_debug(" regs (P) = 0x%016lx (V) = 0x%p\n",
+		 rsrc.start + SPIDER_PCI_REG_BASE, bus->regs);
+
+	spider_pci_setup_chip(bus);
+}
+
+static struct ppc_pci_io __initdata spider_pci_io = {
+	.readb = spider_readb,
+	.readw = spider_readw,
+	.readl = spider_readl,
+	.readq = spider_readq,
+	.readw_be = spider_readw_be,
+	.readl_be = spider_readl_be,
+	.readq_be = spider_readq_be,
+	.readsb = spider_readsb,
+	.readsw = spider_readsw,
+	.readsl = spider_readsl,
+	.memcpy_fromio = spider_memcpy_fromio,
+};
+
+static int __init spider_pci_workaround_init(void)
+{
+	struct pci_controller *phb;
+
+	if (!machine_is(cell))
+		return 0;
+
+	/* Find spider bridges. We assume they have been all probed
+	 * in setup_arch(). If that was to change, we would need to
+	 * update this code to cope with dynamically added busses
+	 */
+	list_for_each_entry(phb, &hose_list, list_node) {
+		struct device_node *np = phb->arch_data;
+		const char *model = get_property(np, "model", NULL);
+
+		/* If no model property or name isn't exactly "pci", skip */
+		if (model == NULL || strcmp(np->name, "pci"))
+			continue;
+		/* If model is not "Spider", skip */
+		if (strcmp(model, "Spider"))
+			continue;
+		spider_pci_add_one(phb);
+	}
+
+	/* No Spider PCI found, exit */
+	if (spider_pci_count == 0)
+		return 0;
+
+	/* Setup IO callbacks. We only setup MMIO reads. PIO reads will
+	 * fallback to MMIO reads (though without a token, thus slower)
+	 */
+	ppc_pci_io = spider_pci_io;
+
+	/* Setup ioremap callback */
+	ppc_md.ioremap = spider_ioremap;
+
+	return 0;
+}
+arch_initcall(spider_pci_workaround_init);
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index aca4c3db0dd..b43466ba809 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -1,514 +1,747 @@
 /*
  * IOMMU implementation for Cell Broadband Processor Architecture
- * We just establish a linear mapping at boot by setting all the
- * IOPT cache entries in the CPU.
- * The mapping functions should be identical to pci_direct_iommu, 
- * except for the handling of the high order bit that is required
- * by the Spider bridge. These should be split into a separate
- * file at the point where we get a different bridge chip.
  *
- * Copyright (C) 2005 IBM Deutschland Entwicklung GmbH,
- *			 Arnd Bergmann <arndb@de.ibm.com>
+ * (C) Copyright IBM Corporation 2006
  *
- * Based on linear mapping
- * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ * Author: Jeremy Kerr <jk@ozlabs.org>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #undef DEBUG
 
 #include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/string.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/kernel.h>
-#include <linux/compiler.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
 
-#include <asm/sections.h>
-#include <asm/iommu.h>
-#include <asm/io.h>
 #include <asm/prom.h>
-#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
 #include <asm/machdep.h>
-#include <asm/pmac_feature.h>
-#include <asm/abs_addr.h>
-#include <asm/system.h>
-#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
 #include <asm/udbg.h>
+#include <asm/of_platform.h>
+#include <asm/lmb.h>
 
-#include "iommu.h"
+#include "cbe_regs.h"
+#include "interrupt.h"
 
-static inline unsigned long 
-get_iopt_entry(unsigned long real_address, unsigned long ioid,
-			 unsigned long prot)
-{
-	return (prot & IOPT_PROT_MASK)
-	     | (IOPT_COHERENT)
-	     | (IOPT_ORDER_VC)
-	     | (real_address & IOPT_RPN_MASK)
-	     | (ioid & IOPT_IOID_MASK);
-}
+/* Define CELL_IOMMU_REAL_UNMAP to actually unmap non-used pages
+ * instead of leaving them mapped to some dummy page. This can be
+ * enabled once the appropriate workarounds for spider bugs have
+ * been enabled
+ */
+#define CELL_IOMMU_REAL_UNMAP
 
-typedef struct {
-	unsigned long val;
-} ioste;
+/* Define CELL_IOMMU_STRICT_PROTECTION to enforce protection of
+ * IO PTEs based on the transfer direction. That can be enabled
+ * once spider-net has been fixed to pass the correct direction
+ * to the DMA mapping functions
+ */
+#define CELL_IOMMU_STRICT_PROTECTION
+
+
+#define NR_IOMMUS			2
+
+/* IOC mmap registers */
+#define IOC_Reg_Size			0x2000
+
+#define IOC_IOPT_CacheInvd		0x908
+#define IOC_IOPT_CacheInvd_NE_Mask	0xffe0000000000000ul
+#define IOC_IOPT_CacheInvd_IOPTE_Mask	0x000003fffffffff8ul
+#define IOC_IOPT_CacheInvd_Busy		0x0000000000000001ul
+
+#define IOC_IOST_Origin			0x918
+#define IOC_IOST_Origin_E		0x8000000000000000ul
+#define IOC_IOST_Origin_HW		0x0000000000000800ul
+#define IOC_IOST_Origin_HL		0x0000000000000400ul
+
+#define IOC_IO_ExcpStat			0x920
+#define IOC_IO_ExcpStat_V		0x8000000000000000ul
+#define IOC_IO_ExcpStat_SPF_Mask	0x6000000000000000ul
+#define IOC_IO_ExcpStat_SPF_S		0x6000000000000000ul
+#define IOC_IO_ExcpStat_SPF_P		0x4000000000000000ul
+#define IOC_IO_ExcpStat_ADDR_Mask	0x00000007fffff000ul
+#define IOC_IO_ExcpStat_RW_Mask		0x0000000000000800ul
+#define IOC_IO_ExcpStat_IOID_Mask	0x00000000000007fful
+
+#define IOC_IO_ExcpMask			0x928
+#define IOC_IO_ExcpMask_SFE		0x4000000000000000ul
+#define IOC_IO_ExcpMask_PFE		0x2000000000000000ul
+
+#define IOC_IOCmd_Offset		0x1000
+
+#define IOC_IOCmd_Cfg			0xc00
+#define IOC_IOCmd_Cfg_TE		0x0000800000000000ul
+
+
+/* Segment table entries */
+#define IOSTE_V			0x8000000000000000ul /* valid */
+#define IOSTE_H			0x4000000000000000ul /* cache hint */
+#define IOSTE_PT_Base_RPN_Mask  0x3ffffffffffff000ul /* base RPN of IOPT */
+#define IOSTE_NPPT_Mask		0x0000000000000fe0ul /* no. pages in IOPT */
+#define IOSTE_PS_Mask		0x0000000000000007ul /* page size */
+#define IOSTE_PS_4K		0x0000000000000001ul /*   - 4kB  */
+#define IOSTE_PS_64K		0x0000000000000003ul /*   - 64kB */
+#define IOSTE_PS_1M		0x0000000000000005ul /*   - 1MB  */
+#define IOSTE_PS_16M		0x0000000000000007ul /*   - 16MB */
+
+/* Page table entries */
+#define IOPTE_PP_W		0x8000000000000000ul /* protection: write */
+#define IOPTE_PP_R		0x4000000000000000ul /* protection: read */
+#define IOPTE_M			0x2000000000000000ul /* coherency required */
+#define IOPTE_SO_R		0x1000000000000000ul /* ordering: writes */
+#define IOPTE_SO_RW             0x1800000000000000ul /* ordering: r & w */
+#define IOPTE_RPN_Mask		0x07fffffffffff000ul /* RPN */
+#define IOPTE_H			0x0000000000000800ul /* cache hint */
+#define IOPTE_IOID_Mask		0x00000000000007fful /* ioid */
+
+
+/* IOMMU sizing */
+#define IO_SEGMENT_SHIFT	28
+#define IO_PAGENO_BITS		(IO_SEGMENT_SHIFT - IOMMU_PAGE_SHIFT)
+
+/* The high bit needs to be set on every DMA address */
+#define SPIDER_DMA_OFFSET	0x80000000ul
+
+struct iommu_window {
+	struct list_head list;
+	struct cbe_iommu *iommu;
+	unsigned long offset;
+	unsigned long size;
+	unsigned long pte_offset;
+	unsigned int ioid;
+	struct iommu_table table;
+};
 
-static inline ioste
-mk_ioste(unsigned long val)
-{
-	ioste ioste = { .val = val, };
-	return ioste;
-}
+#define NAMESIZE 8
+struct cbe_iommu {
+	int nid;
+	char name[NAMESIZE];
+	void __iomem *xlate_regs;
+	void __iomem *cmd_regs;
+	unsigned long *stab;
+	unsigned long *ptab;
+	void *pad_page;
+	struct list_head windows;
+};
+
+/* Static array of iommus, one per node
+ *   each contains a list of windows, keyed from dma_window property
+ *   - on bus setup, look for a matching window, or create one
+ *   - on dev setup, assign iommu_table ptr
+ */
+static struct cbe_iommu iommus[NR_IOMMUS];
+static int cbe_nr_iommus;
 
-static inline ioste
-get_iost_entry(unsigned long iopt_base, unsigned long io_address, unsigned page_size)
+static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte,
+		long n_ptes)
 {
-	unsigned long ps;
-	unsigned long iostep;
-	unsigned long nnpt;
-	unsigned long shift;
-
-	switch (page_size) {
-	case 0x1000000:
-		ps = IOST_PS_16M;
-		nnpt = 0;  /* one page per segment */
-		shift = 5; /* segment has 16 iopt entries */
-		break;
-
-	case 0x100000:
-		ps = IOST_PS_1M;
-		nnpt = 0;  /* one page per segment */
-		shift = 1; /* segment has 256 iopt entries */
-		break;
-
-	case 0x10000:
-		ps = IOST_PS_64K;
-		nnpt = 0x07; /* 8 pages per io page table */
-		shift = 0;   /* all entries are used */
-		break;
-
-	case 0x1000:
-		ps = IOST_PS_4K;
-		nnpt = 0x7f; /* 128 pages per io page table */
-		shift = 0;   /* all entries are used */
-		break;
-
-	default: /* not a known compile time constant */
-		{
-			/* BUILD_BUG_ON() is not usable here */
-			extern void __get_iost_entry_bad_page_size(void);
-			__get_iost_entry_bad_page_size();
-		}
-		break;
-	}
+	unsigned long *reg, val;
+	long n;
 
-	iostep = iopt_base +
-			 /* need 8 bytes per iopte */
-			(((io_address / page_size * 8)
-			 /* align io page tables on 4k page boundaries */
-				 << shift) 
-			 /* nnpt+1 pages go into each iopt */
-				 & ~(nnpt << 12));
-
-	nnpt++; /* this seems to work, but the documentation is not clear
-		   about wether we put nnpt or nnpt-1 into the ioste bits.
-		   In theory, this can't work for 4k pages. */
-	return mk_ioste(IOST_VALID_MASK
-			| (iostep & IOST_PT_BASE_MASK)
-			| ((nnpt << 5) & IOST_NNPT_MASK)
-			| (ps & IOST_PS_MASK));
-}
+	reg = iommu->xlate_regs + IOC_IOPT_CacheInvd;
 
-/* compute the address of an io pte */
-static inline unsigned long
-get_ioptep(ioste iost_entry, unsigned long io_address)
-{
-	unsigned long iopt_base;
-	unsigned long page_size;
-	unsigned long page_number;
-	unsigned long iopt_offset;
-
-	iopt_base = iost_entry.val & IOST_PT_BASE_MASK;
-	page_size = iost_entry.val & IOST_PS_MASK;
-
-	/* decode page size to compute page number */
-	page_number = (io_address & 0x0fffffff) >> (10 + 2 * page_size);
-	/* page number is an offset into the io page table */
-	iopt_offset = (page_number << 3) & 0x7fff8ul;
-	return iopt_base + iopt_offset;
-}
+	while (n_ptes > 0) {
+		/* we can invalidate up to 1 << 11 PTEs at once */
+		n = min(n_ptes, 1l << 11);
+		val = (((n /*- 1*/) << 53) & IOC_IOPT_CacheInvd_NE_Mask)
+			| (__pa(pte) & IOC_IOPT_CacheInvd_IOPTE_Mask)
+		        | IOC_IOPT_CacheInvd_Busy;
 
-/* compute the tag field of the iopt cache entry */
-static inline unsigned long
-get_ioc_tag(ioste iost_entry, unsigned long io_address)
-{
-	unsigned long iopte = get_ioptep(iost_entry, io_address);
+		out_be64(reg, val);
+		while (in_be64(reg) & IOC_IOPT_CacheInvd_Busy)
+			;
 
-	return IOPT_VALID_MASK
-	     | ((iopte & 0x00000000000000ff8ul) >> 3)
-	     | ((iopte & 0x0000003fffffc0000ul) >> 9);
+		n_ptes -= n;
+		pte += n;
+	}
 }
 
-/* compute the hashed 6 bit index for the 4-way associative pte cache */
-static inline unsigned long
-get_ioc_hash(ioste iost_entry, unsigned long io_address)
+static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction)
 {
-	unsigned long iopte = get_ioptep(iost_entry, io_address);
-
-	return ((iopte & 0x000000000000001f8ul) >> 3)
-	     ^ ((iopte & 0x00000000000020000ul) >> 17)
-	     ^ ((iopte & 0x00000000000010000ul) >> 15)
-	     ^ ((iopte & 0x00000000000008000ul) >> 13)
-	     ^ ((iopte & 0x00000000000004000ul) >> 11)
-	     ^ ((iopte & 0x00000000000002000ul) >> 9)
-	     ^ ((iopte & 0x00000000000001000ul) >> 7);
+	int i;
+	unsigned long *io_pte, base_pte;
+	struct iommu_window *window =
+		container_of(tbl, struct iommu_window, table);
+
+	/* implementing proper protection causes problems with the spidernet
+	 * driver - check mapping directions later, but allow read & write by
+	 * default for now.*/
+#ifdef CELL_IOMMU_STRICT_PROTECTION
+	/* to avoid referencing a global, we use a trick here to setup the
+	 * protection bit. "prot" is setup to be 3 fields of 4 bits apprended
+	 * together for each of the 3 supported direction values. It is then
+	 * shifted left so that the fields matching the desired direction
+	 * lands on the appropriate bits, and other bits are masked out.
+	 */
+	const unsigned long prot = 0xc48;
+	base_pte =
+		((prot << (52 + 4 * direction)) & (IOPTE_PP_W | IOPTE_PP_R))
+		| IOPTE_M | IOPTE_SO_RW | (window->ioid & IOPTE_IOID_Mask);
+#else
+	base_pte = IOPTE_PP_W | IOPTE_PP_R | IOPTE_M | IOPTE_SO_RW |
+		(window->ioid & IOPTE_IOID_Mask);
+#endif
+
+	io_pte = (unsigned long *)tbl->it_base + (index - window->pte_offset);
+
+	for (i = 0; i < npages; i++, uaddr += IOMMU_PAGE_SIZE)
+		io_pte[i] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask);
+
+	mb();
+
+	invalidate_tce_cache(window->iommu, io_pte, npages);
+
+	pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
+		 index, npages, direction, base_pte);
 }
 
-/* same as above, but pretend that we have a simpler 1-way associative
-   pte cache with an 8 bit index */
-static inline unsigned long
-get_ioc_hash_1way(ioste iost_entry, unsigned long io_address)
+static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
 {
-	unsigned long iopte = get_ioptep(iost_entry, io_address);
-
-	return ((iopte & 0x000000000000001f8ul) >> 3)
-	     ^ ((iopte & 0x00000000000020000ul) >> 17)
-	     ^ ((iopte & 0x00000000000010000ul) >> 15)
-	     ^ ((iopte & 0x00000000000008000ul) >> 13)
-	     ^ ((iopte & 0x00000000000004000ul) >> 11)
-	     ^ ((iopte & 0x00000000000002000ul) >> 9)
-	     ^ ((iopte & 0x00000000000001000ul) >> 7)
-	     ^ ((iopte & 0x0000000000000c000ul) >> 8);
-}
 
-static inline ioste
-get_iost_cache(void __iomem *base, unsigned long index)
-{
-	unsigned long __iomem *p = (base + IOC_ST_CACHE_DIR);
-	return mk_ioste(in_be64(&p[index]));
-}
+	int i;
+	unsigned long *io_pte, pte;
+	struct iommu_window *window =
+		container_of(tbl, struct iommu_window, table);
 
-static inline void
-set_iost_cache(void __iomem *base, unsigned long index, ioste ste)
-{
-	unsigned long __iomem *p = (base + IOC_ST_CACHE_DIR);
-	pr_debug("ioste %02lx was %016lx, store %016lx", index,
-			get_iost_cache(base, index).val, ste.val);
-	out_be64(&p[index], ste.val);
-	pr_debug(" now %016lx\n", get_iost_cache(base, index).val);
-}
+	pr_debug("tce_free_cell(index=%lx,n=%lx)\n", index, npages);
 
-static inline unsigned long
-get_iopt_cache(void __iomem *base, unsigned long index, unsigned long *tag)
-{
-	unsigned long __iomem *tags = (void *)(base + IOC_PT_CACHE_DIR);
-	unsigned long __iomem *p = (void *)(base + IOC_PT_CACHE_REG);	
+#ifdef CELL_IOMMU_REAL_UNMAP
+	pte = 0;
+#else
+	/* spider bridge does PCI reads after freeing - insert a mapping
+	 * to a scratch page instead of an invalid entry */
+	pte = IOPTE_PP_R | IOPTE_M | IOPTE_SO_RW | __pa(window->iommu->pad_page)
+		| (window->ioid & IOPTE_IOID_Mask);
+#endif
 
-	*tag = tags[index];
-	rmb();
-	return *p;
-}
+	io_pte = (unsigned long *)tbl->it_base + (index - window->pte_offset);
 
-static inline void
-set_iopt_cache(void __iomem *base, unsigned long index,
-		 unsigned long tag, unsigned long val)
-{
-	unsigned long __iomem *tags = base + IOC_PT_CACHE_DIR;
-	unsigned long __iomem *p = base + IOC_PT_CACHE_REG;
+	for (i = 0; i < npages; i++)
+		io_pte[i] = pte;
+
+	mb();
 
-	out_be64(p, val);
-	out_be64(&tags[index], tag);
+	invalidate_tce_cache(window->iommu, io_pte, npages);
 }
 
-static inline void
-set_iost_origin(void __iomem *base)
+static irqreturn_t ioc_interrupt(int irq, void *data)
 {
-	unsigned long __iomem *p = base + IOC_ST_ORIGIN;
-	unsigned long origin = IOSTO_ENABLE | IOSTO_SW;
-
-	pr_debug("iost_origin %016lx, now %016lx\n", in_be64(p), origin);
-	out_be64(p, origin);
+	unsigned long stat;
+	struct cbe_iommu *iommu = data;
+
+	stat = in_be64(iommu->xlate_regs + IOC_IO_ExcpStat);
+
+	/* Might want to rate limit it */
+	printk(KERN_ERR "iommu: DMA exception 0x%016lx\n", stat);
+	printk(KERN_ERR "  V=%d, SPF=[%c%c], RW=%s, IOID=0x%04x\n",
+	       !!(stat & IOC_IO_ExcpStat_V),
+	       (stat & IOC_IO_ExcpStat_SPF_S) ? 'S' : ' ',
+	       (stat & IOC_IO_ExcpStat_SPF_P) ? 'P' : ' ',
+	       (stat & IOC_IO_ExcpStat_RW_Mask) ? "Read" : "Write",
+	       (unsigned int)(stat & IOC_IO_ExcpStat_IOID_Mask));
+	printk(KERN_ERR "  page=0x%016lx\n",
+	       stat & IOC_IO_ExcpStat_ADDR_Mask);
+
+	/* clear interrupt */
+	stat &= ~IOC_IO_ExcpStat_V;
+	out_be64(iommu->xlate_regs + IOC_IO_ExcpStat, stat);
+
+	return IRQ_HANDLED;
 }
 
-static inline void
-set_iocmd_config(void __iomem *base)
+static int cell_iommu_find_ioc(int nid, unsigned long *base)
 {
-	unsigned long __iomem *p = base + 0xc00;
-	unsigned long conf;
+	struct device_node *np;
+	struct resource r;
+
+	*base = 0;
+
+	/* First look for new style /be nodes */
+	for_each_node_by_name(np, "ioc") {
+		if (of_node_to_nid(np) != nid)
+			continue;
+		if (of_address_to_resource(np, 0, &r)) {
+			printk(KERN_ERR "iommu: can't get address for %s\n",
+			       np->full_name);
+			continue;
+		}
+		*base = r.start;
+		of_node_put(np);
+		return 0;
+	}
 
-	conf = in_be64(p);
-	pr_debug("iost_conf %016lx, now %016lx\n", conf, conf | IOCMD_CONF_TE);
-	out_be64(p, conf | IOCMD_CONF_TE);
+	/* Ok, let's try the old way */
+	for_each_node_by_type(np, "cpu") {
+		const unsigned int *nidp;
+		const unsigned long *tmp;
+
+		nidp = get_property(np, "node-id", NULL);
+		if (nidp && *nidp == nid) {
+			tmp = get_property(np, "ioc-translation", NULL);
+			if (tmp) {
+				*base = *tmp;
+				of_node_put(np);
+				return 0;
+			}
+		}
+	}
+
+	return -ENODEV;
 }
 
-static void enable_mapping(void __iomem *base, void __iomem *mmio_base)
+static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long size)
 {
-	set_iocmd_config(base);
-	set_iost_origin(mmio_base);
-}
+	struct page *page;
+	int ret, i;
+	unsigned long reg, segments, pages_per_segment, ptab_size, n_pte_pages;
+	unsigned long xlate_base;
+	unsigned int virq;
+
+	if (cell_iommu_find_ioc(iommu->nid, &xlate_base))
+		panic("%s: missing IOC register mappings for node %d\n",
+		      __FUNCTION__, iommu->nid);
+
+	iommu->xlate_regs = ioremap(xlate_base, IOC_Reg_Size);
+	iommu->cmd_regs = iommu->xlate_regs + IOC_IOCmd_Offset;
+
+	segments = size >> IO_SEGMENT_SHIFT;
+	pages_per_segment = 1ull << IO_PAGENO_BITS;
+
+	pr_debug("%s: iommu[%d]: segments: %lu, pages per segment: %lu\n",
+			__FUNCTION__, iommu->nid, segments, pages_per_segment);
+
+	/* set up the segment table */
+	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
+	BUG_ON(!page);
+	iommu->stab = page_address(page);
+	clear_page(iommu->stab);
+
+	/* ... and the page tables. Since these are contiguous, we can treat
+	 * the page tables as one array of ptes, like pSeries does.
+	 */
+	ptab_size = segments * pages_per_segment * sizeof(unsigned long);
+	pr_debug("%s: iommu[%d]: ptab_size: %lu, order: %d\n", __FUNCTION__,
+			iommu->nid, ptab_size, get_order(ptab_size));
+	page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(ptab_size));
+	BUG_ON(!page);
+
+	iommu->ptab = page_address(page);
+	memset(iommu->ptab, 0, ptab_size);
+
+	/* allocate a bogus page for the end of each mapping */
+	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
+	BUG_ON(!page);
+	iommu->pad_page = page_address(page);
+	clear_page(iommu->pad_page);
+
+	/* number of pages needed for a page table */
+	n_pte_pages = (pages_per_segment *
+		       sizeof(unsigned long)) >> IOMMU_PAGE_SHIFT;
+
+	pr_debug("%s: iommu[%d]: stab at %p, ptab at %p, n_pte_pages: %lu\n",
+			__FUNCTION__, iommu->nid, iommu->stab, iommu->ptab,
+			n_pte_pages);
+
+	/* initialise the STEs */
+	reg = IOSTE_V | ((n_pte_pages - 1) << 5);
+
+	if (IOMMU_PAGE_SIZE == 0x1000)
+		reg |= IOSTE_PS_4K;
+	else if (IOMMU_PAGE_SIZE == 0x10000)
+		reg |= IOSTE_PS_64K;
+	else {
+		extern void __unknown_page_size_error(void);
+		__unknown_page_size_error();
+	}
+
+	pr_debug("Setting up IOMMU stab:\n");
+	for (i = 0; i * (1ul << IO_SEGMENT_SHIFT) < size; i++) {
+		iommu->stab[i] = reg |
+			(__pa(iommu->ptab) + n_pte_pages * IOMMU_PAGE_SIZE * i);
+		pr_debug("\t[%d] 0x%016lx\n", i, iommu->stab[i]);
+	}
 
-static void iommu_dev_setup_null(struct pci_dev *d) { }
-static void iommu_bus_setup_null(struct pci_bus *b) { }
+	/* ensure that the STEs have updated */
+	mb();
 
-struct cell_iommu {
-	unsigned long base;
-	unsigned long mmio_base;
-	void __iomem *mapped_base;
-	void __iomem *mapped_mmio_base;
-};
+	/* setup interrupts for the iommu. */
+	reg = in_be64(iommu->xlate_regs + IOC_IO_ExcpStat);
+	out_be64(iommu->xlate_regs + IOC_IO_ExcpStat,
+			reg & ~IOC_IO_ExcpStat_V);
+	out_be64(iommu->xlate_regs + IOC_IO_ExcpMask,
+			IOC_IO_ExcpMask_PFE | IOC_IO_ExcpMask_SFE);
 
-static struct cell_iommu cell_iommus[NR_CPUS];
+	virq = irq_create_mapping(NULL,
+			IIC_IRQ_IOEX_ATI | (iommu->nid << IIC_IRQ_NODE_SHIFT));
+	BUG_ON(virq == NO_IRQ);
 
-/* initialize the iommu to support a simple linear mapping
- * for each DMA window used by any device. For now, we
- * happen to know that there is only one DMA window in use,
- * starting at iopt_phys_offset. */
-static void cell_do_map_iommu(struct cell_iommu *iommu,
-			      unsigned int ioid,
-			      unsigned long map_start,
-			      unsigned long map_size)
-{
-	unsigned long io_address, real_address;
-	void __iomem *ioc_base, *ioc_mmio_base;
-	ioste ioste;
-	unsigned long index;
+	ret = request_irq(virq, ioc_interrupt, IRQF_DISABLED,
+			iommu->name, iommu);
+	BUG_ON(ret);
 
-	/* we pretend the io page table was at a very high address */
-	const unsigned long fake_iopt = 0x10000000000ul;
-	const unsigned long io_page_size = 0x1000000; /* use 16M pages */
-	const unsigned long io_segment_size = 0x10000000; /* 256M */
-
-	ioc_base = iommu->mapped_base;
-	ioc_mmio_base = iommu->mapped_mmio_base;
-
-	for (real_address = 0, io_address = map_start;
-	     io_address <= map_start + map_size;
-	     real_address += io_page_size, io_address += io_page_size) {
-		ioste = get_iost_entry(fake_iopt, io_address, io_page_size);
-		if ((real_address % io_segment_size) == 0) /* segment start */
-			set_iost_cache(ioc_mmio_base,
-				       io_address >> 28, ioste);
-		index = get_ioc_hash_1way(ioste, io_address);
-		pr_debug("addr %08lx, index %02lx, ioste %016lx\n",
-					 io_address, index, ioste.val);
-		set_iopt_cache(ioc_mmio_base,
-			get_ioc_hash_1way(ioste, io_address),
-			get_ioc_tag(ioste, io_address),
-			get_iopt_entry(real_address, ioid, IOPT_PROT_RW));
-	}
+	/* set the IOC segment table origin register (and turn on the iommu) */
+	reg = IOC_IOST_Origin_E | __pa(iommu->stab) | IOC_IOST_Origin_HW;
+	out_be64(iommu->xlate_regs + IOC_IOST_Origin, reg);
+	in_be64(iommu->xlate_regs + IOC_IOST_Origin);
+
+	/* turn on IO translation */
+	reg = in_be64(iommu->cmd_regs + IOC_IOCmd_Cfg) | IOC_IOCmd_Cfg_TE;
+	out_be64(iommu->cmd_regs + IOC_IOCmd_Cfg, reg);
 }
 
-static void iommu_devnode_setup(struct device_node *d)
+#if 0/* Unused for now */
+static struct iommu_window *find_window(struct cbe_iommu *iommu,
+		unsigned long offset, unsigned long size)
 {
-	const unsigned int *ioid;
-	unsigned long map_start, map_size, token;
-	const unsigned long *dma_window;
-	struct cell_iommu *iommu;
+	struct iommu_window *window;
 
-	ioid = get_property(d, "ioid", NULL);
-	if (!ioid)
-		pr_debug("No ioid entry found !\n");
+	/* todo: check for overlapping (but not equal) windows) */
 
-	dma_window = get_property(d, "ibm,dma-window", NULL);
-	if (!dma_window)
-		pr_debug("No ibm,dma-window entry found !\n");
+	list_for_each_entry(window, &(iommu->windows), list) {
+		if (window->offset == offset && window->size == size)
+			return window;
+	}
 
-	map_start = dma_window[1];
-	map_size = dma_window[2];
-	token = dma_window[0] >> 32;
+	return NULL;
+}
+#endif
 
-	iommu = &cell_iommus[token];
+static struct iommu_window * __init
+cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
+			unsigned long offset, unsigned long size,
+			unsigned long pte_offset)
+{
+	struct iommu_window *window;
+	const unsigned int *ioid;
 
-	cell_do_map_iommu(iommu, *ioid, map_start, map_size);
+	ioid = get_property(np, "ioid", NULL);
+	if (ioid == NULL)
+		printk(KERN_WARNING "iommu: missing ioid for %s using 0\n",
+		       np->full_name);
+
+	window = kmalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid);
+	BUG_ON(window == NULL);
+
+	window->offset = offset;
+	window->size = size;
+	window->ioid = ioid ? *ioid : 0;
+	window->iommu = iommu;
+	window->pte_offset = pte_offset;
+
+	window->table.it_blocksize = 16;
+	window->table.it_base = (unsigned long)iommu->ptab;
+	window->table.it_index = iommu->nid;
+	window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT) +
+		window->pte_offset;
+	window->table.it_size = size >> IOMMU_PAGE_SHIFT;
+
+	iommu_init_table(&window->table, iommu->nid);
+
+	pr_debug("\tioid      %d\n", window->ioid);
+	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
+	pr_debug("\tbase      0x%016lx\n", window->table.it_base);
+	pr_debug("\toffset    0x%lx\n", window->table.it_offset);
+	pr_debug("\tsize      %ld\n", window->table.it_size);
+
+	list_add(&window->list, &iommu->windows);
+
+	if (offset != 0)
+		return window;
+
+	/* We need to map and reserve the first IOMMU page since it's used
+	 * by the spider workaround. In theory, we only need to do that when
+	 * running on spider but it doesn't really matter.
+	 *
+	 * This code also assumes that we have a window that starts at 0,
+	 * which is the case on all spider based blades.
+	 */
+	__set_bit(0, window->table.it_map);
+	tce_build_cell(&window->table, window->table.it_offset, 1,
+		       (unsigned long)iommu->pad_page, DMA_TO_DEVICE);
+	window->table.it_hint = window->table.it_blocksize;
+
+	return window;
 }
 
-static void iommu_bus_setup(struct pci_bus *b)
+static struct cbe_iommu *cell_iommu_for_node(int nid)
 {
-	struct device_node *d = (struct device_node *)b->sysdata;
-	iommu_devnode_setup(d);
-}
+	int i;
 
+	for (i = 0; i < cbe_nr_iommus; i++)
+		if (iommus[i].nid == nid)
+			return &iommus[i];
+	return NULL;
+}
 
-static int cell_map_iommu_hardcoded(int num_nodes)
+static void cell_dma_dev_setup(struct device *dev)
 {
-	struct cell_iommu *iommu = NULL;
-
-	pr_debug("%s(%d): Using hardcoded defaults\n", __FUNCTION__, __LINE__);
+	struct iommu_window *window;
+	struct cbe_iommu *iommu;
+	struct dev_archdata *archdata = &dev->archdata;
+
+	/* If we run without iommu, no need to do anything */
+	if (pci_dma_ops == &dma_direct_ops)
+		return;
+
+	/* Current implementation uses the first window available in that
+	 * node's iommu. We -might- do something smarter later though it may
+	 * never be necessary
+	 */
+	iommu = cell_iommu_for_node(archdata->numa_node);
+	if (iommu == NULL || list_empty(&iommu->windows)) {
+		printk(KERN_ERR "iommu: missing iommu for %s (node %d)\n",
+		       archdata->of_node ? archdata->of_node->full_name : "?",
+		       archdata->numa_node);
+		return;
+	}
+	window = list_entry(iommu->windows.next, struct iommu_window, list);
 
-	/* node 0 */
-	iommu = &cell_iommus[0];
-	iommu->mapped_base = ioremap(0x20000511000ul, 0x1000);
-	iommu->mapped_mmio_base = ioremap(0x20000510000ul, 0x1000);
+	archdata->dma_data = &window->table;
+}
 
-	enable_mapping(iommu->mapped_base, iommu->mapped_mmio_base);
+static void cell_pci_dma_dev_setup(struct pci_dev *dev)
+{
+	cell_dma_dev_setup(&dev->dev);
+}
 
-	cell_do_map_iommu(iommu, 0x048a,
-			  0x20000000ul,0x20000000ul);
+static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action,
+			      void *data)
+{
+	struct device *dev = data;
 
-	if (num_nodes < 2)
+	/* We are only intereted in device addition */
+	if (action != BUS_NOTIFY_ADD_DEVICE)
 		return 0;
 
-	/* node 1 */
-	iommu = &cell_iommus[1];
-	iommu->mapped_base = ioremap(0x30000511000ul, 0x1000);
-	iommu->mapped_mmio_base = ioremap(0x30000510000ul, 0x1000);
-
-	enable_mapping(iommu->mapped_base, iommu->mapped_mmio_base);
+	/* We use the PCI DMA ops */
+	dev->archdata.dma_ops = pci_dma_ops;
 
-	cell_do_map_iommu(iommu, 0x048a,
-			  0x20000000,0x20000000ul);
+	cell_dma_dev_setup(dev);
 
 	return 0;
 }
 
+static struct notifier_block cell_of_bus_notifier = {
+	.notifier_call = cell_of_bus_notify
+};
 
-static int cell_map_iommu(void)
+static int __init cell_iommu_get_window(struct device_node *np,
+					 unsigned long *base,
+					 unsigned long *size)
 {
-	unsigned int num_nodes = 0;
-	const unsigned int *node_id;
-	const unsigned long *base, *mmio_base;
-	struct device_node *dn;
-	struct cell_iommu *iommu = NULL;
-
-	/* determine number of nodes (=iommus) */
-	pr_debug("%s(%d): determining number of nodes...", __FUNCTION__, __LINE__);
-	for(dn = of_find_node_by_type(NULL, "cpu");
-	    dn;
-	    dn = of_find_node_by_type(dn, "cpu")) {
-		node_id = get_property(dn, "node-id", NULL);
-
-		if (num_nodes < *node_id)
-			num_nodes = *node_id;
-		}
-
-	num_nodes++;
-	pr_debug("%i found.\n", num_nodes);
+	const void *dma_window;
+	unsigned long index;
 
-	/* map the iommu registers for each node */
-	pr_debug("%s(%d): Looping through nodes\n", __FUNCTION__, __LINE__);
-	for(dn = of_find_node_by_type(NULL, "cpu");
-	    dn;
-	    dn = of_find_node_by_type(dn, "cpu")) {
+	/* Use ibm,dma-window if available, else, hard code ! */
+	dma_window = get_property(np, "ibm,dma-window", NULL);
+	if (dma_window == NULL) {
+		*base = 0;
+		*size = 0x80000000u;
+		return -ENODEV;
+	}
 
-		node_id = get_property(dn, "node-id", NULL);
-		base = get_property(dn, "ioc-cache", NULL);
-		mmio_base = get_property(dn, "ioc-translation", NULL);
+	of_parse_dma_window(np, dma_window, &index, base, size);
+	return 0;
+}
 
-		if (!base || !mmio_base || !node_id)
-			return cell_map_iommu_hardcoded(num_nodes);
+static void __init cell_iommu_init_one(struct device_node *np, unsigned long offset)
+{
+	struct cbe_iommu *iommu;
+	unsigned long base, size;
+	int nid, i;
+
+	/* Get node ID */
+	nid = of_node_to_nid(np);
+	if (nid < 0) {
+		printk(KERN_ERR "iommu: failed to get node for %s\n",
+		       np->full_name);
+		return;
+	}
+	pr_debug("iommu: setting up iommu for node %d (%s)\n",
+		 nid, np->full_name);
+
+	/* XXX todo: If we can have multiple windows on the same IOMMU, which
+	 * isn't the case today, we probably want here to check wether the
+	 * iommu for that node is already setup.
+	 * However, there might be issue with getting the size right so let's
+	 * ignore that for now. We might want to completely get rid of the
+	 * multiple window support since the cell iommu supports per-page ioids
+	 */
+
+	if (cbe_nr_iommus >= NR_IOMMUS) {
+		printk(KERN_ERR "iommu: too many IOMMUs detected ! (%s)\n",
+		       np->full_name);
+		return;
+	}
 
-		iommu = &cell_iommus[*node_id];
-		iommu->base = *base;
-		iommu->mmio_base = *mmio_base;
+	/* Init base fields */
+	i = cbe_nr_iommus++;
+	iommu = &iommus[i];
+	iommu->stab = 0;
+	iommu->nid = nid;
+	snprintf(iommu->name, sizeof(iommu->name), "iommu%d", i);
+	INIT_LIST_HEAD(&iommu->windows);
 
-		iommu->mapped_base = ioremap(*base, 0x1000);
-		iommu->mapped_mmio_base = ioremap(*mmio_base, 0x1000);
+	/* Obtain a window for it */
+	cell_iommu_get_window(np, &base, &size);
 
-		enable_mapping(iommu->mapped_base,
-			       iommu->mapped_mmio_base);
+	pr_debug("\ttranslating window 0x%lx...0x%lx\n",
+		 base, base + size - 1);
 
-		/* everything else will be done in iommu_bus_setup */
-	}
+	/* Initialize the hardware */
+	cell_iommu_setup_hardware(iommu, size);
 
-	return 1;
+	/* Setup the iommu_table */
+	cell_iommu_setup_window(iommu, np, base, size,
+				offset >> IOMMU_PAGE_SHIFT);
 }
 
-static void *cell_alloc_coherent(struct device *hwdev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t flag)
+static void __init cell_disable_iommus(void)
 {
-	void *ret;
-
-	ret = (void *)__get_free_pages(flag, get_order(size));
-	if (ret != NULL) {
-		memset(ret, 0, size);
-		*dma_handle = virt_to_abs(ret) | CELL_DMA_VALID;
+	int node;
+	unsigned long base, val;
+	void __iomem *xregs, *cregs;
+
+	/* Make sure IOC translation is disabled on all nodes */
+	for_each_online_node(node) {
+		if (cell_iommu_find_ioc(node, &base))
+			continue;
+		xregs = ioremap(base, IOC_Reg_Size);
+		if (xregs == NULL)
+			continue;
+		cregs = xregs + IOC_IOCmd_Offset;
+
+		pr_debug("iommu: cleaning up iommu on node %d\n", node);
+
+		out_be64(xregs + IOC_IOST_Origin, 0);
+		(void)in_be64(xregs + IOC_IOST_Origin);
+		val = in_be64(cregs + IOC_IOCmd_Cfg);
+		val &= ~IOC_IOCmd_Cfg_TE;
+		out_be64(cregs + IOC_IOCmd_Cfg, val);
+		(void)in_be64(cregs + IOC_IOCmd_Cfg);
+
+		iounmap(xregs);
 	}
-	return ret;
 }
 
-static void cell_free_coherent(struct device *hwdev, size_t size,
-				 void *vaddr, dma_addr_t dma_handle)
+static int __init cell_iommu_init_disabled(void)
 {
-	free_pages((unsigned long)vaddr, get_order(size));
-}
+	struct device_node *np = NULL;
+	unsigned long base = 0, size;
+
+	/* When no iommu is present, we use direct DMA ops */
+	pci_dma_ops = &dma_direct_ops;
+
+	/* First make sure all IOC translation is turned off */
+	cell_disable_iommus();
+
+	/* If we have no Axon, we set up the spider DMA magic offset */
+	if (of_find_node_by_name(NULL, "axon") == NULL)
+		dma_direct_offset = SPIDER_DMA_OFFSET;
+
+	/* Now we need to check to see where the memory is mapped
+	 * in PCI space. We assume that all busses use the same dma
+	 * window which is always the case so far on Cell, thus we
+	 * pick up the first pci-internal node we can find and check
+	 * the DMA window from there.
+	 */
+	for_each_node_by_name(np, "axon") {
+		if (np->parent == NULL || np->parent->parent != NULL)
+			continue;
+		if (cell_iommu_get_window(np, &base, &size) == 0)
+			break;
+	}
+	if (np == NULL) {
+		for_each_node_by_name(np, "pci-internal") {
+			if (np->parent == NULL || np->parent->parent != NULL)
+				continue;
+			if (cell_iommu_get_window(np, &base, &size) == 0)
+				break;
+		}
+	}
+	of_node_put(np);
+
+	/* If we found a DMA window, we check if it's big enough to enclose
+	 * all of physical memory. If not, we force enable IOMMU
+	 */
+	if (np && size < lmb_end_of_DRAM()) {
+		printk(KERN_WARNING "iommu: force-enabled, dma window"
+		       " (%ldMB) smaller than total memory (%ldMB)\n",
+		       size >> 20, lmb_end_of_DRAM() >> 20);
+		return -ENODEV;
+	}
 
-static dma_addr_t cell_map_single(struct device *hwdev, void *ptr,
-		size_t size, enum dma_data_direction direction)
-{
-	return virt_to_abs(ptr) | CELL_DMA_VALID;
-}
+	dma_direct_offset += base;
 
-static void cell_unmap_single(struct device *hwdev, dma_addr_t dma_addr,
-		size_t size, enum dma_data_direction direction)
-{
+	printk("iommu: disabled, direct DMA offset is 0x%lx\n",
+	       dma_direct_offset);
+
+	return 0;
 }
 
-static int cell_map_sg(struct device *hwdev, struct scatterlist *sg,
-		int nents, enum dma_data_direction direction)
+static int __init cell_iommu_init(void)
 {
-	int i;
+	struct device_node *np;
+
+	if (!machine_is(cell))
+		return -ENODEV;
+
+	/* If IOMMU is disabled or we have little enough RAM to not need
+	 * to enable it, we setup a direct mapping.
+	 *
+	 * Note: should we make sure we have the IOMMU actually disabled ?
+	 */
+	if (iommu_is_off ||
+	    (!iommu_force_on && lmb_end_of_DRAM() <= 0x80000000ull))
+		if (cell_iommu_init_disabled() == 0)
+			goto bail;
+
+	/* Setup various ppc_md. callbacks */
+	ppc_md.pci_dma_dev_setup = cell_pci_dma_dev_setup;
+	ppc_md.tce_build = tce_build_cell;
+	ppc_md.tce_free = tce_free_cell;
+
+	/* Create an iommu for each /axon node.  */
+	for_each_node_by_name(np, "axon") {
+		if (np->parent == NULL || np->parent->parent != NULL)
+			continue;
+		cell_iommu_init_one(np, 0);
+	}
 
-	for (i = 0; i < nents; i++, sg++) {
-		sg->dma_address = (page_to_phys(sg->page) + sg->offset)
-					| CELL_DMA_VALID;
-		sg->dma_length = sg->length;
+	/* Create an iommu for each toplevel /pci-internal node for
+	 * old hardware/firmware
+	 */
+	for_each_node_by_name(np, "pci-internal") {
+		if (np->parent == NULL || np->parent->parent != NULL)
+			continue;
+		cell_iommu_init_one(np, SPIDER_DMA_OFFSET);
 	}
 
-	return nents;
-}
+	/* Setup default PCI iommu ops */
+	pci_dma_ops = &dma_iommu_ops;
 
-static void cell_unmap_sg(struct device *hwdev, struct scatterlist *sg,
-		int nents, enum dma_data_direction direction)
-{
-}
+ bail:
+	/* Register callbacks on OF platform device addition/removal
+	 * to handle linking them to the right DMA operations
+	 */
+	bus_register_notifier(&of_platform_bus_type, &cell_of_bus_notifier);
 
-static int cell_dma_supported(struct device *dev, u64 mask)
-{
-	return mask < 0x100000000ull;
+	return 0;
 }
+arch_initcall(cell_iommu_init);
 
-static struct dma_mapping_ops cell_iommu_ops = {
-	.alloc_coherent = cell_alloc_coherent,
-	.free_coherent = cell_free_coherent,
-	.map_single = cell_map_single,
-	.unmap_single = cell_unmap_single,
-	.map_sg = cell_map_sg,
-	.unmap_sg = cell_unmap_sg,
-	.dma_supported = cell_dma_supported,
-};
-
-void cell_init_iommu(void)
-{
-	int setup_bus = 0;
-
-	if (of_find_node_by_path("/mambo")) {
-		pr_info("Not using iommu on systemsim\n");
-	} else {
-
-		if (!(of_chosen &&
-		      get_property(of_chosen, "linux,iommu-off", NULL)))
-			setup_bus = cell_map_iommu();
-
-		if (setup_bus) {
-			pr_debug("%s: IOMMU mapping activated\n", __FUNCTION__);
-			ppc_md.iommu_dev_setup = iommu_dev_setup_null;
-			ppc_md.iommu_bus_setup = iommu_bus_setup;
-		} else {
-			pr_debug("%s: IOMMU mapping activated, "
-				 "no device action necessary\n", __FUNCTION__);
-			/* Direct I/O, IOMMU off */
-			ppc_md.iommu_dev_setup = iommu_dev_setup_null;
-			ppc_md.iommu_bus_setup = iommu_bus_setup_null;
-		}
-	}
-
-	pci_dma_ops = cell_iommu_ops;
-}
diff --git a/arch/powerpc/platforms/cell/iommu.h b/arch/powerpc/platforms/cell/iommu.h
deleted file mode 100644
index 490d77abfe8..00000000000
--- a/arch/powerpc/platforms/cell/iommu.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef CELL_IOMMU_H
-#define CELL_IOMMU_H
-
-/* some constants */
-enum {
-	/* segment table entries */
-	IOST_VALID_MASK	  = 0x8000000000000000ul,
-	IOST_TAG_MASK     = 0x3000000000000000ul,
-	IOST_PT_BASE_MASK = 0x000003fffffff000ul,
-	IOST_NNPT_MASK	  = 0x0000000000000fe0ul,
-	IOST_PS_MASK	  = 0x000000000000000ful,
-
-	IOST_PS_4K	  = 0x1,
-	IOST_PS_64K	  = 0x3,
-	IOST_PS_1M	  = 0x5,
-	IOST_PS_16M	  = 0x7,
-
-	/* iopt tag register */
-	IOPT_VALID_MASK   = 0x0000000200000000ul,
-	IOPT_TAG_MASK	  = 0x00000001fffffffful,
-
-	/* iopt cache register */
-	IOPT_PROT_MASK	  = 0xc000000000000000ul,
-	IOPT_PROT_NONE	  = 0x0000000000000000ul,
-	IOPT_PROT_READ	  = 0x4000000000000000ul,
-	IOPT_PROT_WRITE	  = 0x8000000000000000ul,
-	IOPT_PROT_RW	  = 0xc000000000000000ul,
-	IOPT_COHERENT	  = 0x2000000000000000ul,
-	
-	IOPT_ORDER_MASK	  = 0x1800000000000000ul,
-	/* order access to same IOID/VC on same address */
-	IOPT_ORDER_ADDR	  = 0x0800000000000000ul,
-	/* similar, but only after a write access */
-	IOPT_ORDER_WRITES = 0x1000000000000000ul,
-	/* Order all accesses to same IOID/VC */
-	IOPT_ORDER_VC	  = 0x1800000000000000ul,
-	
-	IOPT_RPN_MASK	  = 0x000003fffffff000ul,
-	IOPT_HINT_MASK	  = 0x0000000000000800ul,
-	IOPT_IOID_MASK	  = 0x00000000000007fful,
-
-	IOSTO_ENABLE	  = 0x8000000000000000ul,
-	IOSTO_ORIGIN	  = 0x000003fffffff000ul,
-	IOSTO_HW	  = 0x0000000000000800ul,
-	IOSTO_SW	  = 0x0000000000000400ul,
-
-	IOCMD_CONF_TE	  = 0x0000800000000000ul,
-
-	/* memory mapped registers */
-	IOC_PT_CACHE_DIR  = 0x000,
-	IOC_ST_CACHE_DIR  = 0x800,
-	IOC_PT_CACHE_REG  = 0x910,
-	IOC_ST_ORIGIN     = 0x918,
-	IOC_CONF	  = 0x930,
-
-	/* The high bit needs to be set on every DMA address,
-	   only 2GB are addressable */
-	CELL_DMA_VALID	  = 0x80000000,
-	CELL_DMA_MASK	  = 0x7fffffff,
-};
-
-
-void cell_init_iommu(void);
-
-#endif
diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c
index 9f2e4ed20a5..8c20f0fb865 100644
--- a/arch/powerpc/platforms/cell/pervasive.c
+++ b/arch/powerpc/platforms/cell/pervasive.c
@@ -38,32 +38,25 @@
 #include "pervasive.h"
 #include "cbe_regs.h"
 
-static DEFINE_SPINLOCK(cbe_pervasive_lock);
-
-static void __init cbe_enable_pause_zero(void)
+static void cbe_power_save(void)
 {
-	unsigned long thread_switch_control;
-	unsigned long temp_register;
-	struct cbe_pmd_regs __iomem *pregs;
-
-	spin_lock_irq(&cbe_pervasive_lock);
-	pregs = cbe_get_cpu_pmd_regs(smp_processor_id());
-	if (pregs == NULL)
-		goto out;
+	unsigned long ctrl, thread_switch_control;
 
-	pr_debug("Power Management: CPU %d\n", smp_processor_id());
-
-	 /* Enable Pause(0) control bit */
-	temp_register = in_be64(&pregs->pm_control);
+	/*
+	 * We need to hard disable interrupts, but we also need to mark them
+	 * hard disabled in the PACA so that the local_irq_enable() done by
+	 * our caller upon return propertly hard enables.
+	 */
+	hard_irq_disable();
+	get_paca()->hard_enabled = 0;
 
-	out_be64(&pregs->pm_control,
-		 temp_register | CBE_PMD_PAUSE_ZERO_CONTROL);
+	ctrl = mfspr(SPRN_CTRLF);
 
 	/* Enable DEC and EE interrupt request */
 	thread_switch_control  = mfspr(SPRN_TSC_CELL);
 	thread_switch_control |= TSC_CELL_EE_ENABLE | TSC_CELL_EE_BOOST;
 
-	switch ((mfspr(SPRN_CTRLF) & CTRL_CT)) {
+	switch (ctrl & CTRL_CT) {
 	case CTRL_CT0:
 		thread_switch_control |= TSC_CELL_DEC_ENABLE_0;
 		break;
@@ -75,58 +68,21 @@ static void __init cbe_enable_pause_zero(void)
 			__FUNCTION__);
 		break;
 	}
-
 	mtspr(SPRN_TSC_CELL, thread_switch_control);
 
-out:
-	spin_unlock_irq(&cbe_pervasive_lock);
-}
-
-static void cbe_idle(void)
-{
-	unsigned long ctrl;
+	/*
+	 * go into low thread priority, medium priority will be
+	 * restored for us after wake-up.
+	 */
+	HMT_low();
 
-	/* Why do we do that on every idle ? Couldn't that be done once for
-	 * all or do we lose the state some way ? Also, the pm_control
-	 * register setting, that can't be set once at boot ? We really want
-	 * to move that away in order to implement a simple powersave
+	/*
+	 * atomically disable thread execution and runlatch.
+	 * External and Decrementer exceptions are still handled when the
+	 * thread is disabled but now enter in cbe_system_reset_exception()
 	 */
-	cbe_enable_pause_zero();
-
-	while (1) {
-		if (!need_resched()) {
-			local_irq_disable();
-			while (!need_resched()) {
-				/* go into low thread priority */
-				HMT_low();
-
-				/*
-				 * atomically disable thread execution
-				 * and runlatch.
-				 * External and Decrementer exceptions
-				 * are still handled when the thread
-				 * is disabled but now enter in
-				 * cbe_system_reset_exception()
-				 */
-				ctrl = mfspr(SPRN_CTRLF);
-				ctrl &= ~(CTRL_RUNLATCH | CTRL_TE);
-				mtspr(SPRN_CTRLT, ctrl);
-			}
-			/* restore thread prio */
-			HMT_medium();
-			local_irq_enable();
-		}
-
-		/*
-		 * turn runlatch on again before scheduling the
-		 * process we just woke up
-		 */
-		ppc64_runlatch_on();
-
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
-	}
+	ctrl &= ~(CTRL_RUNLATCH | CTRL_TE);
+	mtspr(SPRN_CTRLT, ctrl);
 }
 
 static int cbe_system_reset_exception(struct pt_regs *regs)
@@ -158,9 +114,20 @@ static int cbe_system_reset_exception(struct pt_regs *regs)
 
 void __init cbe_pervasive_init(void)
 {
+	int cpu;
 	if (!cpu_has_feature(CPU_FTR_PAUSE_ZERO))
 		return;
 
-	ppc_md.idle_loop = cbe_idle;
+	for_each_possible_cpu(cpu) {
+		struct cbe_pmd_regs __iomem *regs = cbe_get_cpu_pmd_regs(cpu);
+		if (!regs)
+			continue;
+
+		 /* Enable Pause(0) control bit */
+		out_be64(&regs->pmcr, in_be64(&regs->pmcr) |
+					    CBE_PMD_PAUSE_ZERO_CONTROL);
+	}
+
+	ppc_md.power_save = cbe_power_save;
 	ppc_md.system_reset_exception = cbe_system_reset_exception;
 }
diff --git a/arch/powerpc/platforms/cell/pmu.c b/arch/powerpc/platforms/cell/pmu.c
new file mode 100644
index 00000000000..99c612025e8
--- /dev/null
+++ b/arch/powerpc/platforms/cell/pmu.c
@@ -0,0 +1,429 @@
+/*
+ * Cell Broadband Engine Performance Monitor
+ *
+ * (C) Copyright IBM Corporation 2001,2006
+ *
+ * Author:
+ *    David Erb (djerb@us.ibm.com)
+ *    Kevin Corry (kevcorry@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+#include <asm/io.h>
+#include <asm/irq_regs.h>
+#include <asm/machdep.h>
+#include <asm/pmc.h>
+#include <asm/reg.h>
+#include <asm/spu.h>
+
+#include "cbe_regs.h"
+#include "interrupt.h"
+
+/*
+ * When writing to write-only mmio addresses, save a shadow copy. All of the
+ * registers are 32-bit, but stored in the upper-half of a 64-bit field in
+ * pmd_regs.
+ */
+
+#define WRITE_WO_MMIO(reg, x)					\
+	do {							\
+		u32 _x = (x);					\
+		struct cbe_pmd_regs __iomem *pmd_regs;		\
+		struct cbe_pmd_shadow_regs *shadow_regs;	\
+		pmd_regs = cbe_get_cpu_pmd_regs(cpu);		\
+		shadow_regs = cbe_get_cpu_pmd_shadow_regs(cpu);	\
+		out_be64(&(pmd_regs->reg), (((u64)_x) << 32));	\
+		shadow_regs->reg = _x;				\
+	} while (0)
+
+#define READ_SHADOW_REG(val, reg)				\
+	do {							\
+		struct cbe_pmd_shadow_regs *shadow_regs;	\
+		shadow_regs = cbe_get_cpu_pmd_shadow_regs(cpu);	\
+		(val) = shadow_regs->reg;			\
+	} while (0)
+
+#define READ_MMIO_UPPER32(val, reg)				\
+	do {							\
+		struct cbe_pmd_regs __iomem *pmd_regs;		\
+		pmd_regs = cbe_get_cpu_pmd_regs(cpu);		\
+		(val) = (u32)(in_be64(&pmd_regs->reg) >> 32);	\
+	} while (0)
+
+/*
+ * Physical counter registers.
+ * Each physical counter can act as one 32-bit counter or two 16-bit counters.
+ */
+
+u32 cbe_read_phys_ctr(u32 cpu, u32 phys_ctr)
+{
+	u32 val_in_latch, val = 0;
+
+	if (phys_ctr < NR_PHYS_CTRS) {
+		READ_SHADOW_REG(val_in_latch, counter_value_in_latch);
+
+		/* Read the latch or the actual counter, whichever is newer. */
+		if (val_in_latch & (1 << phys_ctr)) {
+			READ_SHADOW_REG(val, pm_ctr[phys_ctr]);
+		} else {
+			READ_MMIO_UPPER32(val, pm_ctr[phys_ctr]);
+		}
+	}
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(cbe_read_phys_ctr);
+
+void cbe_write_phys_ctr(u32 cpu, u32 phys_ctr, u32 val)
+{
+	struct cbe_pmd_shadow_regs *shadow_regs;
+	u32 pm_ctrl;
+
+	if (phys_ctr < NR_PHYS_CTRS) {
+		/* Writing to a counter only writes to a hardware latch.
+		 * The new value is not propagated to the actual counter
+		 * until the performance monitor is enabled.
+		 */
+		WRITE_WO_MMIO(pm_ctr[phys_ctr], val);
+
+		pm_ctrl = cbe_read_pm(cpu, pm_control);
+		if (pm_ctrl & CBE_PM_ENABLE_PERF_MON) {
+			/* The counters are already active, so we need to
+			 * rewrite the pm_control register to "re-enable"
+			 * the PMU.
+			 */
+			cbe_write_pm(cpu, pm_control, pm_ctrl);
+		} else {
+			shadow_regs = cbe_get_cpu_pmd_shadow_regs(cpu);
+			shadow_regs->counter_value_in_latch |= (1 << phys_ctr);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(cbe_write_phys_ctr);
+
+/*
+ * "Logical" counter registers.
+ * These will read/write 16-bits or 32-bits depending on the
+ * current size of the counter. Counters 4 - 7 are always 16-bit.
+ */
+
+u32 cbe_read_ctr(u32 cpu, u32 ctr)
+{
+	u32 val;
+	u32 phys_ctr = ctr & (NR_PHYS_CTRS - 1);
+
+	val = cbe_read_phys_ctr(cpu, phys_ctr);
+
+	if (cbe_get_ctr_size(cpu, phys_ctr) == 16)
+		val = (ctr < NR_PHYS_CTRS) ? (val >> 16) : (val & 0xffff);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(cbe_read_ctr);
+
+void cbe_write_ctr(u32 cpu, u32 ctr, u32 val)
+{
+	u32 phys_ctr;
+	u32 phys_val;
+
+	phys_ctr = ctr & (NR_PHYS_CTRS - 1);
+
+	if (cbe_get_ctr_size(cpu, phys_ctr) == 16) {
+		phys_val = cbe_read_phys_ctr(cpu, phys_ctr);
+
+		if (ctr < NR_PHYS_CTRS)
+			val = (val << 16) | (phys_val & 0xffff);
+		else
+			val = (val & 0xffff) | (phys_val & 0xffff0000);
+	}
+
+	cbe_write_phys_ctr(cpu, phys_ctr, val);
+}
+EXPORT_SYMBOL_GPL(cbe_write_ctr);
+
+/*
+ * Counter-control registers.
+ * Each "logical" counter has a corresponding control register.
+ */
+
+u32 cbe_read_pm07_control(u32 cpu, u32 ctr)
+{
+	u32 pm07_control = 0;
+
+	if (ctr < NR_CTRS)
+		READ_SHADOW_REG(pm07_control, pm07_control[ctr]);
+
+	return pm07_control;
+}
+EXPORT_SYMBOL_GPL(cbe_read_pm07_control);
+
+void cbe_write_pm07_control(u32 cpu, u32 ctr, u32 val)
+{
+	if (ctr < NR_CTRS)
+		WRITE_WO_MMIO(pm07_control[ctr], val);
+}
+EXPORT_SYMBOL_GPL(cbe_write_pm07_control);
+
+/*
+ * Other PMU control registers. Most of these are write-only.
+ */
+
+u32 cbe_read_pm(u32 cpu, enum pm_reg_name reg)
+{
+	u32 val = 0;
+
+	switch (reg) {
+	case group_control:
+		READ_SHADOW_REG(val, group_control);
+		break;
+
+	case debug_bus_control:
+		READ_SHADOW_REG(val, debug_bus_control);
+		break;
+
+	case trace_address:
+		READ_MMIO_UPPER32(val, trace_address);
+		break;
+
+	case ext_tr_timer:
+		READ_SHADOW_REG(val, ext_tr_timer);
+		break;
+
+	case pm_status:
+		READ_MMIO_UPPER32(val, pm_status);
+		break;
+
+	case pm_control:
+		READ_SHADOW_REG(val, pm_control);
+		break;
+
+	case pm_interval:
+		READ_SHADOW_REG(val, pm_interval);
+		break;
+
+	case pm_start_stop:
+		READ_SHADOW_REG(val, pm_start_stop);
+		break;
+	}
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(cbe_read_pm);
+
+void cbe_write_pm(u32 cpu, enum pm_reg_name reg, u32 val)
+{
+	switch (reg) {
+	case group_control:
+		WRITE_WO_MMIO(group_control, val);
+		break;
+
+	case debug_bus_control:
+		WRITE_WO_MMIO(debug_bus_control, val);
+		break;
+
+	case trace_address:
+		WRITE_WO_MMIO(trace_address, val);
+		break;
+
+	case ext_tr_timer:
+		WRITE_WO_MMIO(ext_tr_timer, val);
+		break;
+
+	case pm_status:
+		WRITE_WO_MMIO(pm_status, val);
+		break;
+
+	case pm_control:
+		WRITE_WO_MMIO(pm_control, val);
+		break;
+
+	case pm_interval:
+		WRITE_WO_MMIO(pm_interval, val);
+		break;
+
+	case pm_start_stop:
+		WRITE_WO_MMIO(pm_start_stop, val);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(cbe_write_pm);
+
+/*
+ * Get/set the size of a physical counter to either 16 or 32 bits.
+ */
+
+u32 cbe_get_ctr_size(u32 cpu, u32 phys_ctr)
+{
+	u32 pm_ctrl, size = 0;
+
+	if (phys_ctr < NR_PHYS_CTRS) {
+		pm_ctrl = cbe_read_pm(cpu, pm_control);
+		size = (pm_ctrl & CBE_PM_16BIT_CTR(phys_ctr)) ? 16 : 32;
+	}
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(cbe_get_ctr_size);
+
+void cbe_set_ctr_size(u32 cpu, u32 phys_ctr, u32 ctr_size)
+{
+	u32 pm_ctrl;
+
+	if (phys_ctr < NR_PHYS_CTRS) {
+		pm_ctrl = cbe_read_pm(cpu, pm_control);
+		switch (ctr_size) {
+		case 16:
+			pm_ctrl |= CBE_PM_16BIT_CTR(phys_ctr);
+			break;
+
+		case 32:
+			pm_ctrl &= ~CBE_PM_16BIT_CTR(phys_ctr);
+			break;
+		}
+		cbe_write_pm(cpu, pm_control, pm_ctrl);
+	}
+}
+EXPORT_SYMBOL_GPL(cbe_set_ctr_size);
+
+/*
+ * Enable/disable the entire performance monitoring unit.
+ * When we enable the PMU, all pending writes to counters get committed.
+ */
+
+void cbe_enable_pm(u32 cpu)
+{
+	struct cbe_pmd_shadow_regs *shadow_regs;
+	u32 pm_ctrl;
+
+	shadow_regs = cbe_get_cpu_pmd_shadow_regs(cpu);
+	shadow_regs->counter_value_in_latch = 0;
+
+	pm_ctrl = cbe_read_pm(cpu, pm_control) | CBE_PM_ENABLE_PERF_MON;
+	cbe_write_pm(cpu, pm_control, pm_ctrl);
+}
+EXPORT_SYMBOL_GPL(cbe_enable_pm);
+
+void cbe_disable_pm(u32 cpu)
+{
+	u32 pm_ctrl;
+	pm_ctrl = cbe_read_pm(cpu, pm_control) & ~CBE_PM_ENABLE_PERF_MON;
+	cbe_write_pm(cpu, pm_control, pm_ctrl);
+}
+EXPORT_SYMBOL_GPL(cbe_disable_pm);
+
+/*
+ * Reading from the trace_buffer.
+ * The trace buffer is two 64-bit registers. Reading from
+ * the second half automatically increments the trace_address.
+ */
+
+void cbe_read_trace_buffer(u32 cpu, u64 *buf)
+{
+	struct cbe_pmd_regs __iomem *pmd_regs = cbe_get_cpu_pmd_regs(cpu);
+
+	*buf++ = in_be64(&pmd_regs->trace_buffer_0_63);
+	*buf++ = in_be64(&pmd_regs->trace_buffer_64_127);
+}
+EXPORT_SYMBOL_GPL(cbe_read_trace_buffer);
+
+/*
+ * Enabling/disabling interrupts for the entire performance monitoring unit.
+ */
+
+u32 cbe_query_pm_interrupts(u32 cpu)
+{
+	return cbe_read_pm(cpu, pm_status);
+}
+EXPORT_SYMBOL_GPL(cbe_query_pm_interrupts);
+
+u32 cbe_clear_pm_interrupts(u32 cpu)
+{
+	/* Reading pm_status clears the interrupt bits. */
+	return cbe_query_pm_interrupts(cpu);
+}
+EXPORT_SYMBOL_GPL(cbe_clear_pm_interrupts);
+
+void cbe_enable_pm_interrupts(u32 cpu, u32 thread, u32 mask)
+{
+	/* Set which node and thread will handle the next interrupt. */
+	iic_set_interrupt_routing(cpu, thread, 0);
+
+	/* Enable the interrupt bits in the pm_status register. */
+	if (mask)
+		cbe_write_pm(cpu, pm_status, mask);
+}
+EXPORT_SYMBOL_GPL(cbe_enable_pm_interrupts);
+
+void cbe_disable_pm_interrupts(u32 cpu)
+{
+	cbe_clear_pm_interrupts(cpu);
+	cbe_write_pm(cpu, pm_status, 0);
+}
+EXPORT_SYMBOL_GPL(cbe_disable_pm_interrupts);
+
+static irqreturn_t cbe_pm_irq(int irq, void *dev_id)
+{
+	perf_irq(get_irq_regs());
+	return IRQ_HANDLED;
+}
+
+int __init cbe_init_pm_irq(void)
+{
+	unsigned int irq;
+	int rc, node;
+
+	for_each_node(node) {
+		irq = irq_create_mapping(NULL, IIC_IRQ_IOEX_PMI |
+					       (node << IIC_IRQ_NODE_SHIFT));
+		if (irq == NO_IRQ) {
+			printk("ERROR: Unable to allocate irq for node %d\n",
+			       node);
+			return -EINVAL;
+		}
+
+		rc = request_irq(irq, cbe_pm_irq,
+				 IRQF_DISABLED, "cbe-pmu-0", NULL);
+		if (rc) {
+			printk("ERROR: Request for irq on node %d failed\n",
+			       node);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+arch_initcall(cbe_init_pm_irq);
+
+void cbe_sync_irq(int node)
+{
+	unsigned int irq;
+
+	irq = irq_find_mapping(NULL,
+			       IIC_IRQ_IOEX_PMI
+			       | (node << IIC_IRQ_NODE_SHIFT));
+
+	if (irq == NO_IRQ) {
+		printk(KERN_WARNING "ERROR, unable to get existing irq %d " \
+		"for node %d\n", irq, node);
+		return;
+	}
+
+	synchronize_irq(irq);
+}
+EXPORT_SYMBOL_GPL(cbe_sync_irq);
+
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index 22c228a49c3..36989c2eee6 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -50,9 +50,10 @@
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
 #include <asm/udbg.h>
+#include <asm/mpic.h>
+#include <asm/of_platform.h>
 
 #include "interrupt.h"
-#include "iommu.h"
 #include "cbe_regs.h"
 #include "pervasive.h"
 #include "ras.h"
@@ -80,24 +81,72 @@ static void cell_progress(char *s, unsigned short hex)
 	printk("*** %04x : %s\n", hex, s ? s : "");
 }
 
-static void __init cell_pcibios_fixup(void)
+static int __init cell_publish_devices(void)
 {
-	struct pci_dev *dev = NULL;
+	if (!machine_is(cell))
+		return 0;
+
+	/* Publish OF platform devices for southbridge IOs */
+	of_platform_bus_probe(NULL, NULL, NULL);
+
+	return 0;
+}
+device_initcall(cell_publish_devices);
+
+static void cell_mpic_cascade(unsigned int irq, struct irq_desc *desc)
+{
+	struct mpic *mpic = desc->handler_data;
+	unsigned int virq;
+
+	virq = mpic_get_one_irq(mpic);
+	if (virq != NO_IRQ)
+		generic_handle_irq(virq);
+	desc->chip->eoi(irq);
+}
 
-	for_each_pci_dev(dev)
-		pci_read_irq_line(dev);
+static void __init mpic_init_IRQ(void)
+{
+	struct device_node *dn;
+	struct mpic *mpic;
+	unsigned int virq;
+
+	for (dn = NULL;
+	     (dn = of_find_node_by_name(dn, "interrupt-controller"));) {
+		if (!device_is_compatible(dn, "CBEA,platform-open-pic"))
+			continue;
+
+		/* The MPIC driver will get everything it needs from the
+		 * device-tree, just pass 0 to all arguments
+		 */
+		mpic = mpic_alloc(dn, 0, 0, 0, 0, " MPIC     ");
+		if (mpic == NULL)
+			continue;
+		mpic_init(mpic);
+
+		virq = irq_of_parse_and_map(dn, 0);
+		if (virq == NO_IRQ)
+			continue;
+
+		printk(KERN_INFO "%s : hooking up to IRQ %d\n",
+		       dn->full_name, virq);
+		set_irq_data(virq, mpic);
+		set_irq_chained_handler(virq, cell_mpic_cascade);
+	}
 }
 
+
 static void __init cell_init_irq(void)
 {
 	iic_init_IRQ();
 	spider_init_IRQ();
+	mpic_init_IRQ();
 }
 
 static void __init cell_setup_arch(void)
 {
 #ifdef CONFIG_SPU_BASE
-	spu_priv1_ops         = &spu_priv1_mmio_ops;
+	spu_priv1_ops = &spu_priv1_mmio_ops;
+	spu_management_ops = &spu_management_of_ops;
 #endif
 
 	cbe_regs_init();
@@ -109,7 +158,6 @@ static void __init cell_setup_arch(void)
 #ifdef CONFIG_SMP
 	smp_init_cell();
 #endif
-
 	/* init to some ~sane value until calibrate_delay() runs */
 	loops_per_jiffy = 50000000;
 
@@ -129,19 +177,6 @@ static void __init cell_setup_arch(void)
 	mmio_nvram_init();
 }
 
-/*
- * Early initialization.  Relocation is on but do not reference unbolted pages
- */
-static void __init cell_init_early(void)
-{
-	DBG(" -> cell_init_early()\n");
-
-	cell_init_iommu();
-
-	DBG(" <- cell_init_early()\n");
-}
-
-
 static int __init cell_probe(void)
 {
 	unsigned long root = of_get_flat_dt_root();
@@ -168,7 +203,6 @@ define_machine(cell) {
 	.name			= "Cell",
 	.probe			= cell_probe,
 	.setup_arch		= cell_setup_arch,
-	.init_early		= cell_init_early,
 	.show_cpuinfo		= cell_show_cpuinfo,
 	.restart		= rtas_restart,
 	.power_off		= rtas_power_off,
@@ -180,7 +214,7 @@ define_machine(cell) {
 	.check_legacy_ioport	= cell_check_legacy_ioport,
 	.progress		= cell_progress,
 	.init_IRQ       	= cell_init_irq,
-	.pcibios_fixup		= cell_pcibios_fixup,
+	.pci_setup_phb		= rtas_setup_phb,
 #ifdef CONFIG_KEXEC
 	.machine_kexec		= default_machine_kexec,
 	.machine_kexec_prepare	= default_machine_kexec_prepare,
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 7aa809d5a24..bd7bffc3ddd 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -25,22 +25,17 @@
 #include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/poll.h>
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
-
-#include <asm/firmware.h>
-#include <asm/io.h>
-#include <asm/prom.h>
+#include <linux/mm.h>
+#include <linux/io.h>
 #include <linux/mutex.h>
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
-#include <asm/mmu_context.h>
-
-#include "interrupt.h"
+#include <asm/xmon.h>
 
+const struct spu_management_ops *spu_management_ops;
 const struct spu_priv1_ops *spu_priv1_ops;
 
 EXPORT_SYMBOL_GPL(spu_priv1_ops);
@@ -89,7 +84,30 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 		printk("%s: invalid access during switch!\n", __func__);
 		return 1;
 	}
-	if (!mm || (REGION_ID(ea) != USER_REGION_ID)) {
+	esid = (ea & ESID_MASK) | SLB_ESID_V;
+
+	switch(REGION_ID(ea)) {
+	case USER_REGION_ID:
+#ifdef CONFIG_HUGETLB_PAGE
+		if (in_hugepage_area(mm->context, ea))
+			llp = mmu_psize_defs[mmu_huge_psize].sllp;
+		else
+#endif
+			llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+		vsid = (get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT) |
+				SLB_VSID_USER | llp;
+		break;
+	case VMALLOC_REGION_ID:
+		llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+		vsid = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
+			SLB_VSID_KERNEL | llp;
+		break;
+	case KERNEL_REGION_ID:
+		llp = mmu_psize_defs[mmu_linear_psize].sllp;
+		vsid = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
+			SLB_VSID_KERNEL | llp;
+		break;
+	default:
 		/* Future: support kernel segments so that drivers
 		 * can use SPUs.
 		 */
@@ -97,16 +115,6 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 		return 1;
 	}
 
-	esid = (ea & ESID_MASK) | SLB_ESID_V;
-#ifdef CONFIG_HUGETLB_PAGE
-	if (in_hugepage_area(mm->context, ea))
-		llp = mmu_psize_defs[mmu_huge_psize].sllp;
-	else
-#endif
-		llp = mmu_psize_defs[mmu_virtual_psize].sllp;
-	vsid = (get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT) |
-			SLB_VSID_USER | llp;
-
 	out_be64(&priv2->slb_index_W, spu->slb_replace);
 	out_be64(&priv2->slb_vsid_RW, vsid);
 	out_be64(&priv2->slb_esid_RW, esid);
@@ -320,6 +328,7 @@ static void spu_free_irqs(struct spu *spu)
 }
 
 static struct list_head spu_list[MAX_NUMNODES];
+static LIST_HEAD(spu_full_list);
 static DEFINE_MUTEX(spu_mutex);
 
 static void spu_init_channels(struct spu *spu)
@@ -364,8 +373,7 @@ struct spu *spu_alloc_node(int node)
 	if (!list_empty(&spu_list[node])) {
 		spu = list_entry(spu_list[node].next, struct spu, list);
 		list_del_init(&spu->list);
-		pr_debug("Got SPU %x %d %d\n",
-			 spu->isrc, spu->number, spu->node);
+		pr_debug("Got SPU %d %d\n", spu->number, spu->node);
 		spu_init_channels(spu);
 	}
 	mutex_unlock(&spu_mutex);
@@ -493,280 +501,65 @@ int spu_irq_class_1_bottom(struct spu *spu)
 	if (!error) {
 		spu_restart_dma(spu);
 	} else {
-		__spu_trap_invalid_dma(spu);
+		spu->dma_callback(spu, SPE_EVENT_SPE_DATA_STORAGE);
 	}
 	return ret;
 }
 
-static int __init find_spu_node_id(struct device_node *spe)
-{
-	const unsigned int *id;
-	struct device_node *cpu;
-	cpu = spe->parent->parent;
-	id = get_property(cpu, "node-id", NULL);
-	return id ? *id : 0;
-}
-
-static int __init cell_spuprop_present(struct spu *spu, struct device_node *spe,
-		const char *prop)
-{
-	static DEFINE_MUTEX(add_spumem_mutex);
-
-	const struct address_prop {
-		unsigned long address;
-		unsigned int len;
-	} __attribute__((packed)) *p;
-	int proplen;
-
-	unsigned long start_pfn, nr_pages;
-	struct pglist_data *pgdata;
-	struct zone *zone;
-	int ret;
-
-	p = get_property(spe, prop, &proplen);
-	WARN_ON(proplen != sizeof (*p));
-
-	start_pfn = p->address >> PAGE_SHIFT;
-	nr_pages = ((unsigned long)p->len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-	pgdata = NODE_DATA(spu->nid);
-	zone = pgdata->node_zones;
-
-	/* XXX rethink locking here */
-	mutex_lock(&add_spumem_mutex);
-	ret = __add_pages(zone, start_pfn, nr_pages);
-	mutex_unlock(&add_spumem_mutex);
-
-	return ret;
-}
+struct sysdev_class spu_sysdev_class = {
+	set_kset_name("spu")
+};
 
-static void __iomem * __init map_spe_prop(struct spu *spu,
-		struct device_node *n, const char *name)
+int spu_add_sysdev_attr(struct sysdev_attribute *attr)
 {
-	const struct address_prop {
-		unsigned long address;
-		unsigned int len;
-	} __attribute__((packed)) *prop;
-
-	const void *p;
-	int proplen;
-	void __iomem *ret = NULL;
-	int err = 0;
-
-	p = get_property(n, name, &proplen);
-	if (proplen != sizeof (struct address_prop))
-		return NULL;
-
-	prop = p;
-
-	err = cell_spuprop_present(spu, n, name);
-	if (err && (err != -EEXIST))
-		goto out;
-
-	ret = ioremap(prop->address, prop->len);
-
- out:
-	return ret;
-}
+	struct spu *spu;
+	mutex_lock(&spu_mutex);
 
-static void spu_unmap(struct spu *spu)
-{
-	iounmap(spu->priv2);
-	iounmap(spu->priv1);
-	iounmap(spu->problem);
-	iounmap((__force u8 __iomem *)spu->local_store);
-}
+	list_for_each_entry(spu, &spu_full_list, full_list)
+		sysdev_create_file(&spu->sysdev, attr);
 
-/* This function shall be abstracted for HV platforms */
-static int __init spu_map_interrupts_old(struct spu *spu, struct device_node *np)
-{
-	unsigned int isrc;
-	const u32 *tmp;
-
-	/* Get the interrupt source unit from the device-tree */
-	tmp = get_property(np, "isrc", NULL);
-	if (!tmp)
-		return -ENODEV;
-	isrc = tmp[0];
-
-	/* Add the node number */
-	isrc |= spu->node << IIC_IRQ_NODE_SHIFT;
-	spu->isrc = isrc;
-
-	/* Now map interrupts of all 3 classes */
-	spu->irqs[0] = irq_create_mapping(NULL, IIC_IRQ_CLASS_0 | isrc);
-	spu->irqs[1] = irq_create_mapping(NULL, IIC_IRQ_CLASS_1 | isrc);
-	spu->irqs[2] = irq_create_mapping(NULL, IIC_IRQ_CLASS_2 | isrc);
-
-	/* Right now, we only fail if class 2 failed */
-	return spu->irqs[2] == NO_IRQ ? -EINVAL : 0;
+	mutex_unlock(&spu_mutex);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
 
-static int __init spu_map_device_old(struct spu *spu, struct device_node *node)
+int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 {
-	const char *prop;
-	int ret;
-
-	ret = -ENODEV;
-	spu->name = get_property(node, "name", NULL);
-	if (!spu->name)
-		goto out;
-
-	prop = get_property(node, "local-store", NULL);
-	if (!prop)
-		goto out;
-	spu->local_store_phys = *(unsigned long *)prop;
-
-	/* we use local store as ram, not io memory */
-	spu->local_store = (void __force *)
-		map_spe_prop(spu, node, "local-store");
-	if (!spu->local_store)
-		goto out;
-
-	prop = get_property(node, "problem", NULL);
-	if (!prop)
-		goto out_unmap;
-	spu->problem_phys = *(unsigned long *)prop;
-
-	spu->problem= map_spe_prop(spu, node, "problem");
-	if (!spu->problem)
-		goto out_unmap;
-
-	spu->priv1= map_spe_prop(spu, node, "priv1");
-	/* priv1 is not available on a hypervisor */
-
-	spu->priv2= map_spe_prop(spu, node, "priv2");
-	if (!spu->priv2)
-		goto out_unmap;
-	ret = 0;
-	goto out;
-
-out_unmap:
-	spu_unmap(spu);
-out:
-	return ret;
-}
+	struct spu *spu;
+	mutex_lock(&spu_mutex);
 
-static int __init spu_map_interrupts(struct spu *spu, struct device_node *np)
-{
-	struct of_irq oirq;
-	int ret;
-	int i;
+	list_for_each_entry(spu, &spu_full_list, full_list)
+		sysfs_create_group(&spu->sysdev.kobj, attrs);
 
-	for (i=0; i < 3; i++) {
-		ret = of_irq_map_one(np, i, &oirq);
-		if (ret) {
-			pr_debug("spu_new: failed to get irq %d\n", i);
-			goto err;
-		}
-		ret = -EINVAL;
-		pr_debug("  irq %d no 0x%x on %s\n", i, oirq.specifier[0],
-			 oirq.controller->full_name);
-		spu->irqs[i] = irq_create_of_mapping(oirq.controller,
-					oirq.specifier, oirq.size);
-		if (spu->irqs[i] == NO_IRQ) {
-			pr_debug("spu_new: failed to map it !\n");
-			goto err;
-		}
-	}
+	mutex_unlock(&spu_mutex);
 	return 0;
-
-err:
-	pr_debug("failed to map irq %x for spu %s\n", *oirq.specifier, spu->name);
-	for (; i >= 0; i--) {
-		if (spu->irqs[i] != NO_IRQ)
-			irq_dispose_mapping(spu->irqs[i]);
-	}
-	return ret;
 }
+EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
 
-static int spu_map_resource(struct device_node *node, int nr,
-		void __iomem** virt, unsigned long *phys)
-{
-	struct resource resource = { };
-	int ret;
-
-	ret = of_address_to_resource(node, nr, &resource);
-	if (ret)
-		goto out;
 
-	if (phys)
-		*phys = resource.start;
-	*virt = ioremap(resource.start, resource.end - resource.start);
-	if (!*virt)
-		ret = -EINVAL;
-
-out:
-	return ret;
-}
-
-static int __init spu_map_device(struct spu *spu, struct device_node *node)
+void spu_remove_sysdev_attr(struct sysdev_attribute *attr)
 {
-	int ret = -ENODEV;
-	spu->name = get_property(node, "name", NULL);
-	if (!spu->name)
-		goto out;
-
-	ret = spu_map_resource(node, 0, (void __iomem**)&spu->local_store,
-					&spu->local_store_phys);
-	if (ret) {
-		pr_debug("spu_new: failed to map %s resource 0\n",
-			 node->full_name);
-		goto out;
-	}
-	ret = spu_map_resource(node, 1, (void __iomem**)&spu->problem,
-					&spu->problem_phys);
-	if (ret) {
-		pr_debug("spu_new: failed to map %s resource 1\n",
-			 node->full_name);
-		goto out_unmap;
-	}
-	ret = spu_map_resource(node, 2, (void __iomem**)&spu->priv2,
-					NULL);
-	if (ret) {
-		pr_debug("spu_new: failed to map %s resource 2\n",
-			 node->full_name);
-		goto out_unmap;
-	}
-
-	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		ret = spu_map_resource(node, 3, (void __iomem**)&spu->priv1,
-					NULL);
-	if (ret) {
-		pr_debug("spu_new: failed to map %s resource 3\n",
-			 node->full_name);
-		goto out_unmap;
-	}
-	pr_debug("spu_new: %s maps:\n", node->full_name);
-	pr_debug("  local store   : 0x%016lx -> 0x%p\n",
-		 spu->local_store_phys, spu->local_store);
-	pr_debug("  problem state : 0x%016lx -> 0x%p\n",
-		 spu->problem_phys, spu->problem);
-	pr_debug("  priv2         :                       0x%p\n", spu->priv2);
-	pr_debug("  priv1         :                       0x%p\n", spu->priv1);
+	struct spu *spu;
+	mutex_lock(&spu_mutex);
 
-	return 0;
+	list_for_each_entry(spu, &spu_full_list, full_list)
+		sysdev_remove_file(&spu->sysdev, attr);
 
-out_unmap:
-	spu_unmap(spu);
-out:
-	pr_debug("failed to map spe %s: %d\n", spu->name, ret);
-	return ret;
+	mutex_unlock(&spu_mutex);
 }
+EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr);
 
-struct sysdev_class spu_sysdev_class = {
-	set_kset_name("spu")
-};
-
-static ssize_t spu_show_isrc(struct sys_device *sysdev, char *buf)
+void spu_remove_sysdev_attr_group(struct attribute_group *attrs)
 {
-	struct spu *spu = container_of(sysdev, struct spu, sysdev);
-	return sprintf(buf, "%d\n", spu->isrc);
+	struct spu *spu;
+	mutex_lock(&spu_mutex);
 
-}
-static SYSDEV_ATTR(isrc, 0400, spu_show_isrc, NULL);
+	list_for_each_entry(spu, &spu_full_list, full_list)
+		sysfs_remove_group(&spu->sysdev.kobj, attrs);
 
-extern int attach_sysdev_to_node(struct sys_device *dev, int nid);
+	mutex_unlock(&spu_mutex);
+}
+EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr_group);
 
 static int spu_create_sysdev(struct spu *spu)
 {
@@ -781,21 +574,18 @@ static int spu_create_sysdev(struct spu *spu)
 		return ret;
 	}
 
-	if (spu->isrc != 0)
-		sysdev_create_file(&spu->sysdev, &attr_isrc);
-	sysfs_add_device_to_node(&spu->sysdev, spu->nid);
+	sysfs_add_device_to_node(&spu->sysdev, spu->node);
 
 	return 0;
 }
 
 static void spu_destroy_sysdev(struct spu *spu)
 {
-	sysdev_remove_file(&spu->sysdev, &attr_isrc);
-	sysfs_remove_device_from_node(&spu->sysdev, spu->nid);
+	sysfs_remove_device_from_node(&spu->sysdev, spu->node);
 	sysdev_unregister(&spu->sysdev);
 }
 
-static int __init create_spu(struct device_node *spe)
+static int __init create_spu(void *data)
 {
 	struct spu *spu;
 	int ret;
@@ -806,57 +596,37 @@ static int __init create_spu(struct device_node *spe)
 	if (!spu)
 		goto out;
 
-	spu->node = find_spu_node_id(spe);
-	if (spu->node >= MAX_NUMNODES) {
-		printk(KERN_WARNING "SPE %s on node %d ignored,"
-		       " node number too big\n", spe->full_name, spu->node);
-		printk(KERN_WARNING "Check if CONFIG_NUMA is enabled.\n");
-		return -ENODEV;
-	}
-	spu->nid = of_node_to_nid(spe);
-	if (spu->nid == -1)
-		spu->nid = 0;
+	spin_lock_init(&spu->register_lock);
+	mutex_lock(&spu_mutex);
+	spu->number = number++;
+	mutex_unlock(&spu_mutex);
+
+	ret = spu_create_spu(spu, data);
 
-	ret = spu_map_device(spu, spe);
-	/* try old method */
-	if (ret)
-		ret = spu_map_device_old(spu, spe);
 	if (ret)
 		goto out_free;
 
-	ret = spu_map_interrupts(spu, spe);
-	if (ret)
-		ret = spu_map_interrupts_old(spu, spe);
-	if (ret)
-		goto out_unmap;
-	spin_lock_init(&spu->register_lock);
-	spu_mfc_sdr_set(spu, mfspr(SPRN_SDR1));
+	spu_mfc_sdr_setup(spu);
 	spu_mfc_sr1_set(spu, 0x33);
-	mutex_lock(&spu_mutex);
-
-	spu->number = number++;
 	ret = spu_request_irqs(spu);
 	if (ret)
-		goto out_unlock;
+		goto out_destroy;
 
 	ret = spu_create_sysdev(spu);
 	if (ret)
 		goto out_free_irqs;
 
+	mutex_lock(&spu_mutex);
 	list_add(&spu->list, &spu_list[spu->node]);
+	list_add(&spu->full_list, &spu_full_list);
 	mutex_unlock(&spu_mutex);
 
-	pr_debug(KERN_DEBUG "Using SPE %s %02x %p %p %p %p %d\n",
-		spu->name, spu->isrc, spu->local_store,
-		spu->problem, spu->priv1, spu->priv2, spu->number);
 	goto out;
 
 out_free_irqs:
 	spu_free_irqs(spu);
-out_unlock:
-	mutex_unlock(&spu_mutex);
-out_unmap:
-	spu_unmap(spu);
+out_destroy:
+	spu_destroy_spu(spu);
 out_free:
 	kfree(spu);
 out:
@@ -866,10 +636,11 @@ out:
 static void destroy_spu(struct spu *spu)
 {
 	list_del_init(&spu->list);
+	list_del_init(&spu->full_list);
 
 	spu_destroy_sysdev(spu);
 	spu_free_irqs(spu);
-	spu_unmap(spu);
+	spu_destroy_spu(spu);
 	kfree(spu);
 }
 
@@ -890,9 +661,11 @@ module_exit(cleanup_spu_base);
 
 static int __init init_spu_base(void)
 {
-	struct device_node *node;
 	int i, ret;
 
+	if (!spu_management_ops)
+		return 0;
+
 	/* create sysdev class for spus */
 	ret = sysdev_class_register(&spu_sysdev_class);
 	if (ret)
@@ -901,17 +674,17 @@ static int __init init_spu_base(void)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		INIT_LIST_HEAD(&spu_list[i]);
 
-	ret = -ENODEV;
-	for (node = of_find_node_by_type(NULL, "spe");
-			node; node = of_find_node_by_type(node, "spe")) {
-		ret = create_spu(node);
-		if (ret) {
-			printk(KERN_WARNING "%s: Error initializing %s\n",
-				__FUNCTION__, node->name);
-			cleanup_spu_base();
-			break;
-		}
+	ret = spu_enumerate_spus(create_spu);
+
+	if (ret) {
+		printk(KERN_WARNING "%s: Error initializing spus\n",
+			__FUNCTION__);
+		cleanup_spu_base();
+		return ret;
 	}
+
+	xmon_register_spus(&spu_full_list);
+
 	return ret;
 }
 module_init(init_spu_base);
diff --git a/arch/powerpc/platforms/cell/spu_coredump.c b/arch/powerpc/platforms/cell/spu_coredump.c
new file mode 100644
index 00000000000..6915b418ee7
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spu_coredump.c
@@ -0,0 +1,81 @@
+/*
+ * SPU core dump code
+ *
+ * (C) Copyright 2006 IBM Corp.
+ *
+ * Author: Dwayne Grant McConnell <decimal@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+
+#include <asm/spu.h>
+
+static struct spu_coredump_calls spu_coredump_calls;
+static DEFINE_MUTEX(spu_coredump_mutex);
+
+int arch_notes_size(void)
+{
+	long ret;
+	struct module *owner = spu_coredump_calls.owner;
+
+	ret = -ENOSYS;
+	mutex_lock(&spu_coredump_mutex);
+	if (owner && try_module_get(owner)) {
+		ret = spu_coredump_calls.arch_notes_size();
+		module_put(owner);
+	}
+	mutex_unlock(&spu_coredump_mutex);
+	return ret;
+}
+
+void arch_write_notes(struct file *file)
+{
+	struct module *owner = spu_coredump_calls.owner;
+
+	mutex_lock(&spu_coredump_mutex);
+	if (owner && try_module_get(owner)) {
+		spu_coredump_calls.arch_write_notes(file);
+		module_put(owner);
+	}
+	mutex_unlock(&spu_coredump_mutex);
+}
+
+int register_arch_coredump_calls(struct spu_coredump_calls *calls)
+{
+	if (spu_coredump_calls.owner)
+		return -EBUSY;
+
+	mutex_lock(&spu_coredump_mutex);
+	spu_coredump_calls.arch_notes_size = calls->arch_notes_size;
+	spu_coredump_calls.arch_write_notes = calls->arch_write_notes;
+	spu_coredump_calls.owner = calls->owner;
+	mutex_unlock(&spu_coredump_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_arch_coredump_calls);
+
+void unregister_arch_coredump_calls(struct spu_coredump_calls *calls)
+{
+	BUG_ON(spu_coredump_calls.owner != calls->owner);
+
+	mutex_lock(&spu_coredump_mutex);
+	spu_coredump_calls.owner = NULL;
+	mutex_unlock(&spu_coredump_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_arch_coredump_calls);
diff --git a/arch/powerpc/platforms/cell/spu_priv1_mmio.c b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
index 71b69f0a1a4..a5de0430c56 100644
--- a/arch/powerpc/platforms/cell/spu_priv1_mmio.c
+++ b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
@@ -18,120 +18,498 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/interrupt.h>
+#include <linux/list.h>
 #include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+#include <linux/device.h>
 
-#include <asm/io.h>
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
+#include <asm/firmware.h>
+#include <asm/prom.h>
 
 #include "interrupt.h"
+#include "spu_priv1_mmio.h"
+
+struct spu_pdata {
+	int nid;
+	struct device_node *devnode;
+	struct spu_priv1 __iomem *priv1;
+};
+
+static struct spu_pdata *spu_get_pdata(struct spu *spu)
+{
+	BUG_ON(!spu->pdata);
+	return spu->pdata;
+}
+
+struct device_node *spu_devnode(struct spu *spu)
+{
+	return spu_get_pdata(spu)->devnode;
+}
+
+EXPORT_SYMBOL_GPL(spu_devnode);
+
+static int __init find_spu_node_id(struct device_node *spe)
+{
+	const unsigned int *id;
+	struct device_node *cpu;
+	cpu = spe->parent->parent;
+	id = get_property(cpu, "node-id", NULL);
+	return id ? *id : 0;
+}
+
+static int __init cell_spuprop_present(struct spu *spu, struct device_node *spe,
+		const char *prop)
+{
+	static DEFINE_MUTEX(add_spumem_mutex);
+
+	const struct address_prop {
+		unsigned long address;
+		unsigned int len;
+	} __attribute__((packed)) *p;
+	int proplen;
+
+	unsigned long start_pfn, nr_pages;
+	struct pglist_data *pgdata;
+	struct zone *zone;
+	int ret;
+
+	p = get_property(spe, prop, &proplen);
+	WARN_ON(proplen != sizeof (*p));
+
+	start_pfn = p->address >> PAGE_SHIFT;
+	nr_pages = ((unsigned long)p->len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	pgdata = NODE_DATA(spu_get_pdata(spu)->nid);
+	zone = pgdata->node_zones;
+
+	/* XXX rethink locking here */
+	mutex_lock(&add_spumem_mutex);
+	ret = __add_pages(zone, start_pfn, nr_pages);
+	mutex_unlock(&add_spumem_mutex);
+
+	return ret;
+}
+
+static void __iomem * __init map_spe_prop(struct spu *spu,
+		struct device_node *n, const char *name)
+{
+	const struct address_prop {
+		unsigned long address;
+		unsigned int len;
+	} __attribute__((packed)) *prop;
+
+	const void *p;
+	int proplen;
+	void __iomem *ret = NULL;
+	int err = 0;
+
+	p = get_property(n, name, &proplen);
+	if (proplen != sizeof (struct address_prop))
+		return NULL;
+
+	prop = p;
+
+	err = cell_spuprop_present(spu, n, name);
+	if (err && (err != -EEXIST))
+		goto out;
+
+	ret = ioremap(prop->address, prop->len);
+
+ out:
+	return ret;
+}
+
+static void spu_unmap(struct spu *spu)
+{
+	iounmap(spu->priv2);
+	iounmap(spu_get_pdata(spu)->priv1);
+	iounmap(spu->problem);
+	iounmap((__force u8 __iomem *)spu->local_store);
+}
+
+static int __init spu_map_interrupts_old(struct spu *spu,
+	struct device_node *np)
+{
+	unsigned int isrc;
+	const u32 *tmp;
+
+	/* Get the interrupt source unit from the device-tree */
+	tmp = get_property(np, "isrc", NULL);
+	if (!tmp)
+		return -ENODEV;
+	isrc = tmp[0];
+
+	/* Add the node number */
+	isrc |= spu->node << IIC_IRQ_NODE_SHIFT;
+
+	/* Now map interrupts of all 3 classes */
+	spu->irqs[0] = irq_create_mapping(NULL, IIC_IRQ_CLASS_0 | isrc);
+	spu->irqs[1] = irq_create_mapping(NULL, IIC_IRQ_CLASS_1 | isrc);
+	spu->irqs[2] = irq_create_mapping(NULL, IIC_IRQ_CLASS_2 | isrc);
+
+	/* Right now, we only fail if class 2 failed */
+	return spu->irqs[2] == NO_IRQ ? -EINVAL : 0;
+}
+
+static int __init spu_map_device_old(struct spu *spu, struct device_node *node)
+{
+	const char *prop;
+	int ret;
+
+	ret = -ENODEV;
+	spu->name = get_property(node, "name", NULL);
+	if (!spu->name)
+		goto out;
+
+	prop = get_property(node, "local-store", NULL);
+	if (!prop)
+		goto out;
+	spu->local_store_phys = *(unsigned long *)prop;
+
+	/* we use local store as ram, not io memory */
+	spu->local_store = (void __force *)
+		map_spe_prop(spu, node, "local-store");
+	if (!spu->local_store)
+		goto out;
+
+	prop = get_property(node, "problem", NULL);
+	if (!prop)
+		goto out_unmap;
+	spu->problem_phys = *(unsigned long *)prop;
+
+	spu->problem= map_spe_prop(spu, node, "problem");
+	if (!spu->problem)
+		goto out_unmap;
+
+	spu_get_pdata(spu)->priv1= map_spe_prop(spu, node, "priv1");
+
+	spu->priv2= map_spe_prop(spu, node, "priv2");
+	if (!spu->priv2)
+		goto out_unmap;
+	ret = 0;
+	goto out;
+
+out_unmap:
+	spu_unmap(spu);
+out:
+	return ret;
+}
+
+static int __init spu_map_interrupts(struct spu *spu, struct device_node *np)
+{
+	struct of_irq oirq;
+	int ret;
+	int i;
+
+	for (i=0; i < 3; i++) {
+		ret = of_irq_map_one(np, i, &oirq);
+		if (ret) {
+			pr_debug("spu_new: failed to get irq %d\n", i);
+			goto err;
+		}
+		ret = -EINVAL;
+		pr_debug("  irq %d no 0x%x on %s\n", i, oirq.specifier[0],
+			 oirq.controller->full_name);
+		spu->irqs[i] = irq_create_of_mapping(oirq.controller,
+					oirq.specifier, oirq.size);
+		if (spu->irqs[i] == NO_IRQ) {
+			pr_debug("spu_new: failed to map it !\n");
+			goto err;
+		}
+	}
+	return 0;
+
+err:
+	pr_debug("failed to map irq %x for spu %s\n", *oirq.specifier,
+		spu->name);
+	for (; i >= 0; i--) {
+		if (spu->irqs[i] != NO_IRQ)
+			irq_dispose_mapping(spu->irqs[i]);
+	}
+	return ret;
+}
+
+static int spu_map_resource(struct device_node *node, int nr,
+		void __iomem** virt, unsigned long *phys)
+{
+	struct resource resource = { };
+	int ret;
+
+	ret = of_address_to_resource(node, nr, &resource);
+	if (ret)
+		goto out;
+
+	if (phys)
+		*phys = resource.start;
+	*virt = ioremap(resource.start, resource.end - resource.start);
+	if (!*virt)
+		ret = -EINVAL;
+
+out:
+	return ret;
+}
+
+static int __init spu_map_device(struct spu *spu, struct device_node *node)
+{
+	int ret = -ENODEV;
+	spu->name = get_property(node, "name", NULL);
+	if (!spu->name)
+		goto out;
+
+	ret = spu_map_resource(node, 0, (void __iomem**)&spu->local_store,
+					&spu->local_store_phys);
+	if (ret) {
+		pr_debug("spu_new: failed to map %s resource 0\n",
+			 node->full_name);
+		goto out;
+	}
+	ret = spu_map_resource(node, 1, (void __iomem**)&spu->problem,
+					&spu->problem_phys);
+	if (ret) {
+		pr_debug("spu_new: failed to map %s resource 1\n",
+			 node->full_name);
+		goto out_unmap;
+	}
+	ret = spu_map_resource(node, 2, (void __iomem**)&spu->priv2,
+					NULL);
+	if (ret) {
+		pr_debug("spu_new: failed to map %s resource 2\n",
+			 node->full_name);
+		goto out_unmap;
+	}
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		ret = spu_map_resource(node, 3,
+			(void __iomem**)&spu_get_pdata(spu)->priv1, NULL);
+	if (ret) {
+		pr_debug("spu_new: failed to map %s resource 3\n",
+			 node->full_name);
+		goto out_unmap;
+	}
+	pr_debug("spu_new: %s maps:\n", node->full_name);
+	pr_debug("  local store   : 0x%016lx -> 0x%p\n",
+		 spu->local_store_phys, spu->local_store);
+	pr_debug("  problem state : 0x%016lx -> 0x%p\n",
+		 spu->problem_phys, spu->problem);
+	pr_debug("  priv2         :                       0x%p\n", spu->priv2);
+	pr_debug("  priv1         :                       0x%p\n",
+						spu_get_pdata(spu)->priv1);
+
+	return 0;
+
+out_unmap:
+	spu_unmap(spu);
+out:
+	pr_debug("failed to map spe %s: %d\n", spu->name, ret);
+	return ret;
+}
+
+static int __init of_enumerate_spus(int (*fn)(void *data))
+{
+	int ret;
+	struct device_node *node;
+
+	ret = -ENODEV;
+	for (node = of_find_node_by_type(NULL, "spe");
+			node; node = of_find_node_by_type(node, "spe")) {
+		ret = fn(node);
+		if (ret) {
+			printk(KERN_WARNING "%s: Error initializing %s\n",
+				__FUNCTION__, node->name);
+			break;
+		}
+	}
+	return ret;
+}
+
+static int __init of_create_spu(struct spu *spu, void *data)
+{
+	int ret;
+	struct device_node *spe = (struct device_node *)data;
+
+	spu->pdata = kzalloc(sizeof(struct spu_pdata),
+		GFP_KERNEL);
+	if (!spu->pdata) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	spu->node = find_spu_node_id(spe);
+	if (spu->node >= MAX_NUMNODES) {
+		printk(KERN_WARNING "SPE %s on node %d ignored,"
+		       " node number too big\n", spe->full_name, spu->node);
+		printk(KERN_WARNING "Check if CONFIG_NUMA is enabled.\n");
+		ret = -ENODEV;
+		goto out_free;
+	}
+
+	spu_get_pdata(spu)->nid = of_node_to_nid(spe);
+	if (spu_get_pdata(spu)->nid == -1)
+		spu_get_pdata(spu)->nid = 0;
+
+	ret = spu_map_device(spu, spe);
+	/* try old method */
+	if (ret)
+		ret = spu_map_device_old(spu, spe);
+	if (ret)
+		goto out_free;
+
+	ret = spu_map_interrupts(spu, spe);
+	if (ret)
+		ret = spu_map_interrupts_old(spu, spe);
+	if (ret)
+		goto out_unmap;
+
+	spu_get_pdata(spu)->devnode = of_node_get(spe);
+
+	pr_debug(KERN_DEBUG "Using SPE %s %p %p %p %p %d\n", spu->name,
+		spu->local_store, spu->problem, spu_get_pdata(spu)->priv1,
+		spu->priv2, spu->number);
+	goto out;
+
+out_unmap:
+	spu_unmap(spu);
+out_free:
+	kfree(spu->pdata);
+	spu->pdata = NULL;
+out:
+	return ret;
+}
+
+static int of_destroy_spu(struct spu *spu)
+{
+	spu_unmap(spu);
+	of_node_put(spu_get_pdata(spu)->devnode);
+	kfree(spu->pdata);
+	spu->pdata = NULL;
+	return 0;
+}
+
+const struct spu_management_ops spu_management_of_ops = {
+	.enumerate_spus = of_enumerate_spus,
+	.create_spu = of_create_spu,
+	.destroy_spu = of_destroy_spu,
+};
 
 static void int_mask_and(struct spu *spu, int class, u64 mask)
 {
 	u64 old_mask;
 
-	old_mask = in_be64(&spu->priv1->int_mask_RW[class]);
-	out_be64(&spu->priv1->int_mask_RW[class], old_mask & mask);
+	old_mask = in_be64(&spu_get_pdata(spu)->priv1->int_mask_RW[class]);
+	out_be64(&spu_get_pdata(spu)->priv1->int_mask_RW[class],
+		old_mask & mask);
 }
 
 static void int_mask_or(struct spu *spu, int class, u64 mask)
 {
 	u64 old_mask;
 
-	old_mask = in_be64(&spu->priv1->int_mask_RW[class]);
-	out_be64(&spu->priv1->int_mask_RW[class], old_mask | mask);
+	old_mask = in_be64(&spu_get_pdata(spu)->priv1->int_mask_RW[class]);
+	out_be64(&spu_get_pdata(spu)->priv1->int_mask_RW[class],
+		old_mask | mask);
 }
 
 static void int_mask_set(struct spu *spu, int class, u64 mask)
 {
-	out_be64(&spu->priv1->int_mask_RW[class], mask);
+	out_be64(&spu_get_pdata(spu)->priv1->int_mask_RW[class], mask);
 }
 
 static u64 int_mask_get(struct spu *spu, int class)
 {
-	return in_be64(&spu->priv1->int_mask_RW[class]);
+	return in_be64(&spu_get_pdata(spu)->priv1->int_mask_RW[class]);
 }
 
 static void int_stat_clear(struct spu *spu, int class, u64 stat)
 {
-	out_be64(&spu->priv1->int_stat_RW[class], stat);
+	out_be64(&spu_get_pdata(spu)->priv1->int_stat_RW[class], stat);
 }
 
 static u64 int_stat_get(struct spu *spu, int class)
 {
-	return in_be64(&spu->priv1->int_stat_RW[class]);
+	return in_be64(&spu_get_pdata(spu)->priv1->int_stat_RW[class]);
 }
 
 static void cpu_affinity_set(struct spu *spu, int cpu)
 {
 	u64 target = iic_get_target_id(cpu);
 	u64 route = target << 48 | target << 32 | target << 16;
-	out_be64(&spu->priv1->int_route_RW, route);
+	out_be64(&spu_get_pdata(spu)->priv1->int_route_RW, route);
 }
 
 static u64 mfc_dar_get(struct spu *spu)
 {
-	return in_be64(&spu->priv1->mfc_dar_RW);
+	return in_be64(&spu_get_pdata(spu)->priv1->mfc_dar_RW);
 }
 
 static u64 mfc_dsisr_get(struct spu *spu)
 {
-	return in_be64(&spu->priv1->mfc_dsisr_RW);
+	return in_be64(&spu_get_pdata(spu)->priv1->mfc_dsisr_RW);
 }
 
 static void mfc_dsisr_set(struct spu *spu, u64 dsisr)
 {
-	out_be64(&spu->priv1->mfc_dsisr_RW, dsisr);
+	out_be64(&spu_get_pdata(spu)->priv1->mfc_dsisr_RW, dsisr);
 }
 
-static void mfc_sdr_set(struct spu *spu, u64 sdr)
+static void mfc_sdr_setup(struct spu *spu)
 {
-	out_be64(&spu->priv1->mfc_sdr_RW, sdr);
+	out_be64(&spu_get_pdata(spu)->priv1->mfc_sdr_RW, mfspr(SPRN_SDR1));
 }
 
 static void mfc_sr1_set(struct spu *spu, u64 sr1)
 {
-	out_be64(&spu->priv1->mfc_sr1_RW, sr1);
+	out_be64(&spu_get_pdata(spu)->priv1->mfc_sr1_RW, sr1);
 }
 
 static u64 mfc_sr1_get(struct spu *spu)
 {
-	return in_be64(&spu->priv1->mfc_sr1_RW);
+	return in_be64(&spu_get_pdata(spu)->priv1->mfc_sr1_RW);
 }
 
 static void mfc_tclass_id_set(struct spu *spu, u64 tclass_id)
 {
-	out_be64(&spu->priv1->mfc_tclass_id_RW, tclass_id);
+	out_be64(&spu_get_pdata(spu)->priv1->mfc_tclass_id_RW, tclass_id);
 }
 
 static u64 mfc_tclass_id_get(struct spu *spu)
 {
-	return in_be64(&spu->priv1->mfc_tclass_id_RW);
+	return in_be64(&spu_get_pdata(spu)->priv1->mfc_tclass_id_RW);
 }
 
 static void tlb_invalidate(struct spu *spu)
 {
-	out_be64(&spu->priv1->tlb_invalidate_entry_W, 0ul);
+	out_be64(&spu_get_pdata(spu)->priv1->tlb_invalidate_entry_W, 0ul);
 }
 
 static void resource_allocation_groupID_set(struct spu *spu, u64 id)
 {
-	out_be64(&spu->priv1->resource_allocation_groupID_RW, id);
+	out_be64(&spu_get_pdata(spu)->priv1->resource_allocation_groupID_RW,
+		id);
 }
 
 static u64 resource_allocation_groupID_get(struct spu *spu)
 {
-	return in_be64(&spu->priv1->resource_allocation_groupID_RW);
+	return in_be64(
+		&spu_get_pdata(spu)->priv1->resource_allocation_groupID_RW);
 }
 
 static void resource_allocation_enable_set(struct spu *spu, u64 enable)
 {
-	out_be64(&spu->priv1->resource_allocation_enable_RW, enable);
+	out_be64(&spu_get_pdata(spu)->priv1->resource_allocation_enable_RW,
+		enable);
 }
 
 static u64 resource_allocation_enable_get(struct spu *spu)
 {
-	return in_be64(&spu->priv1->resource_allocation_enable_RW);
+	return in_be64(
+		&spu_get_pdata(spu)->priv1->resource_allocation_enable_RW);
 }
 
 const struct spu_priv1_ops spu_priv1_mmio_ops =
@@ -146,7 +524,7 @@ const struct spu_priv1_ops spu_priv1_mmio_ops =
 	.mfc_dar_get = mfc_dar_get,
 	.mfc_dsisr_get = mfc_dsisr_get,
 	.mfc_dsisr_set = mfc_dsisr_set,
-	.mfc_sdr_set = mfc_sdr_set,
+	.mfc_sdr_setup = mfc_sdr_setup,
 	.mfc_sr1_set = mfc_sr1_set,
 	.mfc_sr1_get = mfc_sr1_get,
 	.mfc_tclass_id_set = mfc_tclass_id_set,
diff --git a/arch/powerpc/platforms/cell/spu_priv1_mmio.h b/arch/powerpc/platforms/cell/spu_priv1_mmio.h
new file mode 100644
index 00000000000..7b62bd1cc25
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spu_priv1_mmio.h
@@ -0,0 +1,26 @@
+/*
+ * spu hypervisor abstraction for direct hardware access.
+ *
+ *  Copyright (C) 2006 Sony Computer Entertainment Inc.
+ *  Copyright 2006 Sony Corp.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef SPU_PRIV1_MMIO_H
+#define SPU_PRIV1_MMIO_H
+
+struct device_node *spu_devnode(struct spu *spu);
+
+#endif /* SPU_PRIV1_MMIO_H */
diff --git a/arch/powerpc/platforms/cell/spufs/Makefile b/arch/powerpc/platforms/cell/spufs/Makefile
index ecdfbb35f82..472217d19fa 100644
--- a/arch/powerpc/platforms/cell/spufs/Makefile
+++ b/arch/powerpc/platforms/cell/spufs/Makefile
@@ -1,7 +1,7 @@
 obj-y += switch.o
 
 obj-$(CONFIG_SPU_FS) += spufs.o
-spufs-y += inode.o file.o context.o syscalls.o
+spufs-y += inode.o file.o context.o syscalls.o coredump.o
 spufs-y += sched.o backing_ops.o hw_ops.o run.o gang.o
 
 # Rules to build switch.o with the help of SPU tool chain
diff --git a/arch/powerpc/platforms/cell/spufs/backing_ops.c b/arch/powerpc/platforms/cell/spufs/backing_ops.c
index 2d22cd59d6f..1898f0d3a8b 100644
--- a/arch/powerpc/platforms/cell/spufs/backing_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/backing_ops.c
@@ -36,6 +36,7 @@
 #include <asm/io.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
+#include <asm/spu_info.h>
 #include <asm/mmu_context.h>
 #include "spufs.h"
 
@@ -267,6 +268,11 @@ static char *spu_backing_get_ls(struct spu_context *ctx)
 	return ctx->csa.lscsa->ls;
 }
 
+static u32 spu_backing_runcntl_read(struct spu_context *ctx)
+{
+	return ctx->csa.prob.spu_runcntl_RW;
+}
+
 static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
 {
 	spin_lock(&ctx->csa.register_lock);
@@ -279,9 +285,26 @@ static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
 	spin_unlock(&ctx->csa.register_lock);
 }
 
-static void spu_backing_runcntl_stop(struct spu_context *ctx)
+static void spu_backing_master_start(struct spu_context *ctx)
+{
+	struct spu_state *csa = &ctx->csa;
+	u64 sr1;
+
+	spin_lock(&csa->register_lock);
+	sr1 = csa->priv1.mfc_sr1_RW | MFC_STATE1_MASTER_RUN_CONTROL_MASK;
+	csa->priv1.mfc_sr1_RW = sr1;
+	spin_unlock(&csa->register_lock);
+}
+
+static void spu_backing_master_stop(struct spu_context *ctx)
 {
-	spu_backing_runcntl_write(ctx, SPU_RUNCNTL_STOP);
+	struct spu_state *csa = &ctx->csa;
+	u64 sr1;
+
+	spin_lock(&csa->register_lock);
+	sr1 = csa->priv1.mfc_sr1_RW & ~MFC_STATE1_MASTER_RUN_CONTROL_MASK;
+	csa->priv1.mfc_sr1_RW = sr1;
+	spin_unlock(&csa->register_lock);
 }
 
 static int spu_backing_set_mfc_query(struct spu_context * ctx, u32 mask,
@@ -345,8 +368,10 @@ struct spu_context_ops spu_backing_ops = {
 	.npc_write = spu_backing_npc_write,
 	.status_read = spu_backing_status_read,
 	.get_ls = spu_backing_get_ls,
+	.runcntl_read = spu_backing_runcntl_read,
 	.runcntl_write = spu_backing_runcntl_write,
-	.runcntl_stop = spu_backing_runcntl_stop,
+	.master_start = spu_backing_master_start,
+	.master_stop = spu_backing_master_stop,
 	.set_mfc_query = spu_backing_set_mfc_query,
 	.read_mfc_tagstatus = spu_backing_read_mfc_tagstatus,
 	.get_mfc_free_elements = spu_backing_get_mfc_free_elements,
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 034cf6af53a..0870009f56d 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -120,6 +120,33 @@ void spu_unmap_mappings(struct spu_context *ctx)
 		unmap_mapping_range(ctx->signal2, 0, 0x4000, 1);
 }
 
+int spu_acquire_exclusive(struct spu_context *ctx)
+{
+	int ret = 0;
+
+	down_write(&ctx->state_sema);
+	/* ctx is about to be freed, can't acquire any more */
+	if (!ctx->owner) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (ctx->state == SPU_STATE_SAVED) {
+		ret = spu_activate(ctx, 0);
+		if (ret)
+			goto out;
+		ctx->state = SPU_STATE_RUNNABLE;
+	} else {
+		/* We need to exclude userspace access to the context. */
+		spu_unmap_mappings(ctx);
+	}
+
+out:
+	if (ret)
+		up_write(&ctx->state_sema);
+	return ret;
+}
+
 int spu_acquire_runnable(struct spu_context *ctx)
 {
 	int ret = 0;
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
new file mode 100644
index 00000000000..26945c491f6
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -0,0 +1,238 @@
+/*
+ * SPU core dump code
+ *
+ * (C) Copyright 2006 IBM Corp.
+ *
+ * Author: Dwayne Grant McConnell <decimal@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/elf.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+#include "spufs.h"
+
+struct spufs_ctx_info {
+	struct list_head list;
+	int dfd;
+	int memsize; /* in bytes */
+	struct spu_context *ctx;
+};
+
+static LIST_HEAD(ctx_info_list);
+
+static ssize_t do_coredump_read(int num, struct spu_context *ctx, void __user *buffer,
+				size_t size, loff_t *off)
+{
+	u64 data;
+	int ret;
+
+	if (spufs_coredump_read[num].read)
+		return spufs_coredump_read[num].read(ctx, buffer, size, off);
+
+	data = spufs_coredump_read[num].get(ctx);
+	ret = copy_to_user(buffer, &data, 8);
+	return ret ? -EFAULT : 8;
+}
+
+/*
+ * These are the only things you should do on a core-file: use only these
+ * functions to write out all the necessary info.
+ */
+static int spufs_dump_write(struct file *file, const void *addr, int nr)
+{
+	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+
+static int spufs_dump_seek(struct file *file, loff_t off)
+{
+	if (file->f_op->llseek) {
+		if (file->f_op->llseek(file, off, 0) != off)
+			return 0;
+	} else
+		file->f_pos = off;
+	return 1;
+}
+
+static void spufs_fill_memsize(struct spufs_ctx_info *ctx_info)
+{
+	struct spu_context *ctx;
+	unsigned long long lslr;
+
+	ctx = ctx_info->ctx;
+	lslr = ctx->csa.priv2.spu_lslr_RW;
+	ctx_info->memsize = lslr + 1;
+}
+
+static int spufs_ctx_note_size(struct spufs_ctx_info *ctx_info)
+{
+	int dfd, memsize, i, sz, total = 0;
+	char *name;
+	char fullname[80];
+
+	dfd = ctx_info->dfd;
+	memsize = ctx_info->memsize;
+
+	for (i = 0; spufs_coredump_read[i].name; i++) {
+		name = spufs_coredump_read[i].name;
+		sz = spufs_coredump_read[i].size;
+
+		sprintf(fullname, "SPU/%d/%s", dfd, name);
+
+		total += sizeof(struct elf_note);
+		total += roundup(strlen(fullname) + 1, 4);
+		if (!strcmp(name, "mem"))
+			total += roundup(memsize, 4);
+		else
+			total += roundup(sz, 4);
+	}
+
+	return total;
+}
+
+static int spufs_add_one_context(struct file *file, int dfd)
+{
+	struct spu_context *ctx;
+	struct spufs_ctx_info *ctx_info;
+	int size;
+
+	ctx = SPUFS_I(file->f_dentry->d_inode)->i_ctx;
+	if (ctx->flags & SPU_CREATE_NOSCHED)
+		return 0;
+
+	ctx_info = kzalloc(sizeof(*ctx_info), GFP_KERNEL);
+	if (unlikely(!ctx_info))
+		return -ENOMEM;
+
+	ctx_info->dfd = dfd;
+	ctx_info->ctx = ctx;
+
+	spufs_fill_memsize(ctx_info);
+
+	size = spufs_ctx_note_size(ctx_info);
+	list_add(&ctx_info->list, &ctx_info_list);
+	return size;
+}
+
+/*
+ * The additional architecture-specific notes for Cell are various
+ * context files in the spu context.
+ *
+ * This function iterates over all open file descriptors and sees
+ * if they are a directory in spufs.  In that case we use spufs
+ * internal functionality to dump them without needing to actually
+ * open the files.
+ */
+static int spufs_arch_notes_size(void)
+{
+	struct fdtable *fdt = files_fdtable(current->files);
+	int size = 0, fd;
+
+	for (fd = 0; fd < fdt->max_fdset && fd < fdt->max_fds; fd++) {
+		if (FD_ISSET(fd, fdt->open_fds)) {
+			struct file *file = fcheck(fd);
+
+			if (file && file->f_op == &spufs_context_fops) {
+				int rval = spufs_add_one_context(file, fd);
+				if (rval < 0)
+					break;
+				size += rval;
+			}
+		}
+	}
+
+	return size;
+}
+
+static void spufs_arch_write_note(struct spufs_ctx_info *ctx_info, int i,
+				struct file *file)
+{
+	struct spu_context *ctx;
+	loff_t pos = 0;
+	int sz, dfd, rc, total = 0;
+	const int bufsz = 4096;
+	char *name;
+	char fullname[80], *buf;
+	struct elf_note en;
+
+	buf = kmalloc(bufsz, GFP_KERNEL);
+	if (!buf)
+		return;
+
+	dfd = ctx_info->dfd;
+	name = spufs_coredump_read[i].name;
+
+	if (!strcmp(name, "mem"))
+		sz = ctx_info->memsize;
+	else
+		sz = spufs_coredump_read[i].size;
+
+	ctx = ctx_info->ctx;
+	if (!ctx) {
+		return;
+	}
+
+	sprintf(fullname, "SPU/%d/%s", dfd, name);
+	en.n_namesz = strlen(fullname) + 1;
+	en.n_descsz = sz;
+	en.n_type = NT_SPU;
+
+	if (!spufs_dump_write(file, &en, sizeof(en)))
+		return;
+	if (!spufs_dump_write(file, fullname, en.n_namesz))
+		return;
+	if (!spufs_dump_seek(file, roundup((unsigned long)file->f_pos, 4)))
+		return;
+
+	do {
+		rc = do_coredump_read(i, ctx, buf, bufsz, &pos);
+		if (rc > 0) {
+			if (!spufs_dump_write(file, buf, rc))
+				return;
+			total += rc;
+		}
+	} while (rc == bufsz && total < sz);
+
+	spufs_dump_seek(file, roundup((unsigned long)file->f_pos
+						- total + sz, 4));
+}
+
+static void spufs_arch_write_notes(struct file *file)
+{
+	int j;
+	struct spufs_ctx_info *ctx_info, *next;
+
+	list_for_each_entry_safe(ctx_info, next, &ctx_info_list, list) {
+		spu_acquire_saved(ctx_info->ctx);
+		for (j = 0; j < spufs_coredump_num_notes; j++)
+			spufs_arch_write_note(ctx_info, j, file);
+		spu_release(ctx_info->ctx);
+		list_del(&ctx_info->list);
+		kfree(ctx_info);
+	}
+}
+
+struct spu_coredump_calls spufs_coredump_calls = {
+	.arch_notes_size = spufs_arch_notes_size,
+	.arch_write_notes = spufs_arch_write_notes,
+	.owner = THIS_MODULE,
+};
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 533e2723e18..347eff56fcb 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -32,13 +32,13 @@
 #include <asm/io.h>
 #include <asm/semaphore.h>
 #include <asm/spu.h>
+#include <asm/spu_info.h>
 #include <asm/uaccess.h>
 
 #include "spufs.h"
 
 #define SPUFS_MMAP_4K (PAGE_SIZE == 0x1000)
 
-
 static int
 spufs_mem_open(struct inode *inode, struct file *file)
 {
@@ -51,18 +51,23 @@ spufs_mem_open(struct inode *inode, struct file *file)
 }
 
 static ssize_t
+__spufs_mem_read(struct spu_context *ctx, char __user *buffer,
+			size_t size, loff_t *pos)
+{
+	char *local_store = ctx->ops->get_ls(ctx);
+	return simple_read_from_buffer(buffer, size, pos, local_store,
+					LS_SIZE);
+}
+
+static ssize_t
 spufs_mem_read(struct file *file, char __user *buffer,
 				size_t size, loff_t *pos)
 {
-	struct spu_context *ctx = file->private_data;
-	char *local_store;
 	int ret;
+	struct spu_context *ctx = file->private_data;
 
 	spu_acquire(ctx);
-
-	local_store = ctx->ops->get_ls(ctx);
-	ret = simple_read_from_buffer(buffer, size, pos, local_store, LS_SIZE);
-
+	ret = __spufs_mem_read(ctx, buffer, size, pos);
 	spu_release(ctx);
 	return ret;
 }
@@ -104,11 +109,11 @@ spufs_mem_mmap_nopage(struct vm_area_struct *vma,
 
 	if (ctx->state == SPU_STATE_SAVED) {
 		vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-					& ~(_PAGE_NO_CACHE | _PAGE_GUARDED));
+							& ~_PAGE_NO_CACHE);
 		page = vmalloc_to_page(ctx->csa.lscsa->ls + offset);
 	} else {
 		vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-					| _PAGE_NO_CACHE | _PAGE_GUARDED);
+							| _PAGE_NO_CACHE);
 		page = pfn_to_page((ctx->spu->local_store_phys + offset)
 				   >> PAGE_SHIFT);
 	}
@@ -131,7 +136,7 @@ spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	/* FIXME: */
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE);
 
@@ -200,7 +205,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
 
@@ -261,18 +266,23 @@ spufs_regs_open(struct inode *inode, struct file *file)
 }
 
 static ssize_t
+__spufs_regs_read(struct spu_context *ctx, char __user *buffer,
+			size_t size, loff_t *pos)
+{
+	struct spu_lscsa *lscsa = ctx->csa.lscsa;
+	return simple_read_from_buffer(buffer, size, pos,
+				      lscsa->gprs, sizeof lscsa->gprs);
+}
+
+static ssize_t
 spufs_regs_read(struct file *file, char __user *buffer,
 		size_t size, loff_t *pos)
 {
-	struct spu_context *ctx = file->private_data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	int ret;
+	struct spu_context *ctx = file->private_data;
 
 	spu_acquire_saved(ctx);
-
-	ret = simple_read_from_buffer(buffer, size, pos,
-				      lscsa->gprs, sizeof lscsa->gprs);
-
+	ret = __spufs_regs_read(ctx, buffer, size, pos);
 	spu_release(ctx);
 	return ret;
 }
@@ -307,18 +317,23 @@ static struct file_operations spufs_regs_fops = {
 };
 
 static ssize_t
+__spufs_fpcr_read(struct spu_context *ctx, char __user * buffer,
+			size_t size, loff_t * pos)
+{
+	struct spu_lscsa *lscsa = ctx->csa.lscsa;
+	return simple_read_from_buffer(buffer, size, pos,
+				      &lscsa->fpcr, sizeof(lscsa->fpcr));
+}
+
+static ssize_t
 spufs_fpcr_read(struct file *file, char __user * buffer,
 		size_t size, loff_t * pos)
 {
-	struct spu_context *ctx = file->private_data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	int ret;
+	struct spu_context *ctx = file->private_data;
 
 	spu_acquire_saved(ctx);
-
-	ret = simple_read_from_buffer(buffer, size, pos,
-				      &lscsa->fpcr, sizeof(lscsa->fpcr));
-
+	ret = __spufs_fpcr_read(ctx, buffer, size, pos);
 	spu_release(ctx);
 	return ret;
 }
@@ -718,23 +733,41 @@ static int spufs_signal1_open(struct inode *inode, struct file *file)
 	return nonseekable_open(inode, file);
 }
 
-static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
+static ssize_t __spufs_signal1_read(struct spu_context *ctx, char __user *buf,
 			size_t len, loff_t *pos)
 {
-	struct spu_context *ctx = file->private_data;
+	int ret = 0;
 	u32 data;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
-	data = ctx->ops->signal1_read(ctx);
-	spu_release(ctx);
+	if (ctx->csa.spu_chnlcnt_RW[3]) {
+		data = ctx->csa.spu_chnldata_RW[3];
+		ret = 4;
+	}
+
+	if (!ret)
+		goto out;
 
 	if (copy_to_user(buf, &data, 4))
 		return -EFAULT;
 
-	return 4;
+out:
+	return ret;
+}
+
+static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	int ret;
+	struct spu_context *ctx = file->private_data;
+
+	spu_acquire_saved(ctx);
+	ret = __spufs_signal1_read(ctx, buf, len, pos);
+	spu_release(ctx);
+
+	return ret;
 }
 
 static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
@@ -782,7 +815,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
 
@@ -807,25 +840,41 @@ static int spufs_signal2_open(struct inode *inode, struct file *file)
 	return nonseekable_open(inode, file);
 }
 
-static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
+static ssize_t __spufs_signal2_read(struct spu_context *ctx, char __user *buf,
 			size_t len, loff_t *pos)
 {
-	struct spu_context *ctx;
+	int ret = 0;
 	u32 data;
 
-	ctx = file->private_data;
-
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
-	data = ctx->ops->signal2_read(ctx);
-	spu_release(ctx);
+	if (ctx->csa.spu_chnlcnt_RW[4]) {
+		data =  ctx->csa.spu_chnldata_RW[4];
+		ret = 4;
+	}
+
+	if (!ret)
+		goto out;
 
 	if (copy_to_user(buf, &data, 4))
 		return -EFAULT;
 
-	return 4;
+out:
+	return ret;
+}
+
+static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx = file->private_data;
+	int ret;
+
+	spu_acquire_saved(ctx);
+	ret = __spufs_signal2_read(ctx, buf, len, pos);
+	spu_release(ctx);
+
+	return ret;
 }
 
 static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
@@ -874,8 +923,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	/* FIXME: */
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
 
@@ -902,13 +950,19 @@ static void spufs_signal1_type_set(void *data, u64 val)
 	spu_release(ctx);
 }
 
+static u64 __spufs_signal1_type_get(void *data)
+{
+	struct spu_context *ctx = data;
+	return ctx->ops->signal1_type_get(ctx);
+}
+
 static u64 spufs_signal1_type_get(void *data)
 {
 	struct spu_context *ctx = data;
 	u64 ret;
 
 	spu_acquire(ctx);
-	ret = ctx->ops->signal1_type_get(ctx);
+	ret = __spufs_signal1_type_get(data);
 	spu_release(ctx);
 
 	return ret;
@@ -925,13 +979,19 @@ static void spufs_signal2_type_set(void *data, u64 val)
 	spu_release(ctx);
 }
 
+static u64 __spufs_signal2_type_get(void *data)
+{
+	struct spu_context *ctx = data;
+	return ctx->ops->signal2_type_get(ctx);
+}
+
 static u64 spufs_signal2_type_get(void *data)
 {
 	struct spu_context *ctx = data;
 	u64 ret;
 
 	spu_acquire(ctx);
-	ret = ctx->ops->signal2_type_get(ctx);
+	ret = __spufs_signal2_type_get(data);
 	spu_release(ctx);
 
 	return ret;
@@ -958,7 +1018,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
 
@@ -1000,7 +1060,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
 
@@ -1041,7 +1101,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO;
 	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
 				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
 
@@ -1265,6 +1325,7 @@ static ssize_t spufs_mfc_write(struct file *file, const char __user *buffer,
 		goto out;
 
 	ctx->tagwait |= 1 << cmd.tag;
+	ret = size;
 
 out:
 	return ret;
@@ -1360,7 +1421,8 @@ static u64 spufs_npc_get(void *data)
 	spu_release(ctx);
 	return ret;
 }
-DEFINE_SIMPLE_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set, "%llx\n")
+DEFINE_SIMPLE_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set,
+			"0x%llx\n")
 
 static void spufs_decr_set(void *data, u64 val)
 {
@@ -1371,18 +1433,24 @@ static void spufs_decr_set(void *data, u64 val)
 	spu_release(ctx);
 }
 
-static u64 spufs_decr_get(void *data)
+static u64 __spufs_decr_get(void *data)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
+	return lscsa->decr.slot[0];
+}
+
+static u64 spufs_decr_get(void *data)
+{
+	struct spu_context *ctx = data;
 	u64 ret;
 	spu_acquire_saved(ctx);
-	ret = lscsa->decr.slot[0];
+	ret = __spufs_decr_get(data);
 	spu_release(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
-			"%llx\n")
+			"0x%llx\n")
 
 static void spufs_decr_status_set(void *data, u64 val)
 {
@@ -1393,62 +1461,76 @@ static void spufs_decr_status_set(void *data, u64 val)
 	spu_release(ctx);
 }
 
-static u64 spufs_decr_status_get(void *data)
+static u64 __spufs_decr_status_get(void *data)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
+	return lscsa->decr_status.slot[0];
+}
+
+static u64 spufs_decr_status_get(void *data)
+{
+	struct spu_context *ctx = data;
 	u64 ret;
 	spu_acquire_saved(ctx);
-	ret = lscsa->decr_status.slot[0];
+	ret = __spufs_decr_status_get(data);
 	spu_release(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
-			spufs_decr_status_set, "%llx\n")
+			spufs_decr_status_set, "0x%llx\n")
 
-static void spufs_spu_tag_mask_set(void *data, u64 val)
+static void spufs_event_mask_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
-	lscsa->tag_mask.slot[0] = (u32) val;
+	lscsa->event_mask.slot[0] = (u32) val;
 	spu_release(ctx);
 }
 
-static u64 spufs_spu_tag_mask_get(void *data)
+static u64 __spufs_event_mask_get(void *data)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
+	return lscsa->event_mask.slot[0];
+}
+
+static u64 spufs_event_mask_get(void *data)
+{
+	struct spu_context *ctx = data;
 	u64 ret;
 	spu_acquire_saved(ctx);
-	ret = lscsa->tag_mask.slot[0];
+	ret = __spufs_event_mask_get(data);
 	spu_release(ctx);
 	return ret;
 }
-DEFINE_SIMPLE_ATTRIBUTE(spufs_spu_tag_mask_ops, spufs_spu_tag_mask_get,
-			spufs_spu_tag_mask_set, "%llx\n")
+DEFINE_SIMPLE_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get,
+			spufs_event_mask_set, "0x%llx\n")
 
-static void spufs_event_mask_set(void *data, u64 val)
+static u64 __spufs_event_status_get(void *data)
 {
 	struct spu_context *ctx = data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
-	lscsa->event_mask.slot[0] = (u32) val;
-	spu_release(ctx);
+	struct spu_state *state = &ctx->csa;
+	u64 stat;
+	stat = state->spu_chnlcnt_RW[0];
+	if (stat)
+		return state->spu_chnldata_RW[0];
+	return 0;
 }
 
-static u64 spufs_event_mask_get(void *data)
+static u64 spufs_event_status_get(void *data)
 {
 	struct spu_context *ctx = data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	u64 ret;
+	u64 ret = 0;
+
 	spu_acquire_saved(ctx);
-	ret = lscsa->event_mask.slot[0];
+	ret = __spufs_event_status_get(data);
 	spu_release(ctx);
 	return ret;
 }
-DEFINE_SIMPLE_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get,
-			spufs_event_mask_set, "%llx\n")
+DEFINE_SIMPLE_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
+			NULL, "0x%llx\n")
 
 static void spufs_srr0_set(void *data, u64 val)
 {
@@ -1470,7 +1552,7 @@ static u64 spufs_srr0_get(void *data)
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_srr0_ops, spufs_srr0_get, spufs_srr0_set,
-			"%llx\n")
+			"0x%llx\n")
 
 static u64 spufs_id_get(void *data)
 {
@@ -1488,12 +1570,18 @@ static u64 spufs_id_get(void *data)
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_id_ops, spufs_id_get, NULL, "0x%llx\n")
 
-static u64 spufs_object_id_get(void *data)
+static u64 __spufs_object_id_get(void *data)
 {
 	struct spu_context *ctx = data;
 	return ctx->object_id;
 }
 
+static u64 spufs_object_id_get(void *data)
+{
+	/* FIXME: Should there really be no locking here? */
+	return __spufs_object_id_get(data);
+}
+
 static void spufs_object_id_set(void *data, u64 id)
 {
 	struct spu_context *ctx = data;
@@ -1503,6 +1591,250 @@ static void spufs_object_id_set(void *data, u64 id)
 DEFINE_SIMPLE_ATTRIBUTE(spufs_object_id_ops, spufs_object_id_get,
 		spufs_object_id_set, "0x%llx\n");
 
+static u64 __spufs_lslr_get(void *data)
+{
+	struct spu_context *ctx = data;
+	return ctx->csa.priv2.spu_lslr_RW;
+}
+
+static u64 spufs_lslr_get(void *data)
+{
+	struct spu_context *ctx = data;
+	u64 ret;
+
+	spu_acquire_saved(ctx);
+	ret = __spufs_lslr_get(data);
+	spu_release(ctx);
+
+	return ret;
+}
+DEFINE_SIMPLE_ATTRIBUTE(spufs_lslr_ops, spufs_lslr_get, NULL, "0x%llx\n")
+
+static int spufs_info_open(struct inode *inode, struct file *file)
+{
+	struct spufs_inode_info *i = SPUFS_I(inode);
+	struct spu_context *ctx = i->i_ctx;
+	file->private_data = ctx;
+	return 0;
+}
+
+static ssize_t __spufs_mbox_info_read(struct spu_context *ctx,
+			char __user *buf, size_t len, loff_t *pos)
+{
+	u32 mbox_stat;
+	u32 data;
+
+	mbox_stat = ctx->csa.prob.mb_stat_R;
+	if (mbox_stat & 0x0000ff) {
+		data = ctx->csa.prob.pu_mb_R;
+	}
+
+	return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
+}
+
+static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	int ret;
+	struct spu_context *ctx = file->private_data;
+
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	spu_acquire_saved(ctx);
+	spin_lock(&ctx->csa.register_lock);
+	ret = __spufs_mbox_info_read(ctx, buf, len, pos);
+	spin_unlock(&ctx->csa.register_lock);
+	spu_release(ctx);
+
+	return ret;
+}
+
+static struct file_operations spufs_mbox_info_fops = {
+	.open = spufs_info_open,
+	.read = spufs_mbox_info_read,
+	.llseek  = generic_file_llseek,
+};
+
+static ssize_t __spufs_ibox_info_read(struct spu_context *ctx,
+				char __user *buf, size_t len, loff_t *pos)
+{
+	u32 ibox_stat;
+	u32 data;
+
+	ibox_stat = ctx->csa.prob.mb_stat_R;
+	if (ibox_stat & 0xff0000) {
+		data = ctx->csa.priv2.puint_mb_R;
+	}
+
+	return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
+}
+
+static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	struct spu_context *ctx = file->private_data;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	spu_acquire_saved(ctx);
+	spin_lock(&ctx->csa.register_lock);
+	ret = __spufs_ibox_info_read(ctx, buf, len, pos);
+	spin_unlock(&ctx->csa.register_lock);
+	spu_release(ctx);
+
+	return ret;
+}
+
+static struct file_operations spufs_ibox_info_fops = {
+	.open = spufs_info_open,
+	.read = spufs_ibox_info_read,
+	.llseek  = generic_file_llseek,
+};
+
+static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
+			char __user *buf, size_t len, loff_t *pos)
+{
+	int i, cnt;
+	u32 data[4];
+	u32 wbox_stat;
+
+	wbox_stat = ctx->csa.prob.mb_stat_R;
+	cnt = 4 - ((wbox_stat & 0x00ff00) >> 8);
+	for (i = 0; i < cnt; i++) {
+		data[i] = ctx->csa.spu_mailbox_data[i];
+	}
+
+	return simple_read_from_buffer(buf, len, pos, &data,
+				cnt * sizeof(u32));
+}
+
+static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	struct spu_context *ctx = file->private_data;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	spu_acquire_saved(ctx);
+	spin_lock(&ctx->csa.register_lock);
+	ret = __spufs_wbox_info_read(ctx, buf, len, pos);
+	spin_unlock(&ctx->csa.register_lock);
+	spu_release(ctx);
+
+	return ret;
+}
+
+static struct file_operations spufs_wbox_info_fops = {
+	.open = spufs_info_open,
+	.read = spufs_wbox_info_read,
+	.llseek  = generic_file_llseek,
+};
+
+static ssize_t __spufs_dma_info_read(struct spu_context *ctx,
+			char __user *buf, size_t len, loff_t *pos)
+{
+	struct spu_dma_info info;
+	struct mfc_cq_sr *qp, *spuqp;
+	int i;
+
+	info.dma_info_type = ctx->csa.priv2.spu_tag_status_query_RW;
+	info.dma_info_mask = ctx->csa.lscsa->tag_mask.slot[0];
+	info.dma_info_status = ctx->csa.spu_chnldata_RW[24];
+	info.dma_info_stall_and_notify = ctx->csa.spu_chnldata_RW[25];
+	info.dma_info_atomic_command_status = ctx->csa.spu_chnldata_RW[27];
+	for (i = 0; i < 16; i++) {
+		qp = &info.dma_info_command_data[i];
+		spuqp = &ctx->csa.priv2.spuq[i];
+
+		qp->mfc_cq_data0_RW = spuqp->mfc_cq_data0_RW;
+		qp->mfc_cq_data1_RW = spuqp->mfc_cq_data1_RW;
+		qp->mfc_cq_data2_RW = spuqp->mfc_cq_data2_RW;
+		qp->mfc_cq_data3_RW = spuqp->mfc_cq_data3_RW;
+	}
+
+	return simple_read_from_buffer(buf, len, pos, &info,
+				sizeof info);
+}
+
+static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
+			      size_t len, loff_t *pos)
+{
+	struct spu_context *ctx = file->private_data;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	spu_acquire_saved(ctx);
+	spin_lock(&ctx->csa.register_lock);
+	ret = __spufs_dma_info_read(ctx, buf, len, pos);
+	spin_unlock(&ctx->csa.register_lock);
+	spu_release(ctx);
+
+	return ret;
+}
+
+static struct file_operations spufs_dma_info_fops = {
+	.open = spufs_info_open,
+	.read = spufs_dma_info_read,
+};
+
+static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx,
+			char __user *buf, size_t len, loff_t *pos)
+{
+	struct spu_proxydma_info info;
+	struct mfc_cq_sr *qp, *puqp;
+	int ret = sizeof info;
+	int i;
+
+	if (len < ret)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	info.proxydma_info_type = ctx->csa.prob.dma_querytype_RW;
+	info.proxydma_info_mask = ctx->csa.prob.dma_querymask_RW;
+	info.proxydma_info_status = ctx->csa.prob.dma_tagstatus_R;
+	for (i = 0; i < 8; i++) {
+		qp = &info.proxydma_info_command_data[i];
+		puqp = &ctx->csa.priv2.puq[i];
+
+		qp->mfc_cq_data0_RW = puqp->mfc_cq_data0_RW;
+		qp->mfc_cq_data1_RW = puqp->mfc_cq_data1_RW;
+		qp->mfc_cq_data2_RW = puqp->mfc_cq_data2_RW;
+		qp->mfc_cq_data3_RW = puqp->mfc_cq_data3_RW;
+	}
+
+	return simple_read_from_buffer(buf, len, pos, &info,
+				sizeof info);
+}
+
+static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	struct spu_context *ctx = file->private_data;
+	int ret;
+
+	spu_acquire_saved(ctx);
+	spin_lock(&ctx->csa.register_lock);
+	ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
+	spin_unlock(&ctx->csa.register_lock);
+	spu_release(ctx);
+
+	return ret;
+}
+
+static struct file_operations spufs_proxydma_info_fops = {
+	.open = spufs_info_open,
+	.read = spufs_proxydma_info_read,
+};
+
 struct tree_descr spufs_dir_contents[] = {
 	{ "mem",  &spufs_mem_fops,  0666, },
 	{ "regs", &spufs_regs_fops,  0666, },
@@ -1516,18 +1848,70 @@ struct tree_descr spufs_dir_contents[] = {
 	{ "signal2", &spufs_signal2_fops, 0666, },
 	{ "signal1_type", &spufs_signal1_type, 0666, },
 	{ "signal2_type", &spufs_signal2_type, 0666, },
-	{ "mss", &spufs_mss_fops, 0666, },
-	{ "mfc", &spufs_mfc_fops, 0666, },
 	{ "cntl", &spufs_cntl_fops,  0666, },
-	{ "npc", &spufs_npc_ops, 0666, },
 	{ "fpcr", &spufs_fpcr_fops, 0666, },
+	{ "lslr", &spufs_lslr_ops, 0444, },
+	{ "mfc", &spufs_mfc_fops, 0666, },
+	{ "mss", &spufs_mss_fops, 0666, },
+	{ "npc", &spufs_npc_ops, 0666, },
+	{ "srr0", &spufs_srr0_ops, 0666, },
 	{ "decr", &spufs_decr_ops, 0666, },
 	{ "decr_status", &spufs_decr_status_ops, 0666, },
-	{ "spu_tag_mask", &spufs_spu_tag_mask_ops, 0666, },
 	{ "event_mask", &spufs_event_mask_ops, 0666, },
-	{ "srr0", &spufs_srr0_ops, 0666, },
+	{ "event_status", &spufs_event_status_ops, 0444, },
+	{ "psmap", &spufs_psmap_fops, 0666, },
+	{ "phys-id", &spufs_id_ops, 0666, },
+	{ "object-id", &spufs_object_id_ops, 0666, },
+	{ "mbox_info", &spufs_mbox_info_fops, 0444, },
+	{ "ibox_info", &spufs_ibox_info_fops, 0444, },
+	{ "wbox_info", &spufs_wbox_info_fops, 0444, },
+	{ "dma_info", &spufs_dma_info_fops, 0444, },
+	{ "proxydma_info", &spufs_proxydma_info_fops, 0444, },
+	{},
+};
+
+struct tree_descr spufs_dir_nosched_contents[] = {
+	{ "mem",  &spufs_mem_fops,  0666, },
+	{ "mbox", &spufs_mbox_fops, 0444, },
+	{ "ibox", &spufs_ibox_fops, 0444, },
+	{ "wbox", &spufs_wbox_fops, 0222, },
+	{ "mbox_stat", &spufs_mbox_stat_fops, 0444, },
+	{ "ibox_stat", &spufs_ibox_stat_fops, 0444, },
+	{ "wbox_stat", &spufs_wbox_stat_fops, 0444, },
+	{ "signal1", &spufs_signal1_fops, 0666, },
+	{ "signal2", &spufs_signal2_fops, 0666, },
+	{ "signal1_type", &spufs_signal1_type, 0666, },
+	{ "signal2_type", &spufs_signal2_type, 0666, },
+	{ "mss", &spufs_mss_fops, 0666, },
+	{ "mfc", &spufs_mfc_fops, 0666, },
+	{ "cntl", &spufs_cntl_fops,  0666, },
+	{ "npc", &spufs_npc_ops, 0666, },
 	{ "psmap", &spufs_psmap_fops, 0666, },
 	{ "phys-id", &spufs_id_ops, 0666, },
 	{ "object-id", &spufs_object_id_ops, 0666, },
 	{},
 };
+
+struct spufs_coredump_reader spufs_coredump_read[] = {
+	{ "regs", __spufs_regs_read, NULL, 128 * 16 },
+	{ "fpcr", __spufs_fpcr_read, NULL, 16 },
+	{ "lslr", NULL, __spufs_lslr_get, 11 },
+	{ "decr", NULL, __spufs_decr_get, 11 },
+	{ "decr_status", NULL, __spufs_decr_status_get, 11 },
+	{ "mem", __spufs_mem_read, NULL, 256 * 1024, },
+	{ "signal1", __spufs_signal1_read, NULL, 4 },
+	{ "signal1_type", NULL, __spufs_signal1_type_get, 2 },
+	{ "signal2", __spufs_signal2_read, NULL, 4 },
+	{ "signal2_type", NULL, __spufs_signal2_type_get, 2 },
+	{ "event_mask", NULL, __spufs_event_mask_get, 8 },
+	{ "event_status", NULL, __spufs_event_status_get, 8 },
+	{ "mbox_info", __spufs_mbox_info_read, NULL, 4 },
+	{ "ibox_info", __spufs_ibox_info_read, NULL, 4 },
+	{ "wbox_info", __spufs_wbox_info_read, NULL, 16 },
+	{ "dma_info", __spufs_dma_info_read, NULL, 69 * 8 },
+	{ "proxydma_info", __spufs_proxydma_info_read, NULL, 35 * 8 },
+	{ "object-id", NULL, __spufs_object_id_get, 19 },
+	{ },
+};
+int spufs_coredump_num_notes = ARRAY_SIZE(spufs_coredump_read) - 1;
+
diff --git a/arch/powerpc/platforms/cell/spufs/hw_ops.c b/arch/powerpc/platforms/cell/spufs/hw_ops.c
index d805ffed892..ae42e03b8c8 100644
--- a/arch/powerpc/platforms/cell/spufs/hw_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/hw_ops.c
@@ -135,21 +135,11 @@ static int spu_hw_wbox_write(struct spu_context *ctx, u32 data)
 	return ret;
 }
 
-static u32 spu_hw_signal1_read(struct spu_context *ctx)
-{
-	return in_be32(&ctx->spu->problem->signal_notify1);
-}
-
 static void spu_hw_signal1_write(struct spu_context *ctx, u32 data)
 {
 	out_be32(&ctx->spu->problem->signal_notify1, data);
 }
 
-static u32 spu_hw_signal2_read(struct spu_context *ctx)
-{
-	return in_be32(&ctx->spu->problem->signal_notify2);
-}
-
 static void spu_hw_signal2_write(struct spu_context *ctx, u32 data)
 {
 	out_be32(&ctx->spu->problem->signal_notify2, data);
@@ -217,21 +207,42 @@ static char *spu_hw_get_ls(struct spu_context *ctx)
 	return ctx->spu->local_store;
 }
 
-static void spu_hw_runcntl_write(struct spu_context *ctx, u32 val)
+static u32 spu_hw_runcntl_read(struct spu_context *ctx)
 {
-	eieio();
-	out_be32(&ctx->spu->problem->spu_runcntl_RW, val);
+	return in_be32(&ctx->spu->problem->spu_runcntl_RW);
 }
 
-static void spu_hw_runcntl_stop(struct spu_context *ctx)
+static void spu_hw_runcntl_write(struct spu_context *ctx, u32 val)
 {
 	spin_lock_irq(&ctx->spu->register_lock);
-	out_be32(&ctx->spu->problem->spu_runcntl_RW, SPU_RUNCNTL_STOP);
-	while (in_be32(&ctx->spu->problem->spu_status_R) & SPU_STATUS_RUNNING)
-		cpu_relax();
+	if (val & SPU_RUNCNTL_ISOLATE)
+		out_be64(&ctx->spu->priv2->spu_privcntl_RW, 4LL);
+	out_be32(&ctx->spu->problem->spu_runcntl_RW, val);
 	spin_unlock_irq(&ctx->spu->register_lock);
 }
 
+static void spu_hw_master_start(struct spu_context *ctx)
+{
+	struct spu *spu = ctx->spu;
+	u64 sr1;
+
+	spin_lock_irq(&spu->register_lock);
+	sr1 = spu_mfc_sr1_get(spu) | MFC_STATE1_MASTER_RUN_CONTROL_MASK;
+	spu_mfc_sr1_set(spu, sr1);
+	spin_unlock_irq(&spu->register_lock);
+}
+
+static void spu_hw_master_stop(struct spu_context *ctx)
+{
+	struct spu *spu = ctx->spu;
+	u64 sr1;
+
+	spin_lock_irq(&spu->register_lock);
+	sr1 = spu_mfc_sr1_get(spu) & ~MFC_STATE1_MASTER_RUN_CONTROL_MASK;
+	spu_mfc_sr1_set(spu, sr1);
+	spin_unlock_irq(&spu->register_lock);
+}
+
 static int spu_hw_set_mfc_query(struct spu_context * ctx, u32 mask, u32 mode)
 {
 	struct spu_problem __iomem *prob = ctx->spu->problem;
@@ -291,9 +302,7 @@ struct spu_context_ops spu_hw_ops = {
 	.mbox_stat_poll = spu_hw_mbox_stat_poll,
 	.ibox_read = spu_hw_ibox_read,
 	.wbox_write = spu_hw_wbox_write,
-	.signal1_read = spu_hw_signal1_read,
 	.signal1_write = spu_hw_signal1_write,
-	.signal2_read = spu_hw_signal2_read,
 	.signal2_write = spu_hw_signal2_write,
 	.signal1_type_set = spu_hw_signal1_type_set,
 	.signal1_type_get = spu_hw_signal1_type_get,
@@ -303,8 +312,10 @@ struct spu_context_ops spu_hw_ops = {
 	.npc_write = spu_hw_npc_write,
 	.status_read = spu_hw_status_read,
 	.get_ls = spu_hw_get_ls,
+	.runcntl_read = spu_hw_runcntl_read,
 	.runcntl_write = spu_hw_runcntl_write,
-	.runcntl_stop = spu_hw_runcntl_stop,
+	.master_start = spu_hw_master_start,
+	.master_stop = spu_hw_master_stop,
 	.set_mfc_query = spu_hw_set_mfc_query,
 	.read_mfc_tagstatus = spu_hw_read_mfc_tagstatus,
 	.get_mfc_free_elements = spu_hw_get_mfc_free_elements,
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 427d00a4f6a..c7d010749a1 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -33,7 +33,7 @@
 #include <linux/slab.h>
 #include <linux/parser.h>
 
-#include <asm/io.h>
+#include <asm/prom.h>
 #include <asm/semaphore.h>
 #include <asm/spu.h>
 #include <asm/uaccess.h>
@@ -41,6 +41,7 @@
 #include "spufs.h"
 
 static kmem_cache_t *spufs_inode_cache;
+char *isolated_loader;
 
 static struct inode *
 spufs_alloc_inode(struct super_block *sb)
@@ -231,6 +232,7 @@ struct file_operations spufs_context_fops = {
 	.readdir	= dcache_readdir,
 	.fsync		= simple_sync_file,
 };
+EXPORT_SYMBOL_GPL(spufs_context_fops);
 
 static int
 spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
@@ -255,10 +257,14 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
 		goto out_iput;
 
 	ctx->flags = flags;
-
 	inode->i_op = &spufs_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
-	ret = spufs_fill_dir(dentry, spufs_dir_contents, mode, ctx);
+	if (flags & SPU_CREATE_NOSCHED)
+		ret = spufs_fill_dir(dentry, spufs_dir_nosched_contents,
+					 mode, ctx);
+	else
+		ret = spufs_fill_dir(dentry, spufs_dir_contents, mode, ctx);
+
 	if (ret)
 		goto out_free_ctx;
 
@@ -307,6 +313,20 @@ static int spufs_create_context(struct inode *inode,
 {
 	int ret;
 
+	ret = -EPERM;
+	if ((flags & SPU_CREATE_NOSCHED) &&
+	    !capable(CAP_SYS_NICE))
+		goto out_unlock;
+
+	ret = -EINVAL;
+	if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE))
+	    == SPU_CREATE_ISOLATE)
+		goto out_unlock;
+
+	ret = -ENODEV;
+	if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
+		goto out_unlock;
+
 	ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO);
 	if (ret)
 		goto out_unlock;
@@ -540,6 +560,30 @@ spufs_parse_options(char *options, struct inode *root)
 	return 1;
 }
 
+static void
+spufs_init_isolated_loader(void)
+{
+	struct device_node *dn;
+	const char *loader;
+	int size;
+
+	dn = of_find_node_by_path("/spu-isolation");
+	if (!dn)
+		return;
+
+	loader = get_property(dn, "loader", &size);
+	if (!loader)
+		return;
+
+	/* kmalloc should align on a 16 byte boundary..* */
+	isolated_loader = kmalloc(size, GFP_KERNEL);
+	if (!isolated_loader)
+		return;
+
+	memcpy(isolated_loader, loader, size);
+	printk(KERN_INFO "spufs: SPU isolation mode enabled\n");
+}
+
 static int
 spufs_create_root(struct super_block *sb, void *data)
 {
@@ -608,6 +652,7 @@ static struct file_system_type spufs_type = {
 static int __init spufs_init(void)
 {
 	int ret;
+
 	ret = -ENOMEM;
 	spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
 			sizeof(struct spufs_inode_info), 0,
@@ -625,6 +670,12 @@ static int __init spufs_init(void)
 	ret = register_spu_syscalls(&spufs_calls);
 	if (ret)
 		goto out_fs;
+	ret = register_arch_coredump_calls(&spufs_coredump_calls);
+	if (ret)
+		goto out_fs;
+
+	spufs_init_isolated_loader();
+
 	return 0;
 out_fs:
 	unregister_filesystem(&spufs_type);
@@ -638,6 +689,7 @@ module_init(spufs_init);
 static void __exit spufs_exit(void)
 {
 	spu_sched_exit();
+	unregister_arch_coredump_calls(&spufs_coredump_calls);
 	unregister_spu_syscalls(&spufs_calls);
 	unregister_filesystem(&spufs_type);
 	kmem_cache_destroy(spufs_inode_cache);
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 63df8cf4ba1..1acc2ffef8c 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -1,7 +1,11 @@
+#define DEBUG
+
 #include <linux/wait.h>
 #include <linux/ptrace.h>
 
 #include <asm/spu.h>
+#include <asm/spu_priv1.h>
+#include <asm/io.h>
 #include <asm/unistd.h>
 
 #include "spufs.h"
@@ -24,6 +28,7 @@ void spufs_dma_callback(struct spu *spu, int type)
 	} else {
 		switch (type) {
 		case SPE_EVENT_DMA_ALIGNMENT:
+		case SPE_EVENT_SPE_DATA_STORAGE:
 		case SPE_EVENT_INVALID_DMA:
 			force_sig(SIGBUS, /* info, */ current);
 			break;
@@ -48,15 +53,122 @@ static inline int spu_stopped(struct spu_context *ctx, u32 * stat)
 	return (!(*stat & 0x1) || pte_fault || spu->class_0_pending) ? 1 : 0;
 }
 
+static int spu_setup_isolated(struct spu_context *ctx)
+{
+	int ret;
+	u64 __iomem *mfc_cntl;
+	u64 sr1;
+	u32 status;
+	unsigned long timeout;
+	const u32 status_loading = SPU_STATUS_RUNNING
+		| SPU_STATUS_ISOLATED_STATE | SPU_STATUS_ISOLATED_LOAD_STATUS;
+
+	if (!isolated_loader)
+		return -ENODEV;
+
+	ret = spu_acquire_exclusive(ctx);
+	if (ret)
+		goto out;
+
+	mfc_cntl = &ctx->spu->priv2->mfc_control_RW;
+
+	/* purge the MFC DMA queue to ensure no spurious accesses before we
+	 * enter kernel mode */
+	timeout = jiffies + HZ;
+	out_be64(mfc_cntl, MFC_CNTL_PURGE_DMA_REQUEST);
+	while ((in_be64(mfc_cntl) & MFC_CNTL_PURGE_DMA_STATUS_MASK)
+			!= MFC_CNTL_PURGE_DMA_COMPLETE) {
+		if (time_after(jiffies, timeout)) {
+			printk(KERN_ERR "%s: timeout flushing MFC DMA queue\n",
+					__FUNCTION__);
+			ret = -EIO;
+			goto out_unlock;
+		}
+		cond_resched();
+	}
+
+	/* put the SPE in kernel mode to allow access to the loader */
+	sr1 = spu_mfc_sr1_get(ctx->spu);
+	sr1 &= ~MFC_STATE1_PROBLEM_STATE_MASK;
+	spu_mfc_sr1_set(ctx->spu, sr1);
+
+	/* start the loader */
+	ctx->ops->signal1_write(ctx, (unsigned long)isolated_loader >> 32);
+	ctx->ops->signal2_write(ctx,
+			(unsigned long)isolated_loader & 0xffffffff);
+
+	ctx->ops->runcntl_write(ctx,
+			SPU_RUNCNTL_RUNNABLE | SPU_RUNCNTL_ISOLATE);
+
+	ret = 0;
+	timeout = jiffies + HZ;
+	while (((status = ctx->ops->status_read(ctx)) & status_loading) ==
+				status_loading) {
+		if (time_after(jiffies, timeout)) {
+			printk(KERN_ERR "%s: timeout waiting for loader\n",
+					__FUNCTION__);
+			ret = -EIO;
+			goto out_drop_priv;
+		}
+		cond_resched();
+	}
+
+	if (!(status & SPU_STATUS_RUNNING)) {
+		/* If isolated LOAD has failed: run SPU, we will get a stop-and
+		 * signal later. */
+		pr_debug("%s: isolated LOAD failed\n", __FUNCTION__);
+		ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
+		ret = -EACCES;
+
+	} else if (!(status & SPU_STATUS_ISOLATED_STATE)) {
+		/* This isn't allowed by the CBEA, but check anyway */
+		pr_debug("%s: SPU fell out of isolated mode?\n", __FUNCTION__);
+		ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_STOP);
+		ret = -EINVAL;
+	}
+
+out_drop_priv:
+	/* Finished accessing the loader. Drop kernel mode */
+	sr1 |= MFC_STATE1_PROBLEM_STATE_MASK;
+	spu_mfc_sr1_set(ctx->spu, sr1);
+
+out_unlock:
+	spu_release_exclusive(ctx);
+out:
+	return ret;
+}
+
 static inline int spu_run_init(struct spu_context *ctx, u32 * npc)
 {
 	int ret;
+	unsigned long runcntl = SPU_RUNCNTL_RUNNABLE;
 
-	if ((ret = spu_acquire_runnable(ctx)) != 0)
+	ret = spu_acquire_runnable(ctx);
+	if (ret)
 		return ret;
-	ctx->ops->npc_write(ctx, *npc);
-	ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
-	return 0;
+
+	if (ctx->flags & SPU_CREATE_ISOLATE) {
+		if (!(ctx->ops->status_read(ctx) & SPU_STATUS_ISOLATED_STATE)) {
+			/* Need to release ctx, because spu_setup_isolated will
+			 * acquire it exclusively.
+			 */
+			spu_release(ctx);
+			ret = spu_setup_isolated(ctx);
+			if (!ret)
+				ret = spu_acquire_runnable(ctx);
+		}
+
+		/* if userspace has set the runcntrl register (eg, to issue an
+		 * isolated exit), we need to re-set it here */
+		runcntl = ctx->ops->runcntl_read(ctx) &
+			(SPU_RUNCNTL_RUNNABLE | SPU_RUNCNTL_ISOLATE);
+		if (runcntl == 0)
+			runcntl = SPU_RUNCNTL_RUNNABLE;
+	} else
+		ctx->ops->npc_write(ctx, *npc);
+
+	ctx->ops->runcntl_write(ctx, runcntl);
+	return ret;
 }
 
 static inline int spu_run_fini(struct spu_context *ctx, u32 * npc,
@@ -70,13 +182,7 @@ static inline int spu_run_fini(struct spu_context *ctx, u32 * npc,
 
 	if (signal_pending(current))
 		ret = -ERESTARTSYS;
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if ((*status & SPU_STATUS_STOPPED_BY_STOP)
-		    && (*status >> SPU_STOP_STATUS_SHIFT) == 0x3fff) {
-			force_sig(SIGTRAP, current);
-			ret = -ERESTARTSYS;
-		}
-	}
+
 	return ret;
 }
 
@@ -204,6 +310,7 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 	if (down_interruptible(&ctx->run_sema))
 		return -ERESTARTSYS;
 
+	ctx->ops->master_start(ctx);
 	ctx->event_return = 0;
 	ret = spu_run_init(ctx, npc);
 	if (ret)
@@ -223,7 +330,7 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 		if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
 			ret = spu_reacquire_runnable(ctx, npc, &status);
 			if (ret)
-				goto out;
+				goto out2;
 			continue;
 		}
 		ret = spu_process_events(ctx);
@@ -231,12 +338,24 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 	} while (!ret && !(status & (SPU_STATUS_STOPPED_BY_STOP |
 				      SPU_STATUS_STOPPED_BY_HALT)));
 
-	ctx->ops->runcntl_stop(ctx);
+	ctx->ops->master_stop(ctx);
 	ret = spu_run_fini(ctx, npc, &status);
-	if (!ret)
-		ret = status;
 	spu_yield(ctx);
 
+out2:
+	if ((ret == 0) ||
+	    ((ret == -ERESTARTSYS) &&
+	     ((status & SPU_STATUS_STOPPED_BY_HALT) ||
+	      ((status & SPU_STATUS_STOPPED_BY_STOP) &&
+	       (status >> SPU_STOP_STATUS_SHIFT != 0x2104)))))
+		ret = status;
+
+	if ((status & SPU_STATUS_STOPPED_BY_STOP)
+	    && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff) {
+		force_sig(SIGTRAP, current);
+		ret = -ERESTARTSYS;
+	}
+
 out:
 	*event = ctx->event_return;
 	up(&ctx->run_sema);
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index a0f55ca2d48..70fb13395c0 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -29,6 +29,7 @@
 
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
+#include <asm/spu_info.h>
 
 /* The magic number for our file system */
 enum {
@@ -114,13 +115,19 @@ struct spu_context_ops {
 	void (*npc_write) (struct spu_context * ctx, u32 data);
 	 u32(*status_read) (struct spu_context * ctx);
 	char*(*get_ls) (struct spu_context * ctx);
+	 u32 (*runcntl_read) (struct spu_context * ctx);
 	void (*runcntl_write) (struct spu_context * ctx, u32 data);
-	void (*runcntl_stop) (struct spu_context * ctx);
+	void (*master_start) (struct spu_context * ctx);
+	void (*master_stop) (struct spu_context * ctx);
 	int (*set_mfc_query)(struct spu_context * ctx, u32 mask, u32 mode);
 	u32 (*read_mfc_tagstatus)(struct spu_context * ctx);
 	u32 (*get_mfc_free_elements)(struct spu_context *ctx);
-	int (*send_mfc_command)(struct spu_context *ctx,
-					struct mfc_dma_command *cmd);
+	int (*send_mfc_command)(struct spu_context * ctx,
+				struct mfc_dma_command * cmd);
+	void (*dma_info_read) (struct spu_context * ctx,
+			       struct spu_dma_info * info);
+	void (*proxydma_info_read) (struct spu_context * ctx,
+				    struct spu_proxydma_info * info);
 };
 
 extern struct spu_context_ops spu_hw_ops;
@@ -135,6 +142,7 @@ struct spufs_inode_info {
 	container_of(inode, struct spufs_inode_info, vfs_inode)
 
 extern struct tree_descr spufs_dir_contents[];
+extern struct tree_descr spufs_dir_nosched_contents[];
 
 /* system call implementation */
 long spufs_run_spu(struct file *file,
@@ -162,6 +170,12 @@ void spu_acquire(struct spu_context *ctx);
 void spu_release(struct spu_context *ctx);
 int spu_acquire_runnable(struct spu_context *ctx);
 void spu_acquire_saved(struct spu_context *ctx);
+int spu_acquire_exclusive(struct spu_context *ctx);
+
+static inline void spu_release_exclusive(struct spu_context *ctx)
+{
+	up_write(&ctx->state_sema);
+}
 
 int spu_activate(struct spu_context *ctx, u64 flags);
 void spu_deactivate(struct spu_context *ctx);
@@ -169,6 +183,8 @@ void spu_yield(struct spu_context *ctx);
 int __init spu_sched_init(void);
 void __exit spu_sched_exit(void);
 
+extern char *isolated_loader;
+
 /*
  * spufs_wait
  * 	Same as wait_event_interruptible(), except that here
@@ -207,4 +223,15 @@ void spufs_stop_callback(struct spu *spu);
 void spufs_mfc_callback(struct spu *spu);
 void spufs_dma_callback(struct spu *spu, int type);
 
+extern struct spu_coredump_calls spufs_coredump_calls;
+struct spufs_coredump_reader {
+	char *name;
+	ssize_t (*read)(struct spu_context *ctx,
+			char __user *buffer, size_t size, loff_t *pos);
+	u64 (*get)(void *data);
+	size_t size;
+};
+extern struct spufs_coredump_reader spufs_coredump_read[];
+extern int spufs_coredump_num_notes;
+
 #endif
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 0f782ca662b..c08981ff7fc 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -102,7 +102,7 @@ static inline int check_spu_isolate(struct spu_state *csa, struct spu *spu)
 	 *     saved at this time.
 	 */
 	isolate_state = SPU_STATUS_ISOLATED_STATE |
-	    SPU_STATUS_ISOLATED_LOAD_STAUTUS | SPU_STATUS_ISOLATED_EXIT_STAUTUS;
+	    SPU_STATUS_ISOLATED_LOAD_STATUS | SPU_STATUS_ISOLATED_EXIT_STATUS;
 	return (in_be32(&prob->spu_status_R) & isolate_state) ? 1 : 0;
 }
 
@@ -1046,12 +1046,12 @@ static inline int suspend_spe(struct spu_state *csa, struct spu *spu)
 	 */
 	if (in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING) {
 		if (in_be32(&prob->spu_status_R) &
-		    SPU_STATUS_ISOLATED_EXIT_STAUTUS) {
+		    SPU_STATUS_ISOLATED_EXIT_STATUS) {
 			POLL_WHILE_TRUE(in_be32(&prob->spu_status_R) &
 					SPU_STATUS_RUNNING);
 		}
 		if ((in_be32(&prob->spu_status_R) &
-		     SPU_STATUS_ISOLATED_LOAD_STAUTUS)
+		     SPU_STATUS_ISOLATED_LOAD_STATUS)
 		    || (in_be32(&prob->spu_status_R) &
 			SPU_STATUS_ISOLATED_STATE)) {
 			out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_STOP);
@@ -1085,7 +1085,7 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu)
 	 */
 	if (!(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING)) {
 		if (in_be32(&prob->spu_status_R) &
-		    SPU_STATUS_ISOLATED_EXIT_STAUTUS) {
+		    SPU_STATUS_ISOLATED_EXIT_STATUS) {
 			spu_mfc_sr1_set(spu,
 					MFC_STATE1_MASTER_RUN_CONTROL_MASK);
 			eieio();
@@ -1095,7 +1095,7 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu)
 					SPU_STATUS_RUNNING);
 		}
 		if ((in_be32(&prob->spu_status_R) &
-		     SPU_STATUS_ISOLATED_LOAD_STAUTUS)
+		     SPU_STATUS_ISOLATED_LOAD_STATUS)
 		    || (in_be32(&prob->spu_status_R) &
 			SPU_STATUS_ISOLATED_STATE)) {
 			spu_mfc_sr1_set(spu,
@@ -1916,6 +1916,51 @@ static void save_lscsa(struct spu_state *prev, struct spu *spu)
 	wait_spu_stopped(prev, spu);	/* Step 57. */
 }
 
+static void force_spu_isolate_exit(struct spu *spu)
+{
+	struct spu_problem __iomem *prob = spu->problem;
+	struct spu_priv2 __iomem *priv2 = spu->priv2;
+
+	/* Stop SPE execution and wait for completion. */
+	out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_STOP);
+	iobarrier_rw();
+	POLL_WHILE_TRUE(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING);
+
+	/* Restart SPE master runcntl. */
+	spu_mfc_sr1_set(spu, MFC_STATE1_MASTER_RUN_CONTROL_MASK);
+	iobarrier_w();
+
+	/* Initiate isolate exit request and wait for completion. */
+	out_be64(&priv2->spu_privcntl_RW, 4LL);
+	iobarrier_w();
+	out_be32(&prob->spu_runcntl_RW, 2);
+	iobarrier_rw();
+	POLL_WHILE_FALSE((in_be32(&prob->spu_status_R)
+				& SPU_STATUS_STOPPED_BY_STOP));
+
+	/* Reset load request to normal. */
+	out_be64(&priv2->spu_privcntl_RW, SPU_PRIVCNT_LOAD_REQUEST_NORMAL);
+	iobarrier_w();
+}
+
+/**
+ * stop_spu_isolate
+ *	Check SPU run-control state and force isolated
+ *	exit function as necessary.
+ */
+static void stop_spu_isolate(struct spu *spu)
+{
+	struct spu_problem __iomem *prob = spu->problem;
+
+	if (in_be32(&prob->spu_status_R) & SPU_STATUS_ISOLATED_STATE) {
+		/* The SPU is in isolated state; the only way
+		 * to get it out is to perform an isolated
+		 * exit (clean) operation.
+		 */
+		force_spu_isolate_exit(spu);
+	}
+}
+
 static void harvest(struct spu_state *prev, struct spu *spu)
 {
 	/*
@@ -1928,6 +1973,7 @@ static void harvest(struct spu_state *prev, struct spu *spu)
 	inhibit_user_access(prev, spu);	        /* Step 3.  */
 	terminate_spu_app(prev, spu);	        /* Step 4.  */
 	set_switch_pending(prev, spu);	        /* Step 5.  */
+	stop_spu_isolate(spu);			/* NEW.     */
 	remove_other_spu_access(prev, spu);	/* Step 6.  */
 	suspend_mfc(prev, spu);	                /* Step 7.  */
 	wait_suspend_mfc_complete(prev, spu);	/* Step 8.  */
@@ -2096,11 +2142,11 @@ int spu_save(struct spu_state *prev, struct spu *spu)
 	acquire_spu_lock(spu);	        /* Step 1.     */
 	rc = __do_spu_save(prev, spu);	/* Steps 2-53. */
 	release_spu_lock(spu);
-	if (rc) {
+	if (rc != 0 && rc != 2 && rc != 6) {
 		panic("%s failed on SPU[%d], rc=%d.\n",
 		      __func__, spu->number, rc);
 	}
-	return rc;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(spu_save);
 
@@ -2165,9 +2211,6 @@ static void init_priv1(struct spu_state *csa)
 	    MFC_STATE1_PROBLEM_STATE_MASK |
 	    MFC_STATE1_RELOCATE_MASK | MFC_STATE1_BUS_TLBIE_MASK;
 
-	/* Set storage description.  */
-	csa->priv1.mfc_sdr_RW = mfspr(SPRN_SDR1);
-
 	/* Enable OS-specific set of interrupts. */
 	csa->priv1.int_mask_class0_RW = CLASS0_ENABLE_DMA_ALIGNMENT_INTR |
 	    CLASS0_ENABLE_INVALID_DMA_COMMAND_INTR |
author	David Howells <dhowells@redhat.com>	2006-12-05 17:01:28 +0000
committer	David Howells <dhowells@warthog.cambridge.redhat.com>	2006-12-05 17:01:28 +0000
commit	9db73724453a9350e1c22dbe732d427e2939a5c9 (patch)
tree	15e3ead6413ae97398a54292acc199bee0864d42 /arch/powerpc/platforms/cell
parent	4c1ac1b49122b805adfa4efc620592f68dccf5db (diff)
parent	e62438630ca37539c8cc1553710bbfaa3cf960a7 (diff)