Diffstat (limited to 'arch/powerpc/platforms/cell/spufs')
-rw-r--r--  arch/powerpc/platforms/cell/spufs/.gitignore                  |    2
-rw-r--r--  arch/powerpc/platforms/cell/spufs/Makefile                    |   14
-rw-r--r--  arch/powerpc/platforms/cell/spufs/backing_ops.c               |   52
-rw-r--r--  arch/powerpc/platforms/cell/spufs/context.c                   |  143
-rw-r--r--  arch/powerpc/platforms/cell/spufs/coredump.c                  |  233
-rw-r--r--  arch/powerpc/platforms/cell/spufs/fault.c                     |  191
-rw-r--r--  arch/powerpc/platforms/cell/spufs/file.c                      | 1666
-rw-r--r--  arch/powerpc/platforms/cell/spufs/gang.c                      |    6
-rw-r--r--  arch/powerpc/platforms/cell/spufs/hw_ops.c                    |   44
-rw-r--r--  arch/powerpc/platforms/cell/spufs/inode.c                     |  560
-rw-r--r--  arch/powerpc/platforms/cell/spufs/lscsa_alloc.c               |  183
-rw-r--r--  arch/powerpc/platforms/cell/spufs/run.c                       |  324
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c                     | 1107
-rw-r--r--  arch/powerpc/platforms/cell/spufs/spu_restore.c               |   10
-rw-r--r--  arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped  |  480
-rw-r--r--  arch/powerpc/platforms/cell/spufs/spu_save.c                  |    2
-rw-r--r--  arch/powerpc/platforms/cell/spufs/spufs.h                     |  197
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sputrace.h                  |   39
-rw-r--r--  arch/powerpc/platforms/cell/spufs/switch.c                    |  344
-rw-r--r--  arch/powerpc/platforms/cell/spufs/syscalls.c                  |   56
20 files changed, 3941 insertions(+), 1712 deletions(-)
diff --git a/arch/powerpc/platforms/cell/spufs/.gitignore b/arch/powerpc/platforms/cell/spufs/.gitignore
new file mode 100644
index 00000000000..a09ee8d84d6
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/.gitignore
@@ -0,0 +1,2 @@
+spu_save_dump.h
+spu_restore_dump.h
diff --git a/arch/powerpc/platforms/cell/spufs/Makefile b/arch/powerpc/platforms/cell/spufs/Makefile
index 472217d19fa..52a7d2596d3 100644
--- a/arch/powerpc/platforms/cell/spufs/Makefile
+++ b/arch/powerpc/platforms/cell/spufs/Makefile
@@ -1,8 +1,12 @@
-obj-y += switch.o
obj-$(CONFIG_SPU_FS) += spufs.o
-spufs-y += inode.o file.o context.o syscalls.o coredump.o
+spufs-y += inode.o file.o context.o syscalls.o
spufs-y += sched.o backing_ops.o hw_ops.o run.o gang.o
+spufs-y += switch.o fault.o lscsa_alloc.o
+spufs-$(CONFIG_COREDUMP) += coredump.o
+
+# magic for the trace events
+CFLAGS_sched.o := -I$(src)
# Rules to build switch.o with the help of SPU tool chain
SPU_CROSS := spu-
@@ -10,10 +14,8 @@ SPU_CC := $(SPU_CROSS)gcc
SPU_AS := $(SPU_CROSS)gcc
SPU_LD := $(SPU_CROSS)ld
SPU_OBJCOPY := $(SPU_CROSS)objcopy
-SPU_CFLAGS := -O2 -Wall -I$(srctree)/include \
- -I$(objtree)/include2 -D__KERNEL__
-SPU_AFLAGS := -c -D__ASSEMBLY__ -I$(srctree)/include \
- -I$(objtree)/include2 -D__KERNEL__
+SPU_CFLAGS := -O2 -Wall -I$(srctree)/include -D__KERNEL__
+SPU_AFLAGS := -c -D__ASSEMBLY__ -I$(srctree)/include -D__KERNEL__
SPU_LDFLAGS := -N -Ttext=0x0
$(obj)/switch.o: $(obj)/spu_save_dump.h $(obj)/spu_restore_dump.h
diff --git a/arch/powerpc/platforms/cell/spufs/backing_ops.c b/arch/powerpc/platforms/cell/spufs/backing_ops.c
index 1898f0d3a8b..6e8a9ef8590 100644
--- a/arch/powerpc/platforms/cell/spufs/backing_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/backing_ops.c
@@ -21,14 +21,12 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/poll.h>
@@ -107,16 +105,20 @@ static unsigned int spu_backing_mbox_stat_poll(struct spu_context *ctx,
if (stat & 0xff0000)
ret |= POLLIN | POLLRDNORM;
else {
- ctx->csa.priv1.int_stat_class0_RW &= ~0x1;
- ctx->csa.priv1.int_mask_class2_RW |= 0x1;
+ ctx->csa.priv1.int_stat_class2_RW &=
+ ~CLASS2_MAILBOX_INTR;
+ ctx->csa.priv1.int_mask_class2_RW |=
+ CLASS2_ENABLE_MAILBOX_INTR;
}
}
if (events & (POLLOUT | POLLWRNORM)) {
if (stat & 0x00ff00)
ret = POLLOUT | POLLWRNORM;
else {
- ctx->csa.priv1.int_stat_class0_RW &= ~0x10;
- ctx->csa.priv1.int_mask_class2_RW |= 0x10;
+ ctx->csa.priv1.int_stat_class2_RW &=
+ ~CLASS2_MAILBOX_THRESHOLD_INTR;
+ ctx->csa.priv1.int_mask_class2_RW |=
+ CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR;
}
}
spin_unlock_irq(&ctx->csa.register_lock);
@@ -140,7 +142,7 @@ static int spu_backing_ibox_read(struct spu_context *ctx, u32 * data)
ret = 4;
} else {
/* make sure we get woken up by the interrupt */
- ctx->csa.priv1.int_mask_class2_RW |= 0x1UL;
+ ctx->csa.priv1.int_mask_class2_RW |= CLASS2_ENABLE_MAILBOX_INTR;
ret = 0;
}
spin_unlock(&ctx->csa.register_lock);
@@ -163,13 +165,15 @@ static int spu_backing_wbox_write(struct spu_context *ctx, u32 data)
BUG_ON(avail != (4 - slot));
ctx->csa.spu_mailbox_data[slot] = data;
ctx->csa.spu_chnlcnt_RW[29] = ++slot;
- ctx->csa.prob.mb_stat_R = (((4 - slot) & 0xff) << 8);
+ ctx->csa.prob.mb_stat_R &= ~(0x00ff00);
+ ctx->csa.prob.mb_stat_R |= (((4 - slot) & 0xff) << 8);
gen_spu_event(ctx, MFC_SPU_MAILBOX_WRITTEN_EVENT);
ret = 4;
} else {
/* make sure we get woken up by the interrupt when space
becomes available */
- ctx->csa.priv1.int_mask_class2_RW |= 0x10;
+ ctx->csa.priv1.int_mask_class2_RW |=
+ CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR;
ret = 0;
}
spin_unlock(&ctx->csa.register_lock);
@@ -268,6 +272,11 @@ static char *spu_backing_get_ls(struct spu_context *ctx)
return ctx->csa.lscsa->ls;
}
+static void spu_backing_privcntl_write(struct spu_context *ctx, u64 val)
+{
+ ctx->csa.priv2.spu_privcntl_RW = val;
+}
+
static u32 spu_backing_runcntl_read(struct spu_context *ctx)
{
return ctx->csa.prob.spu_runcntl_RW;
@@ -278,6 +287,12 @@ static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
spin_lock(&ctx->csa.register_lock);
ctx->csa.prob.spu_runcntl_RW = val;
if (val & SPU_RUNCNTL_RUNNABLE) {
+ ctx->csa.prob.spu_status_R &=
+ ~SPU_STATUS_STOPPED_BY_STOP &
+ ~SPU_STATUS_STOPPED_BY_HALT &
+ ~SPU_STATUS_SINGLE_STEP &
+ ~SPU_STATUS_INVALID_INSTR &
+ ~SPU_STATUS_INVALID_CH;
ctx->csa.prob.spu_status_R |= SPU_STATUS_RUNNING;
} else {
ctx->csa.prob.spu_status_R &= ~SPU_STATUS_RUNNING;
@@ -285,6 +300,11 @@ static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
spin_unlock(&ctx->csa.register_lock);
}
+static void spu_backing_runcntl_stop(struct spu_context *ctx)
+{
+ spu_backing_runcntl_write(ctx, SPU_RUNCNTL_STOP);
+}
+
static void spu_backing_master_start(struct spu_context *ctx)
{
struct spu_state *csa = &ctx->csa;
@@ -321,6 +341,12 @@ static int spu_backing_set_mfc_query(struct spu_context * ctx, u32 mask,
/* FIXME: what are the side-effects of this? */
prob->dma_querymask_RW = mask;
prob->dma_querytype_RW = mode;
+ /* In the current implementation, the SPU context is always
+ * acquired in runnable state when new bits are added to the
+ * mask (tagwait), so it's sufficient just to mask
+ * dma_tagstatus_R with the 'mask' parameter here.
+ */
+ ctx->csa.prob.dma_tagstatus_R &= mask;
out:
spin_unlock(&ctx->csa.register_lock);
@@ -350,6 +376,11 @@ static int spu_backing_send_mfc_command(struct spu_context *ctx,
return ret;
}
+static void spu_backing_restart_dma(struct spu_context *ctx)
+{
+ ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_RESTART_DMA_COMMAND;
+}
+
struct spu_context_ops spu_backing_ops = {
.mbox_read = spu_backing_mbox_read,
.mbox_stat_read = spu_backing_mbox_stat_read,
@@ -368,12 +399,15 @@ struct spu_context_ops spu_backing_ops = {
.npc_write = spu_backing_npc_write,
.status_read = spu_backing_status_read,
.get_ls = spu_backing_get_ls,
+ .privcntl_write = spu_backing_privcntl_write,
.runcntl_read = spu_backing_runcntl_read,
.runcntl_write = spu_backing_runcntl_write,
+ .runcntl_stop = spu_backing_runcntl_stop,
.master_start = spu_backing_master_start,
.master_stop = spu_backing_master_stop,
.set_mfc_query = spu_backing_set_mfc_query,
.read_mfc_tagstatus = spu_backing_read_mfc_tagstatus,
.get_mfc_free_elements = spu_backing_get_mfc_free_elements,
.send_mfc_command = spu_backing_send_mfc_command,
+ .restart_dma = spu_backing_restart_dma,
};
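The table above is one half of a vtable: a context is reached through ctx->ops whether it is saved to memory (spu_backing_ops) or loaded on hardware (spu_hw_ops), so callers never test the state themselves. A minimal sketch of such a call site, assuming the hypothetical helper read_runcntl() (not part of this patch):

static u32 read_runcntl(struct spu_context *ctx)
{
	/* resolves to spu_backing_ops or spu_hw_ops, depending on
	 * whether the context is currently saved or running */
	return ctx->ops->runcntl_read(ctx);
}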
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 04ad2e364e9..9c6790d17ed 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -23,40 +23,54 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <linux/sched.h>
#include <asm/spu.h>
#include <asm/spu_csa.h>
#include "spufs.h"
+#include "sputrace.h"
+
+
+atomic_t nr_spu_contexts = ATOMIC_INIT(0);
struct spu_context *alloc_spu_context(struct spu_gang *gang)
{
struct spu_context *ctx;
+ struct timespec ts;
+
ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
if (!ctx)
goto out;
/* Binding to physical processor deferred
* until spu_activate().
*/
- spu_init_csa(&ctx->csa);
- if (!ctx->csa.lscsa) {
+ if (spu_init_csa(&ctx->csa))
goto out_free;
- }
spin_lock_init(&ctx->mmio_lock);
+ mutex_init(&ctx->mapping_lock);
kref_init(&ctx->kref);
mutex_init(&ctx->state_mutex);
- init_MUTEX(&ctx->run_sema);
+ mutex_init(&ctx->run_mutex);
init_waitqueue_head(&ctx->ibox_wq);
init_waitqueue_head(&ctx->wbox_wq);
init_waitqueue_head(&ctx->stop_wq);
init_waitqueue_head(&ctx->mfc_wq);
+ init_waitqueue_head(&ctx->run_wq);
ctx->state = SPU_STATE_SAVED;
ctx->ops = &spu_backing_ops;
ctx->owner = get_task_mm(current);
+ INIT_LIST_HEAD(&ctx->rq);
+ INIT_LIST_HEAD(&ctx->aff_list);
if (gang)
spu_gang_add_ctx(gang, ctx);
- ctx->rt_priority = current->rt_priority;
- ctx->policy = current->policy;
- ctx->prio = current->prio;
- INIT_DELAYED_WORK(&ctx->sched_work, spu_sched_tick);
+
+ __spu_update_sched_info(ctx);
+ spu_set_timeslice(ctx);
+ ctx->stats.util_state = SPU_UTIL_IDLE_LOADED;
+ ktime_get_ts(&ts);
+ ctx->stats.tstamp = timespec_to_ns(&ts);
+
+ atomic_inc(&nr_spu_contexts);
goto out;
out_free:
kfree(ctx);
@@ -69,12 +83,18 @@ void destroy_spu_context(struct kref *kref)
{
struct spu_context *ctx;
ctx = container_of(kref, struct spu_context, kref);
+ spu_context_nospu_trace(destroy_spu_context__enter, ctx);
mutex_lock(&ctx->state_mutex);
spu_deactivate(ctx);
mutex_unlock(&ctx->state_mutex);
spu_fini_csa(&ctx->csa);
if (ctx->gang)
spu_gang_remove_ctx(ctx->gang, ctx);
+ if (ctx->prof_priv_kref)
+ kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
+ BUG_ON(!list_empty(&ctx->rq));
+ atomic_dec(&nr_spu_contexts);
+ kfree(ctx->switch_log);
kfree(ctx);
}
@@ -93,7 +113,16 @@ int put_spu_context(struct spu_context *ctx)
void spu_forget(struct spu_context *ctx)
{
struct mm_struct *mm;
- spu_acquire_saved(ctx);
+
+ /*
+ * This is basically an open-coded spu_acquire_saved, except that
+ * we don't acquire the state mutex interruptible, and we don't
+ * want this context to be rescheduled on release.
+ */
+ mutex_lock(&ctx->state_mutex);
+ if (ctx->state != SPU_STATE_SAVED)
+ spu_deactivate(ctx);
+
mm = ctx->owner;
ctx->owner = NULL;
mmput(mm);
@@ -102,100 +131,58 @@ void spu_forget(struct spu_context *ctx)
void spu_unmap_mappings(struct spu_context *ctx)
{
+ mutex_lock(&ctx->mapping_lock);
if (ctx->local_store)
unmap_mapping_range(ctx->local_store, 0, LS_SIZE, 1);
if (ctx->mfc)
- unmap_mapping_range(ctx->mfc, 0, 0x1000, 1);
+ unmap_mapping_range(ctx->mfc, 0, SPUFS_MFC_MAP_SIZE, 1);
if (ctx->cntl)
- unmap_mapping_range(ctx->cntl, 0, 0x1000, 1);
+ unmap_mapping_range(ctx->cntl, 0, SPUFS_CNTL_MAP_SIZE, 1);
if (ctx->signal1)
- unmap_mapping_range(ctx->signal1, 0, PAGE_SIZE, 1);
+ unmap_mapping_range(ctx->signal1, 0, SPUFS_SIGNAL_MAP_SIZE, 1);
if (ctx->signal2)
- unmap_mapping_range(ctx->signal2, 0, PAGE_SIZE, 1);
+ unmap_mapping_range(ctx->signal2, 0, SPUFS_SIGNAL_MAP_SIZE, 1);
if (ctx->mss)
- unmap_mapping_range(ctx->mss, 0, 0x1000, 1);
+ unmap_mapping_range(ctx->mss, 0, SPUFS_MSS_MAP_SIZE, 1);
if (ctx->psmap)
- unmap_mapping_range(ctx->psmap, 0, 0x20000, 1);
+ unmap_mapping_range(ctx->psmap, 0, SPUFS_PS_MAP_SIZE, 1);
+ mutex_unlock(&ctx->mapping_lock);
}
/**
- * spu_acquire_exclusive - lock spu contex and protect against userspace access
+ * spu_acquire_saved - lock spu context and make sure it is in saved state
* @ctx: spu context to lock
- *
- * Note:
- * Returns 0 and with the context locked on success
- * Returns negative error and with the context _unlocked_ on failure.
*/
-int spu_acquire_exclusive(struct spu_context *ctx)
+int spu_acquire_saved(struct spu_context *ctx)
{
- int ret = -EINVAL;
+ int ret;
- spu_acquire(ctx);
- /*
- * Context is about to be freed, so we can't acquire it anymore.
- */
- if (!ctx->owner)
- goto out_unlock;
-
- if (ctx->state == SPU_STATE_SAVED) {
- ret = spu_activate(ctx, 0);
- if (ret)
- goto out_unlock;
- } else {
- /*
- * We need to exclude userspace access to the context.
- *
- * To protect against memory access we invalidate all ptes
- * and make sure the pagefault handlers block on the mutex.
- */
- spu_unmap_mappings(ctx);
+ spu_context_nospu_trace(spu_acquire_saved__enter, ctx);
+
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
+
+ if (ctx->state != SPU_STATE_SAVED) {
+ set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
+ spu_deactivate(ctx);
}
return 0;
-
- out_unlock:
- spu_release(ctx);
- return ret;
}
/**
- * spu_acquire_runnable - lock spu contex and make sure it is in runnable state
- * @ctx: spu contex to lock
- *
- * Note:
- * Returns 0 and with the context locked on success
- * Returns negative error and with the context _unlocked_ on failure.
+ * spu_release_saved - unlock spu context and return it to the runqueue
+ * @ctx: context to unlock
*/
-int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags)
+void spu_release_saved(struct spu_context *ctx)
{
- int ret = -EINVAL;
-
- spu_acquire(ctx);
- if (ctx->state == SPU_STATE_SAVED) {
- /*
- * Context is about to be freed, so we can't acquire it anymore.
- */
- if (!ctx->owner)
- goto out_unlock;
- ret = spu_activate(ctx, flags);
- if (ret)
- goto out_unlock;
- }
+ BUG_ON(ctx->state != SPU_STATE_SAVED);
- return 0;
+ if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags) &&
+ test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+ spu_activate(ctx, 0);
- out_unlock:
spu_release(ctx);
- return ret;
}
-/**
- * spu_acquire_saved - lock spu contex and make sure it is in saved state
- * @ctx: spu contex to lock
- */
-void spu_acquire_saved(struct spu_context *ctx)
-{
- spu_acquire(ctx);
- if (ctx->state != SPU_STATE_SAVED)
- spu_deactivate(ctx);
-}
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 725e1956115..be6212ddbf0 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -22,25 +22,19 @@
#include <linux/elf.h>
#include <linux/file.h>
+#include <linux/fdtable.h>
#include <linux/fs.h>
+#include <linux/gfp.h>
#include <linux/list.h>
-#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/coredump.h>
+#include <linux/binfmts.h>
#include <asm/uaccess.h>
#include "spufs.h"
-struct spufs_ctx_info {
- struct list_head list;
- int dfd;
- int memsize; /* in bytes */
- struct spu_context *ctx;
-};
-
-static LIST_HEAD(ctx_info_list);
-
-static ssize_t do_coredump_read(int num, struct spu_context *ctx, void __user *buffer,
+static ssize_t do_coredump_read(int num, struct spu_context *ctx, void *buffer,
size_t size, loff_t *off)
{
u64 data;
@@ -50,49 +44,19 @@ static ssize_t do_coredump_read(int num, struct spu_context *ctx, void __user *b
return spufs_coredump_read[num].read(ctx, buffer, size, off);
data = spufs_coredump_read[num].get(ctx);
- ret = copy_to_user(buffer, &data, 8);
- return ret ? -EFAULT : 8;
-}
-
-/*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int spufs_dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int spufs_dump_seek(struct file *file, loff_t off)
-{
- if (file->f_op->llseek) {
- if (file->f_op->llseek(file, off, 0) != off)
- return 0;
- } else
- file->f_pos = off;
- return 1;
-}
-
-static void spufs_fill_memsize(struct spufs_ctx_info *ctx_info)
-{
- struct spu_context *ctx;
- unsigned long long lslr;
-
- ctx = ctx_info->ctx;
- lslr = ctx->csa.priv2.spu_lslr_RW;
- ctx_info->memsize = lslr + 1;
+ ret = snprintf(buffer, size, "0x%.16llx", data);
+ if (ret >= size)
+ return size;
+ return ++ret; /* count trailing NULL */
}
-static int spufs_ctx_note_size(struct spufs_ctx_info *ctx_info)
+static int spufs_ctx_note_size(struct spu_context *ctx, int dfd)
{
- int dfd, memsize, i, sz, total = 0;
+ int i, sz, total = 0;
char *name;
char fullname[80];
- dfd = ctx_info->dfd;
- memsize = ctx_info->memsize;
-
- for (i = 0; spufs_coredump_read[i].name; i++) {
+ for (i = 0; spufs_coredump_read[i].name != NULL; i++) {
name = spufs_coredump_read[i].name;
sz = spufs_coredump_read[i].size;
@@ -100,37 +64,21 @@ static int spufs_ctx_note_size(struct spufs_ctx_info *ctx_info)
total += sizeof(struct elf_note);
total += roundup(strlen(fullname) + 1, 4);
- if (!strcmp(name, "mem"))
- total += roundup(memsize, 4);
- else
- total += roundup(sz, 4);
+ total += roundup(sz, 4);
}
return total;
}
-static int spufs_add_one_context(struct file *file, int dfd)
+static int match_context(const void *v, struct file *file, unsigned fd)
{
struct spu_context *ctx;
- struct spufs_ctx_info *ctx_info;
- int size;
-
- ctx = SPUFS_I(file->f_dentry->d_inode)->i_ctx;
+ if (file->f_op != &spufs_context_fops)
+ return 0;
+ ctx = SPUFS_I(file_inode(file))->i_ctx;
if (ctx->flags & SPU_CREATE_NOSCHED)
return 0;
-
- ctx_info = kzalloc(sizeof(*ctx_info), GFP_KERNEL);
- if (unlikely(!ctx_info))
- return -ENOMEM;
-
- ctx_info->dfd = dfd;
- ctx_info->ctx = ctx;
-
- spufs_fill_memsize(ctx_info);
-
- size = spufs_ctx_note_size(ctx_info);
- list_add(&ctx_info->list, &ctx_info_list);
- return size;
+ return fd + 1;
}
/*
@@ -142,97 +90,122 @@ static int spufs_add_one_context(struct file *file, int dfd)
* internal functionality to dump them without needing to actually
* open the files.
*/
-static int spufs_arch_notes_size(void)
+/*
+ * descriptor table is not shared, so files can't change or go away.
+ */
+static struct spu_context *coredump_next_context(int *fd)
{
- struct fdtable *fdt = files_fdtable(current->files);
- int size = 0, fd;
-
- for (fd = 0; fd < fdt->max_fds; fd++) {
- if (FD_ISSET(fd, fdt->open_fds)) {
- struct file *file = fcheck(fd);
-
- if (file && file->f_op == &spufs_context_fops) {
- int rval = spufs_add_one_context(file, fd);
- if (rval < 0)
- break;
- size += rval;
- }
- }
+ struct file *file;
+ int n = iterate_fd(current->files, *fd, match_context, NULL);
+ if (!n)
+ return NULL;
+ *fd = n - 1;
+ file = fcheck(*fd);
+ return SPUFS_I(file_inode(file))->i_ctx;
+}
+
+int spufs_coredump_extra_notes_size(void)
+{
+ struct spu_context *ctx;
+ int size = 0, rc, fd;
+
+ fd = 0;
+ while ((ctx = coredump_next_context(&fd)) != NULL) {
+ rc = spu_acquire_saved(ctx);
+ if (rc)
+ break;
+ rc = spufs_ctx_note_size(ctx, fd);
+ spu_release_saved(ctx);
+ if (rc < 0)
+ break;
+
+ size += rc;
+
+ /* start searching the next fd next time */
+ fd++;
}
return size;
}
-static void spufs_arch_write_note(struct spufs_ctx_info *ctx_info, int i,
- struct file *file)
+static int spufs_arch_write_note(struct spu_context *ctx, int i,
+ struct coredump_params *cprm, int dfd)
{
- struct spu_context *ctx;
loff_t pos = 0;
- int sz, dfd, rc, total = 0;
- const int bufsz = 4096;
+ int sz, rc, total = 0;
+ const int bufsz = PAGE_SIZE;
char *name;
char fullname[80], *buf;
struct elf_note en;
- buf = kmalloc(bufsz, GFP_KERNEL);
+ buf = (void *)get_zeroed_page(GFP_KERNEL);
if (!buf)
- return;
+ return -ENOMEM;
- dfd = ctx_info->dfd;
name = spufs_coredump_read[i].name;
-
- if (!strcmp(name, "mem"))
- sz = ctx_info->memsize;
- else
- sz = spufs_coredump_read[i].size;
-
- ctx = ctx_info->ctx;
- if (!ctx) {
- return;
- }
+ sz = spufs_coredump_read[i].size;
sprintf(fullname, "SPU/%d/%s", dfd, name);
en.n_namesz = strlen(fullname) + 1;
en.n_descsz = sz;
en.n_type = NT_SPU;
- if (!spufs_dump_write(file, &en, sizeof(en)))
- return;
- if (!spufs_dump_write(file, fullname, en.n_namesz))
- return;
- if (!spufs_dump_seek(file, roundup((unsigned long)file->f_pos, 4)))
- return;
+ if (!dump_emit(cprm, &en, sizeof(en)))
+ goto Eio;
+
+ if (!dump_emit(cprm, fullname, en.n_namesz))
+ goto Eio;
+
+ if (!dump_align(cprm, 4))
+ goto Eio;
do {
rc = do_coredump_read(i, ctx, buf, bufsz, &pos);
if (rc > 0) {
- if (!spufs_dump_write(file, buf, rc))
- return;
+ if (!dump_emit(cprm, buf, rc))
+ goto Eio;
total += rc;
}
} while (rc == bufsz && total < sz);
- spufs_dump_seek(file, roundup((unsigned long)file->f_pos
- - total + sz, 4));
+ if (rc < 0)
+ goto out;
+
+ if (!dump_skip(cprm,
+ roundup(cprm->written - total + sz, 4) - cprm->written))
+ goto Eio;
+out:
+ free_page((unsigned long)buf);
+ return rc;
+Eio:
+ free_page((unsigned long)buf);
+ return -EIO;
}
-static void spufs_arch_write_notes(struct file *file)
+int spufs_coredump_extra_notes_write(struct coredump_params *cprm)
{
- int j;
- struct spufs_ctx_info *ctx_info, *next;
-
- list_for_each_entry_safe(ctx_info, next, &ctx_info_list, list) {
- spu_acquire_saved(ctx_info->ctx);
- for (j = 0; j < spufs_coredump_num_notes; j++)
- spufs_arch_write_note(ctx_info, j, file);
- spu_release(ctx_info->ctx);
- list_del(&ctx_info->list);
- kfree(ctx_info);
+ struct spu_context *ctx;
+ int fd, j, rc;
+
+ fd = 0;
+ while ((ctx = coredump_next_context(&fd)) != NULL) {
+ rc = spu_acquire_saved(ctx);
+ if (rc)
+ return rc;
+
+ for (j = 0; spufs_coredump_read[j].name != NULL; j++) {
+ rc = spufs_arch_write_note(ctx, j, cprm, fd);
+ if (rc) {
+ spu_release_saved(ctx);
+ return rc;
+ }
+ }
+
+ spu_release_saved(ctx);
+
+ /* start searching the next fd next time */
+ fd++;
}
-}
-struct spu_coredump_calls spufs_coredump_calls = {
- .arch_notes_size = spufs_arch_notes_size,
- .arch_write_notes = spufs_arch_write_notes,
- .owner = THIS_MODULE,
-};
+ return 0;
+}
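The note-size accounting in spufs_ctx_note_size() above is plain ELF note framing: a fixed 12-byte header, the NUL-terminated name padded to a 4-byte boundary, then the payload padded the same way. A standalone sketch of that arithmetic, where the "SPU/3/regs" name and the 2048-byte payload are assumed values for illustration only:

#include <stdio.h>
#include <string.h>

/* ELF note header: three 32-bit words (n_namesz, n_descsz, n_type) */
struct note_hdr { unsigned int n_namesz, n_descsz, n_type; };

#define ROUNDUP4(x) (((x) + 3UL) & ~3UL)

int main(void)
{
	const char *fullname = "SPU/3/regs";	/* hypothetical fd 3 */
	unsigned long payload = 2048;		/* assumed payload size */
	unsigned long total;

	total = sizeof(struct note_hdr)
		+ ROUNDUP4(strlen(fullname) + 1)	/* name + NUL, 4-aligned */
		+ ROUNDUP4(payload);			/* descriptor, 4-aligned */

	printf("one note costs %lu bytes\n", total);	/* 12 + 12 + 2048 = 2072 */
	return 0;
}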
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
new file mode 100644
index 00000000000..8cb6260cc80
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -0,0 +1,191 @@
+/*
+ * Low-level SPU handling
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+#include <asm/spu.h>
+#include <asm/spu_csa.h>
+
+#include "spufs.h"
+
+/**
+ * Handle an SPE event, depending on the context's SPU_CREATE_EVENTS_ENABLED flag.
+ *
+ * If the context was created with events, we just set the return event.
+ * Otherwise, send an appropriate signal to the process.
+ */
+static void spufs_handle_event(struct spu_context *ctx,
+ unsigned long ea, int type)
+{
+ siginfo_t info;
+
+ if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
+ ctx->event_return |= type;
+ wake_up_all(&ctx->stop_wq);
+ return;
+ }
+
+ memset(&info, 0, sizeof(info));
+
+ switch (type) {
+ case SPE_EVENT_INVALID_DMA:
+ info.si_signo = SIGBUS;
+ info.si_code = BUS_OBJERR;
+ break;
+ case SPE_EVENT_SPE_DATA_STORAGE:
+ info.si_signo = SIGSEGV;
+ info.si_addr = (void __user *)ea;
+ info.si_code = SEGV_ACCERR;
+ ctx->ops->restart_dma(ctx);
+ break;
+ case SPE_EVENT_DMA_ALIGNMENT:
+ info.si_signo = SIGBUS;
+ /* DAR isn't set for an alignment fault :( */
+ info.si_code = BUS_ADRALN;
+ break;
+ case SPE_EVENT_SPE_ERROR:
+ info.si_signo = SIGILL;
+ info.si_addr = (void __user *)(unsigned long)
+ ctx->ops->npc_read(ctx) - 4;
+ info.si_code = ILL_ILLOPC;
+ break;
+ }
+
+ if (info.si_signo)
+ force_sig_info(info.si_signo, &info, current);
+}
+
+int spufs_handle_class0(struct spu_context *ctx)
+{
+ unsigned long stat = ctx->csa.class_0_pending & CLASS0_INTR_MASK;
+
+ if (likely(!stat))
+ return 0;
+
+ if (stat & CLASS0_DMA_ALIGNMENT_INTR)
+ spufs_handle_event(ctx, ctx->csa.class_0_dar,
+ SPE_EVENT_DMA_ALIGNMENT);
+
+ if (stat & CLASS0_INVALID_DMA_COMMAND_INTR)
+ spufs_handle_event(ctx, ctx->csa.class_0_dar,
+ SPE_EVENT_INVALID_DMA);
+
+ if (stat & CLASS0_SPU_ERROR_INTR)
+ spufs_handle_event(ctx, ctx->csa.class_0_dar,
+ SPE_EVENT_SPE_ERROR);
+
+ ctx->csa.class_0_pending = 0;
+
+ return -EIO;
+}
+
+/*
+ * bottom half handler for page faults, we can't do this from
+ * interrupt context, since we might need to sleep.
+ * we also need to give up the mutex so we can get scheduled
+ * out while waiting for the backing store.
+ *
+ * TODO: try calling hash_page from the interrupt handler first
+ * in order to speed up the easy case.
+ */
+int spufs_handle_class1(struct spu_context *ctx)
+{
+ u64 ea, dsisr, access;
+ unsigned long flags;
+ unsigned flt = 0;
+ int ret;
+
+ /*
+ * dar and dsisr get passed from the registers
+ * to the spu_context, to this function, but not
+ * back to the spu if it gets scheduled again.
+ *
+ * if we don't handle the fault for a saved context
+ * in time, we can still expect to get the same fault
+ * immediately after the context restore.
+ */
+ ea = ctx->csa.class_1_dar;
+ dsisr = ctx->csa.class_1_dsisr;
+
+ if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)))
+ return 0;
+
+ spuctx_switch_state(ctx, SPU_UTIL_IOWAIT);
+
+ pr_debug("ctx %p: ea %016llx, dsisr %016llx state %d\n", ctx, ea,
+ dsisr, ctx->state);
+
+ ctx->stats.hash_flt++;
+ if (ctx->state == SPU_STATE_RUNNABLE)
+ ctx->spu->stats.hash_flt++;
+
+ /* we must not hold the lock when entering spu_handle_mm_fault */
+ spu_release(ctx);
+
+ access = (_PAGE_PRESENT | _PAGE_USER);
+ access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL;
+ local_irq_save(flags);
+ ret = hash_page(ea, access, 0x300);
+ local_irq_restore(flags);
+
+ /* hashing failed, so try the actual fault handler */
+ if (ret)
+ ret = spu_handle_mm_fault(current->mm, ea, dsisr, &flt);
+
+ /*
+ * This is nasty: we need the state_mutex for all the bookkeeping even
+ * if the syscall was interrupted by a signal. ewww.
+ */
+ mutex_lock(&ctx->state_mutex);
+
+ /*
+ * Clear dsisr under ctxt lock after handling the fault, so that
+ * time slicing will not preempt the context while the page fault
+ * handler is running. Context switch code removes mappings.
+ */
+ ctx->csa.class_1_dar = ctx->csa.class_1_dsisr = 0;
+
+ /*
+ * If we handled the fault successfully and are in runnable
+ * state, restart the DMA.
+ * In case of unhandled error report the problem to user space.
+ */
+ if (!ret) {
+ if (flt & VM_FAULT_MAJOR)
+ ctx->stats.maj_flt++;
+ else
+ ctx->stats.min_flt++;
+ if (ctx->state == SPU_STATE_RUNNABLE) {
+ if (flt & VM_FAULT_MAJOR)
+ ctx->spu->stats.maj_flt++;
+ else
+ ctx->spu->stats.min_flt++;
+ }
+
+ if (ctx->spu)
+ ctx->ops->restart_dma(ctx);
+ } else
+ spufs_handle_event(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
+
+ spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+ return ret;
+}
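Both handlers above are meant to be driven from the spu_run path; note that spufs_handle_class1() drops and retakes the state mutex, so a caller cannot assume the context stayed loaded across the call. A simplified sketch of such a caller, assuming a spu_stopped() predicate like the one in run.c (run_until_stop() itself is invented for illustration):

static int run_until_stop(struct spu_context *ctx, u32 *status)
{
	int ret;

	do {
		/* sleep until the SPU stops or an event is pending */
		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, status));
		if (ret)
			break;

		/* class 0: fatal errors (invalid DMA, SPU error) */
		ret = spufs_handle_class0(ctx);
		if (ret)
			break;

		/* class 1: page faults; may sleep and deschedule us */
		ret = spufs_handle_class1(ctx);
		if (ret)
			break;
	} while (!(*status & SPU_STATUS_STOPPED_BY_STOP));

	return ret;
}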
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index b00653d69c0..90986923a53 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -24,29 +24,163 @@
#include <linux/fs.h>
#include <linux/ioctl.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/ptrace.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
#include <asm/io.h>
-#include <asm/semaphore.h>
+#include <asm/time.h>
#include <asm/spu.h>
#include <asm/spu_info.h>
#include <asm/uaccess.h>
#include "spufs.h"
+#include "sputrace.h"
#define SPUFS_MMAP_4K (PAGE_SIZE == 0x1000)
+/* Simple attribute files */
+struct spufs_attr {
+ int (*get)(void *, u64 *);
+ int (*set)(void *, u64);
+ char get_buf[24]; /* enough to store a u64 and "\n\0" */
+ char set_buf[24];
+ void *data;
+ const char *fmt; /* format for read operation */
+ struct mutex mutex; /* protects access to these buffers */
+};
+
+static int spufs_attr_open(struct inode *inode, struct file *file,
+ int (*get)(void *, u64 *), int (*set)(void *, u64),
+ const char *fmt)
+{
+ struct spufs_attr *attr;
+
+ attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ attr->get = get;
+ attr->set = set;
+ attr->data = inode->i_private;
+ attr->fmt = fmt;
+ mutex_init(&attr->mutex);
+ file->private_data = attr;
+
+ return nonseekable_open(inode, file);
+}
+
+static int spufs_attr_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t spufs_attr_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct spufs_attr *attr;
+ size_t size;
+ ssize_t ret;
+
+ attr = file->private_data;
+ if (!attr->get)
+ return -EACCES;
+
+ ret = mutex_lock_interruptible(&attr->mutex);
+ if (ret)
+ return ret;
+
+ if (*ppos) { /* continued read */
+ size = strlen(attr->get_buf);
+ } else { /* first read */
+ u64 val;
+ ret = attr->get(attr->data, &val);
+ if (ret)
+ goto out;
+
+ size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
+ attr->fmt, (unsigned long long)val);
+ }
+
+ ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
+out:
+ mutex_unlock(&attr->mutex);
+ return ret;
+}
+
+static ssize_t spufs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct spufs_attr *attr;
+ u64 val;
+ size_t size;
+ ssize_t ret;
+
+ attr = file->private_data;
+ if (!attr->set)
+ return -EACCES;
+
+ ret = mutex_lock_interruptible(&attr->mutex);
+ if (ret)
+ return ret;
+
+ ret = -EFAULT;
+ size = min(sizeof(attr->set_buf) - 1, len);
+ if (copy_from_user(attr->set_buf, buf, size))
+ goto out;
+
+ ret = len; /* claim we got the whole input */
+ attr->set_buf[size] = '\0';
+ val = simple_strtol(attr->set_buf, NULL, 0);
+ attr->set(attr->data, val);
+out:
+ mutex_unlock(&attr->mutex);
+ return ret;
+}
+
+#define DEFINE_SPUFS_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
+static int __fops ## _open(struct inode *inode, struct file *file) \
+{ \
+ __simple_attr_check_format(__fmt, 0ull); \
+ return spufs_attr_open(inode, file, __get, __set, __fmt); \
+} \
+static const struct file_operations __fops = { \
+ .open = __fops ## _open, \
+ .release = spufs_attr_release, \
+ .read = spufs_attr_read, \
+ .write = spufs_attr_write, \
+ .llseek = generic_file_llseek, \
+};
+
+
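DEFINE_SPUFS_SIMPLE_ATTRIBUTE is used the same way as the generic DEFINE_SIMPLE_ATTRIBUTE it replaces, but the open-coded read/write paths above take the buffer mutex interruptibly and allow the get callback to fail. A hypothetical attribute wired up with it, where the spufs_foo_* names are invented for illustration:

static int spufs_foo_get(void *data, u64 *val)
{
	struct spu_context *ctx = data;

	*val = ctx->flags;		/* illustrative payload only */
	return 0;
}

static int spufs_foo_set(void *data, u64 val)
{
	return -EACCES;			/* read-only example */
}

DEFINE_SPUFS_SIMPLE_ATTRIBUTE(spufs_foo_fops, spufs_foo_get,
			      spufs_foo_set, "0x%llx\n");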
static int
spufs_mem_open(struct inode *inode, struct file *file)
{
struct spufs_inode_info *i = SPUFS_I(inode);
struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
file->private_data = ctx;
- ctx->local_store = inode->i_mapping;
- smp_wmb();
+ if (!i->i_openers++)
+ ctx->local_store = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
+static int
+spufs_mem_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->local_store = NULL;
+ mutex_unlock(&ctx->mapping_lock);
return 0;
}
@@ -63,130 +197,241 @@ static ssize_t
spufs_mem_read(struct file *file, char __user *buffer,
size_t size, loff_t *pos)
{
- int ret;
struct spu_context *ctx = file->private_data;
+ ssize_t ret;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ret = __spufs_mem_read(ctx, buffer, size, pos);
spu_release(ctx);
+
return ret;
}
static ssize_t
spufs_mem_write(struct file *file, const char __user *buffer,
- size_t size, loff_t *pos)
+ size_t size, loff_t *ppos)
{
struct spu_context *ctx = file->private_data;
char *local_store;
+ loff_t pos = *ppos;
int ret;
- size = min_t(ssize_t, LS_SIZE - *pos, size);
- if (size <= 0)
+ if (pos > LS_SIZE)
return -EFBIG;
- *pos += size;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
local_store = ctx->ops->get_ls(ctx);
- ret = copy_from_user(local_store + *pos - size,
- buffer, size) ? -EFAULT : size;
-
+ size = simple_write_to_buffer(local_store, LS_SIZE, ppos, buffer, size);
spu_release(ctx);
- return ret;
+
+ return size;
}
-static unsigned long spufs_mem_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int
+spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct spu_context *ctx = vma->vm_file->private_data;
- unsigned long pfn, offset = address - vma->vm_start;
+ struct spu_context *ctx = vma->vm_file->private_data;
+ unsigned long address = (unsigned long)vmf->virtual_address;
+ unsigned long pfn, offset;
+
+#ifdef CONFIG_SPU_FS_64K_LS
+ struct spu_state *csa = &ctx->csa;
+ int psize;
+
+ /* Check what page size we are using */
+ psize = get_slice_psize(vma->vm_mm, address);
- offset += vma->vm_pgoff << PAGE_SHIFT;
+ /* Some sanity checking */
+ BUG_ON(csa->use_big_pages != (psize == MMU_PAGE_64K));
+ /* Wow, 64K, cool, we need to align the address though */
+ if (csa->use_big_pages) {
+ BUG_ON(vma->vm_start & 0xffff);
+ address &= ~0xfffful;
+ }
+#endif /* CONFIG_SPU_FS_64K_LS */
+
+ offset = vmf->pgoff << PAGE_SHIFT;
if (offset >= LS_SIZE)
- return NOPFN_SIGBUS;
+ return VM_FAULT_SIGBUS;
- spu_acquire(ctx);
+ pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n",
+ address, offset);
+
+ if (spu_acquire(ctx))
+ return VM_FAULT_NOPAGE;
if (ctx->state == SPU_STATE_SAVED) {
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- & ~_PAGE_NO_CACHE);
+ vma->vm_page_prot = pgprot_cached(vma->vm_page_prot);
pfn = vmalloc_to_pfn(ctx->csa.lscsa->ls + offset);
} else {
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE);
+ vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
}
vm_insert_pfn(vma, address, pfn);
spu_release(ctx);
- return NOPFN_REFAULT;
+ return VM_FAULT_NOPAGE;
}
+static int spufs_mem_mmap_access(struct vm_area_struct *vma,
+ unsigned long address,
+ void *buf, int len, int write)
+{
+ struct spu_context *ctx = vma->vm_file->private_data;
+ unsigned long offset = address - vma->vm_start;
+ char *local_store;
-static struct vm_operations_struct spufs_mem_mmap_vmops = {
- .nopfn = spufs_mem_mmap_nopfn,
+ if (write && !(vma->vm_flags & VM_WRITE))
+ return -EACCES;
+ if (spu_acquire(ctx))
+ return -EINTR;
+ if ((offset + len) > vma->vm_end)
+ len = vma->vm_end - offset;
+ local_store = ctx->ops->get_ls(ctx);
+ if (write)
+ memcpy_toio(local_store + offset, buf, len);
+ else
+ memcpy_fromio(buf, local_store + offset, len);
+ spu_release(ctx);
+ return len;
+}
+
+static const struct vm_operations_struct spufs_mem_mmap_vmops = {
+ .fault = spufs_mem_mmap_fault,
+ .access = spufs_mem_mmap_access,
};
-static int
-spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
-{
+static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+#ifdef CONFIG_SPU_FS_64K_LS
+ struct spu_context *ctx = file->private_data;
+ struct spu_state *csa = &ctx->csa;
+
+ /* Sanity check VMA alignment */
+ if (csa->use_big_pages) {
+ pr_debug("spufs_mem_mmap 64K, start=0x%lx, end=0x%lx,"
+ " pgoff=0x%lx\n", vma->vm_start, vma->vm_end,
+ vma->vm_pgoff);
+ if (vma->vm_start & 0xffff)
+ return -EINVAL;
+ if (vma->vm_pgoff & 0xf)
+ return -EINVAL;
+ }
+#endif /* CONFIG_SPU_FS_64K_LS */
+
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE);
+ vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
vma->vm_ops = &spufs_mem_mmap_vmops;
return 0;
}
+#ifdef CONFIG_SPU_FS_64K_LS
+static unsigned long spufs_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct spu_context *ctx = file->private_data;
+ struct spu_state *csa = &ctx->csa;
+
+ /* If not using big pages, fallback to normal MM g_u_a */
+ if (!csa->use_big_pages)
+ return current->mm->get_unmapped_area(file, addr, len,
+ pgoff, flags);
+
+ /* Else, try to obtain a 64K pages slice */
+ return slice_get_unmapped_area(addr, len, flags,
+ MMU_PAGE_64K, 1);
+}
+#endif /* CONFIG_SPU_FS_64K_LS */
+
static const struct file_operations spufs_mem_fops = {
- .open = spufs_mem_open,
- .read = spufs_mem_read,
- .write = spufs_mem_write,
- .llseek = generic_file_llseek,
- .mmap = spufs_mem_mmap,
+ .open = spufs_mem_open,
+ .release = spufs_mem_release,
+ .read = spufs_mem_read,
+ .write = spufs_mem_write,
+ .llseek = generic_file_llseek,
+ .mmap = spufs_mem_mmap,
+#ifdef CONFIG_SPU_FS_64K_LS
+ .get_unmapped_area = spufs_get_unmapped_area,
+#endif
};
-static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma,
- unsigned long address,
+static int spufs_ps_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf,
unsigned long ps_offs,
unsigned long ps_size)
{
struct spu_context *ctx = vma->vm_file->private_data;
- unsigned long area, offset = address - vma->vm_start;
- int ret;
+ unsigned long area, offset = vmf->pgoff << PAGE_SHIFT;
+ int ret = 0;
+
+ spu_context_nospu_trace(spufs_ps_fault__enter, ctx);
- offset += vma->vm_pgoff << PAGE_SHIFT;
if (offset >= ps_size)
- return NOPFN_SIGBUS;
+ return VM_FAULT_SIGBUS;
- /* error here usually means a signal.. we might want to test
- * the error code more precisely though
+ if (fatal_signal_pending(current))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Because we release the mmap_sem, the context may be destroyed while
+ * we're in spu_wait. Grab an extra reference so it isn't destroyed
+ * in the meantime.
*/
- ret = spu_acquire_runnable(ctx, 0);
- if (ret)
- return NOPFN_REFAULT;
+ get_spu_context(ctx);
- area = ctx->spu->problem_phys + ps_offs;
- vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
- spu_release(ctx);
+ /*
+ * We have to wait for context to be loaded before we have
+ * pages to hand out to the user, but we don't want to wait
+ * with the mmap_sem held.
+ * It is possible to drop the mmap_sem here, but then we need
+ * to return VM_FAULT_NOPAGE because the mappings may have
+ * changed.
+ */
+ if (spu_acquire(ctx))
+ goto refault;
+
+ if (ctx->state == SPU_STATE_SAVED) {
+ up_read(&current->mm->mmap_sem);
+ spu_context_nospu_trace(spufs_ps_fault__sleep, ctx);
+ ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+ spu_context_trace(spufs_ps_fault__wake, ctx, ctx->spu);
+ down_read(&current->mm->mmap_sem);
+ } else {
+ area = ctx->spu->problem_phys + ps_offs;
+ vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+ (area + offset) >> PAGE_SHIFT);
+ spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
+ }
+
+ if (!ret)
+ spu_release(ctx);
- return NOPFN_REFAULT;
+refault:
+ put_spu_context(ctx);
+ return VM_FAULT_NOPAGE;
}
#if SPUFS_MMAP_4K
-static unsigned long spufs_cntl_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int spufs_cntl_mmap_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
- return spufs_ps_nopfn(vma, address, 0x4000, 0x1000);
+ return spufs_ps_fault(vma, vmf, 0x4000, SPUFS_CNTL_MAP_SIZE);
}
-static struct vm_operations_struct spufs_cntl_mmap_vmops = {
- .nopfn = spufs_cntl_mmap_nopfn,
+static const struct vm_operations_struct spufs_cntl_mmap_vmops = {
+ .fault = spufs_cntl_mmap_fault,
};
/*
@@ -198,8 +443,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE | _PAGE_GUARDED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vma->vm_ops = &spufs_cntl_mmap_vmops;
return 0;
@@ -208,25 +452,32 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma)
#define spufs_cntl_mmap NULL
#endif /* !SPUFS_MMAP_4K */
-static u64 spufs_cntl_get(void *data)
+static int spufs_cntl_get(void *data, u64 *val)
{
struct spu_context *ctx = data;
- u64 val;
+ int ret;
- spu_acquire(ctx);
- val = ctx->ops->status_read(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
+ *val = ctx->ops->status_read(ctx);
spu_release(ctx);
- return val;
+ return 0;
}
-static void spufs_cntl_set(void *data, u64 val)
+static int spufs_cntl_set(void *data, u64 val)
{
struct spu_context *ctx = data;
+ int ret;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ctx->ops->runcntl_write(ctx, val);
spu_release(ctx);
+
+ return 0;
}
static int spufs_cntl_open(struct inode *inode, struct file *file)
@@ -234,18 +485,36 @@ static int spufs_cntl_open(struct inode *inode, struct file *file)
struct spufs_inode_info *i = SPUFS_I(inode);
struct spu_context *ctx = i->i_ctx;
+ mutex_lock(&ctx->mapping_lock);
file->private_data = ctx;
- ctx->cntl = inode->i_mapping;
- smp_wmb();
+ if (!i->i_openers++)
+ ctx->cntl = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
return simple_attr_open(inode, file, spufs_cntl_get,
spufs_cntl_set, "0x%08lx");
}
+static int
+spufs_cntl_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ simple_attr_release(inode, file);
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->cntl = NULL;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
static const struct file_operations spufs_cntl_fops = {
.open = spufs_cntl_open,
- .release = simple_attr_close,
+ .release = spufs_cntl_release,
.read = simple_attr_read,
.write = simple_attr_write,
+ .llseek = generic_file_llseek,
.mmap = spufs_cntl_mmap,
};
@@ -273,9 +542,16 @@ spufs_regs_read(struct file *file, char __user *buffer,
int ret;
struct spu_context *ctx = file->private_data;
- spu_acquire_saved(ctx);
+ /* pre-check for file position: if we'd return EOF, there's no point
+ * causing a deschedule */
+ if (*pos >= sizeof(ctx->csa.lscsa->gprs))
+ return 0;
+
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
ret = __spufs_regs_read(ctx, buffer, size, pos);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -287,18 +563,18 @@ spufs_regs_write(struct file *file, const char __user *buffer,
struct spu_lscsa *lscsa = ctx->csa.lscsa;
int ret;
- size = min_t(ssize_t, sizeof lscsa->gprs - *pos, size);
- if (size <= 0)
+ if (*pos >= sizeof(lscsa->gprs))
return -EFBIG;
- *pos += size;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
- ret = copy_from_user(lscsa->gprs + *pos - size,
- buffer, size) ? -EFAULT : size;
+ size = simple_write_to_buffer(lscsa->gprs, sizeof(lscsa->gprs), pos,
+ buffer, size);
- spu_release(ctx);
- return ret;
+ spu_release_saved(ctx);
+ return size;
}
static const struct file_operations spufs_regs_fops = {
@@ -324,9 +600,11 @@ spufs_fpcr_read(struct file *file, char __user * buffer,
int ret;
struct spu_context *ctx = file->private_data;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
ret = __spufs_fpcr_read(ctx, buffer, size, pos);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -338,18 +616,18 @@ spufs_fpcr_write(struct file *file, const char __user * buffer,
struct spu_lscsa *lscsa = ctx->csa.lscsa;
int ret;
- size = min_t(ssize_t, sizeof(lscsa->fpcr) - *pos, size);
- if (size <= 0)
+ if (*pos >= sizeof(lscsa->fpcr))
return -EFBIG;
- *pos += size;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
- ret = copy_from_user((char *)&lscsa->fpcr + *pos - size,
- buffer, size) ? -EFAULT : size;
+ size = simple_write_to_buffer(&lscsa->fpcr, sizeof(lscsa->fpcr), pos,
+ buffer, size);
- spu_release(ctx);
- return ret;
+ spu_release_saved(ctx);
+ return size;
}
static const struct file_operations spufs_fpcr_fops = {
@@ -391,7 +669,10 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
udata = (void __user *)buf;
- spu_acquire(ctx);
+ count = spu_acquire(ctx);
+ if (count)
+ return count;
+
for (count = 0; (count + 4) <= len; count += 4, udata++) {
int ret;
ret = ctx->ops->mbox_read(ctx, &mbox_data);
@@ -421,18 +702,22 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
static const struct file_operations spufs_mbox_fops = {
.open = spufs_pipe_open,
.read = spufs_mbox_read,
+ .llseek = no_llseek,
};
static ssize_t spufs_mbox_stat_read(struct file *file, char __user *buf,
size_t len, loff_t *pos)
{
struct spu_context *ctx = file->private_data;
+ ssize_t ret;
u32 mbox_stat;
if (len < 4)
return -EINVAL;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
mbox_stat = ctx->ops->mbox_stat_read(ctx) & 0xff;
@@ -447,6 +732,7 @@ static ssize_t spufs_mbox_stat_read(struct file *file, char __user *buf,
static const struct file_operations spufs_mbox_stat_fops = {
.open = spufs_pipe_open,
.read = spufs_mbox_stat_read,
+ .llseek = no_llseek,
};
/* low-level ibox access function */
@@ -467,6 +753,9 @@ void spufs_ibox_callback(struct spu *spu)
{
struct spu_context *ctx = spu->ctx;
+ if (!ctx)
+ return;
+
wake_up_all(&ctx->ibox_wq);
kill_fasync(&ctx->ibox_fasync, SIGIO, POLLIN);
}
@@ -498,23 +787,27 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
udata = (void __user *)buf;
- spu_acquire(ctx);
+ count = spu_acquire(ctx);
+ if (count)
+ goto out;
/* wait only for the first element */
count = 0;
if (file->f_flags & O_NONBLOCK) {
- if (!spu_ibox_read(ctx, &ibox_data))
+ if (!spu_ibox_read(ctx, &ibox_data)) {
count = -EAGAIN;
+ goto out_unlock;
+ }
} else {
count = spufs_wait(ctx->ibox_wq, spu_ibox_read(ctx, &ibox_data));
+ if (count)
+ goto out;
}
- if (count)
- goto out;
/* if we can't write at all, return -EFAULT */
count = __put_user(ibox_data, udata);
if (count)
- goto out;
+ goto out_unlock;
for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) {
int ret;
@@ -531,9 +824,9 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
break;
}
-out:
+out_unlock:
spu_release(ctx);
-
+out:
return count;
}
@@ -544,7 +837,11 @@ static unsigned int spufs_ibox_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->ibox_wq, wait);
- spu_acquire(ctx);
+ /*
+ * For now keep this uninterruptible and also ignore the rule
+ * that poll should not sleep. Will be fixed later.
+ */
+ mutex_lock(&ctx->state_mutex);
mask = ctx->ops->mbox_stat_poll(ctx, POLLIN | POLLRDNORM);
spu_release(ctx);
@@ -556,18 +853,22 @@ static const struct file_operations spufs_ibox_fops = {
.read = spufs_ibox_read,
.poll = spufs_ibox_poll,
.fasync = spufs_ibox_fasync,
+ .llseek = no_llseek,
};
static ssize_t spufs_ibox_stat_read(struct file *file, char __user *buf,
size_t len, loff_t *pos)
{
struct spu_context *ctx = file->private_data;
+ ssize_t ret;
u32 ibox_stat;
if (len < 4)
return -EINVAL;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ibox_stat = (ctx->ops->mbox_stat_read(ctx) >> 16) & 0xff;
spu_release(ctx);
@@ -580,6 +881,7 @@ static ssize_t spufs_ibox_stat_read(struct file *file, char __user *buf,
static const struct file_operations spufs_ibox_stat_fops = {
.open = spufs_pipe_open,
.read = spufs_ibox_stat_read,
+ .llseek = no_llseek,
};
/* low-level mailbox write */
@@ -603,6 +905,9 @@ void spufs_wbox_callback(struct spu *spu)
{
struct spu_context *ctx = spu->ctx;
+ if (!ctx)
+ return;
+
wake_up_all(&ctx->wbox_wq);
kill_fasync(&ctx->wbox_fasync, SIGIO, POLLOUT);
}
@@ -636,7 +941,9 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
if (__get_user(wbox_data, udata))
return -EFAULT;
- spu_acquire(ctx);
+ count = spu_acquire(ctx);
+ if (count)
+ goto out;
/*
* make sure we can at least write one element, by waiting
@@ -644,16 +951,18 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
*/
count = 0;
if (file->f_flags & O_NONBLOCK) {
- if (!spu_wbox_write(ctx, wbox_data))
+ if (!spu_wbox_write(ctx, wbox_data)) {
count = -EAGAIN;
+ goto out_unlock;
+ }
} else {
count = spufs_wait(ctx->wbox_wq, spu_wbox_write(ctx, wbox_data));
+ if (count)
+ goto out;
}
- if (count)
- goto out;
- /* write aѕ much as possible */
+ /* write as much as possible */
for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) {
int ret;
ret = __get_user(wbox_data, udata);
@@ -665,8 +974,9 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
break;
}
-out:
+out_unlock:
spu_release(ctx);
+out:
return count;
}
@@ -677,7 +987,11 @@ static unsigned int spufs_wbox_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->wbox_wq, wait);
- spu_acquire(ctx);
+ /*
+ * For now keep this uninterruptible and also ignore the rule
+ * that poll should not sleep. Will be fixed later.
+ */
+ mutex_lock(&ctx->state_mutex);
mask = ctx->ops->mbox_stat_poll(ctx, POLLOUT | POLLWRNORM);
spu_release(ctx);
@@ -689,18 +1003,22 @@ static const struct file_operations spufs_wbox_fops = {
.write = spufs_wbox_write,
.poll = spufs_wbox_poll,
.fasync = spufs_wbox_fasync,
+ .llseek = no_llseek,
};
static ssize_t spufs_wbox_stat_read(struct file *file, char __user *buf,
size_t len, loff_t *pos)
{
struct spu_context *ctx = file->private_data;
+ ssize_t ret;
u32 wbox_stat;
if (len < 4)
return -EINVAL;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
wbox_stat = (ctx->ops->mbox_stat_read(ctx) >> 8) & 0xff;
spu_release(ctx);
@@ -713,18 +1031,35 @@ static ssize_t spufs_wbox_stat_read(struct file *file, char __user *buf,
static const struct file_operations spufs_wbox_stat_fops = {
.open = spufs_pipe_open,
.read = spufs_wbox_stat_read,
+ .llseek = no_llseek,
};
static int spufs_signal1_open(struct inode *inode, struct file *file)
{
struct spufs_inode_info *i = SPUFS_I(inode);
struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
file->private_data = ctx;
- ctx->signal1 = inode->i_mapping;
- smp_wmb();
+ if (!i->i_openers++)
+ ctx->signal1 = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
return nonseekable_open(inode, file);
}
+static int
+spufs_signal1_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->signal1 = NULL;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
static ssize_t __spufs_signal1_read(struct spu_context *ctx, char __user *buf,
size_t len, loff_t *pos)
{
@@ -755,9 +1090,11 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
int ret;
struct spu_context *ctx = file->private_data;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
ret = __spufs_signal1_read(ctx, buf, len, pos);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -766,6 +1103,7 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
size_t len, loff_t *pos)
{
struct spu_context *ctx;
+ ssize_t ret;
u32 data;
ctx = file->private_data;
@@ -776,30 +1114,32 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
if (copy_from_user(&data, buf, 4))
return -EFAULT;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ctx->ops->signal1_write(ctx, data);
spu_release(ctx);
return 4;
}
-static unsigned long spufs_signal1_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int
+spufs_signal1_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
-#if PAGE_SIZE == 0x1000
- return spufs_ps_nopfn(vma, address, 0x14000, 0x1000);
-#elif PAGE_SIZE == 0x10000
+#if SPUFS_SIGNAL_MAP_SIZE == 0x1000
+ return spufs_ps_fault(vma, vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE);
+#elif SPUFS_SIGNAL_MAP_SIZE == 0x10000
/* For 64k pages, both signal1 and signal2 can be used to mmap the whole
* signal 1 and 2 area
*/
- return spufs_ps_nopfn(vma, address, 0x10000, 0x10000);
+ return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE);
#else
#error unsupported page size
#endif
}
-static struct vm_operations_struct spufs_signal1_mmap_vmops = {
- .nopfn = spufs_signal1_mmap_nopfn,
+static const struct vm_operations_struct spufs_signal1_mmap_vmops = {
+ .fault = spufs_signal1_mmap_fault,
};
static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma)
@@ -808,8 +1148,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE | _PAGE_GUARDED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vma->vm_ops = &spufs_signal1_mmap_vmops;
return 0;
@@ -817,21 +1156,47 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma)
static const struct file_operations spufs_signal1_fops = {
.open = spufs_signal1_open,
+ .release = spufs_signal1_release,
.read = spufs_signal1_read,
.write = spufs_signal1_write,
.mmap = spufs_signal1_mmap,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations spufs_signal1_nosched_fops = {
+ .open = spufs_signal1_open,
+ .release = spufs_signal1_release,
+ .write = spufs_signal1_write,
+ .mmap = spufs_signal1_mmap,
+ .llseek = no_llseek,
};
static int spufs_signal2_open(struct inode *inode, struct file *file)
{
struct spufs_inode_info *i = SPUFS_I(inode);
struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
file->private_data = ctx;
- ctx->signal2 = inode->i_mapping;
- smp_wmb();
+ if (!i->i_openers++)
+ ctx->signal2 = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
return nonseekable_open(inode, file);
}
+static int
+spufs_signal2_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->signal2 = NULL;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
static ssize_t __spufs_signal2_read(struct spu_context *ctx, char __user *buf,
size_t len, loff_t *pos)
{
@@ -862,9 +1227,11 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
struct spu_context *ctx = file->private_data;
int ret;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
ret = __spufs_signal2_read(ctx, buf, len, pos);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -873,6 +1240,7 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
size_t len, loff_t *pos)
{
struct spu_context *ctx;
+ ssize_t ret;
u32 data;
ctx = file->private_data;
@@ -883,7 +1251,9 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
if (copy_from_user(&data, buf, 4))
return -EFAULT;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ctx->ops->signal2_write(ctx, data);
spu_release(ctx);
@@ -891,23 +1261,23 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
}
#if SPUFS_MMAP_4K
-static unsigned long spufs_signal2_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int
+spufs_signal2_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
-#if PAGE_SIZE == 0x1000
- return spufs_ps_nopfn(vma, address, 0x1c000, 0x1000);
-#elif PAGE_SIZE == 0x10000
+#if SPUFS_SIGNAL_MAP_SIZE == 0x1000
+ return spufs_ps_fault(vma, vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE);
+#elif SPUFS_SIGNAL_MAP_SIZE == 0x10000
/* For 64k pages, both signal1 and signal2 can be used to mmap the whole
* signal 1 and 2 area
*/
- return spufs_ps_nopfn(vma, address, 0x10000, 0x10000);
+ return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE);
#else
#error unsupported page size
#endif
}
-static struct vm_operations_struct spufs_signal2_mmap_vmops = {
- .nopfn = spufs_signal2_mmap_nopfn,
+static const struct vm_operations_struct spufs_signal2_mmap_vmops = {
+ .fault = spufs_signal2_mmap_fault,
};
static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma)
@@ -916,8 +1286,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE | _PAGE_GUARDED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vma->vm_ops = &spufs_signal2_mmap_vmops;
return 0;
@@ -928,78 +1297,107 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma)
static const struct file_operations spufs_signal2_fops = {
.open = spufs_signal2_open,
+ .release = spufs_signal2_release,
.read = spufs_signal2_read,
.write = spufs_signal2_write,
.mmap = spufs_signal2_mmap,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations spufs_signal2_nosched_fops = {
+ .open = spufs_signal2_open,
+ .release = spufs_signal2_release,
+ .write = spufs_signal2_write,
+ .mmap = spufs_signal2_mmap,
+ .llseek = no_llseek,
};
-static void spufs_signal1_type_set(void *data, u64 val)
+/*
+ * This is a wrapper around DEFINE_SIMPLE_ATTRIBUTE which does the
+ * work of acquiring (or not) the SPU context before calling through
+ * to the actual get routine. The set routine is called directly.
+ */
+#define SPU_ATTR_NOACQUIRE 0
+#define SPU_ATTR_ACQUIRE 1
+#define SPU_ATTR_ACQUIRE_SAVED 2
+
+#define DEFINE_SPUFS_ATTRIBUTE(__name, __get, __set, __fmt, __acquire) \
+static int __##__get(void *data, u64 *val) \
+{ \
+ struct spu_context *ctx = data; \
+ int ret = 0; \
+ \
+ if (__acquire == SPU_ATTR_ACQUIRE) { \
+ ret = spu_acquire(ctx); \
+ if (ret) \
+ return ret; \
+ *val = __get(ctx); \
+ spu_release(ctx); \
+ } else if (__acquire == SPU_ATTR_ACQUIRE_SAVED) { \
+ ret = spu_acquire_saved(ctx); \
+ if (ret) \
+ return ret; \
+ *val = __get(ctx); \
+ spu_release_saved(ctx); \
+ } else \
+ *val = __get(ctx); \
+ \
+ return 0; \
+} \
+DEFINE_SIMPLE_ATTRIBUTE(__name, __##__get, __set, __fmt);
+
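[Editor's note: to make the macro above concrete, hand-expanding one of the uses that follow -- DEFINE_SPUFS_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set, "0x%llx\n", SPU_ATTR_ACQUIRE) -- gives roughly the code below. The __acquire comparisons are compile-time constants, so only the selected branch survives; this is a sketch, not literal preprocessor output:

static int __spufs_npc_get(void *data, u64 *val)
{
	struct spu_context *ctx = data;
	int ret;

	ret = spu_acquire(ctx);		/* the SPU_ATTR_ACQUIRE branch */
	if (ret)
		return ret;
	*val = spufs_npc_get(ctx);
	spu_release(ctx);
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(spufs_npc_ops, __spufs_npc_get, spufs_npc_set,
			"0x%llx\n");
]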
+static int spufs_signal1_type_set(void *data, u64 val)
{
struct spu_context *ctx = data;
+ int ret;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ctx->ops->signal1_type_set(ctx, val);
spu_release(ctx);
+
+ return 0;
}
-static u64 __spufs_signal1_type_get(void *data)
+static u64 spufs_signal1_type_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
return ctx->ops->signal1_type_get(ctx);
}
+DEFINE_SPUFS_ATTRIBUTE(spufs_signal1_type, spufs_signal1_type_get,
+ spufs_signal1_type_set, "%llu\n", SPU_ATTR_ACQUIRE);
-static u64 spufs_signal1_type_get(void *data)
-{
- struct spu_context *ctx = data;
- u64 ret;
-
- spu_acquire(ctx);
- ret = __spufs_signal1_type_get(data);
- spu_release(ctx);
- return ret;
-}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_signal1_type, spufs_signal1_type_get,
- spufs_signal1_type_set, "%llu");
-
-static void spufs_signal2_type_set(void *data, u64 val)
+static int spufs_signal2_type_set(void *data, u64 val)
{
struct spu_context *ctx = data;
+ int ret;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ctx->ops->signal2_type_set(ctx, val);
spu_release(ctx);
-}
-static u64 __spufs_signal2_type_get(void *data)
-{
- struct spu_context *ctx = data;
- return ctx->ops->signal2_type_get(ctx);
+ return 0;
}
-static u64 spufs_signal2_type_get(void *data)
+static u64 spufs_signal2_type_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
- u64 ret;
-
- spu_acquire(ctx);
- ret = __spufs_signal2_type_get(data);
- spu_release(ctx);
-
- return ret;
+ return ctx->ops->signal2_type_get(ctx);
}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get,
- spufs_signal2_type_set, "%llu");
+DEFINE_SPUFS_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get,
+ spufs_signal2_type_set, "%llu\n", SPU_ATTR_ACQUIRE);
#if SPUFS_MMAP_4K
-static unsigned long spufs_mss_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int
+spufs_mss_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return spufs_ps_nopfn(vma, address, 0x0000, 0x1000);
+ return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_MSS_MAP_SIZE);
}
-static struct vm_operations_struct spufs_mss_mmap_vmops = {
- .nopfn = spufs_mss_mmap_nopfn,
+static const struct vm_operations_struct spufs_mss_mmap_vmops = {
+ .fault = spufs_mss_mmap_fault,
};
/*
@@ -1011,8 +1409,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE | _PAGE_GUARDED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vma->vm_ops = &spufs_mss_mmap_vmops;
return 0;
@@ -1027,24 +1424,42 @@ static int spufs_mss_open(struct inode *inode, struct file *file)
struct spu_context *ctx = i->i_ctx;
file->private_data = i->i_ctx;
- ctx->mss = inode->i_mapping;
- smp_wmb();
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!i->i_openers++)
+ ctx->mss = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
return nonseekable_open(inode, file);
}
+static int
+spufs_mss_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->mss = NULL;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
static const struct file_operations spufs_mss_fops = {
.open = spufs_mss_open,
+ .release = spufs_mss_release,
.mmap = spufs_mss_mmap,
+ .llseek = no_llseek,
};
-static unsigned long spufs_psmap_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int
+spufs_psmap_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return spufs_ps_nopfn(vma, address, 0x0000, 0x20000);
+ return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_PS_MAP_SIZE);
}
-static struct vm_operations_struct spufs_psmap_mmap_vmops = {
- .nopfn = spufs_psmap_mmap_nopfn,
+static const struct vm_operations_struct spufs_psmap_mmap_vmops = {
+ .fault = spufs_psmap_mmap_fault,
};
/*
@@ -1056,8 +1471,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE | _PAGE_GUARDED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vma->vm_ops = &spufs_psmap_mmap_vmops;
return 0;
@@ -1068,27 +1482,44 @@ static int spufs_psmap_open(struct inode *inode, struct file *file)
struct spufs_inode_info *i = SPUFS_I(inode);
struct spu_context *ctx = i->i_ctx;
+ mutex_lock(&ctx->mapping_lock);
file->private_data = i->i_ctx;
- ctx->psmap = inode->i_mapping;
- smp_wmb();
+ if (!i->i_openers++)
+ ctx->psmap = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
return nonseekable_open(inode, file);
}
+static int
+spufs_psmap_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->psmap = NULL;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
static const struct file_operations spufs_psmap_fops = {
.open = spufs_psmap_open,
+ .release = spufs_psmap_release,
.mmap = spufs_psmap_mmap,
+ .llseek = no_llseek,
};
#if SPUFS_MMAP_4K
-static unsigned long spufs_mfc_mmap_nopfn(struct vm_area_struct *vma,
- unsigned long address)
+static int
+spufs_mfc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return spufs_ps_nopfn(vma, address, 0x3000, 0x1000);
+ return spufs_ps_fault(vma, vmf, 0x3000, SPUFS_MFC_MAP_SIZE);
}
-static struct vm_operations_struct spufs_mfc_mmap_vmops = {
- .nopfn = spufs_mfc_mmap_nopfn,
+static const struct vm_operations_struct spufs_mfc_mmap_vmops = {
+ .fault = spufs_mfc_mmap_fault,
};
/*
@@ -1100,8 +1531,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP;
- vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
- | _PAGE_NO_CACHE | _PAGE_GUARDED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vma->vm_ops = &spufs_mfc_mmap_vmops;
return 0;
@@ -1122,20 +1552,38 @@ static int spufs_mfc_open(struct inode *inode, struct file *file)
if (atomic_read(&inode->i_count) != 1)
return -EBUSY;
+ mutex_lock(&ctx->mapping_lock);
file->private_data = ctx;
- ctx->mfc = inode->i_mapping;
- smp_wmb();
+ if (!i->i_openers++)
+ ctx->mfc = inode->i_mapping;
+ mutex_unlock(&ctx->mapping_lock);
return nonseekable_open(inode, file);
}
+static int
+spufs_mfc_release(struct inode *inode, struct file *file)
+{
+ struct spufs_inode_info *i = SPUFS_I(inode);
+ struct spu_context *ctx = i->i_ctx;
+
+ mutex_lock(&ctx->mapping_lock);
+ if (!--i->i_openers)
+ ctx->mfc = NULL;
+ mutex_unlock(&ctx->mapping_lock);
+ return 0;
+}
+
/* interrupt-level mfc callback function. */
void spufs_mfc_callback(struct spu *spu)
{
struct spu_context *ctx = spu->ctx;
+ if (!ctx)
+ return;
+
wake_up_all(&ctx->mfc_wq);
- pr_debug("%s %s\n", __FUNCTION__, spu->name);
+ pr_debug("%s %s\n", __func__, spu->name);
if (ctx->mfc_fasync) {
u32 free_elements, tagstatus;
unsigned int mask;
@@ -1179,22 +1627,26 @@ static ssize_t spufs_mfc_read(struct file *file, char __user *buffer,
if (size != 4)
goto out;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
if (file->f_flags & O_NONBLOCK) {
status = ctx->ops->read_mfc_tagstatus(ctx);
if (!(status & ctx->tagwait))
ret = -EAGAIN;
else
+ /* XXX(hch): shouldn't we clear ret here? */
ctx->tagwait &= ~status;
} else {
ret = spufs_wait(ctx->mfc_wq,
spufs_read_mfc_tagstatus(ctx, &status));
+ if (ret)
+ goto out;
}
spu_release(ctx);
- if (ret)
- goto out;
-
ret = 4;
if (copy_to_user(buffer, &status, 4))
ret = -EFAULT;
@@ -1205,7 +1657,7 @@ out:
static int spufs_check_valid_dma(struct mfc_dma_command *cmd)
{
- pr_debug("queueing DMA %x %lx %x %x %x\n", cmd->lsa,
+ pr_debug("queueing DMA %x %llx %x %x %x\n", cmd->lsa,
cmd->ea, cmd->size, cmd->tag, cmd->cmd);
switch (cmd->cmd) {
@@ -1222,7 +1674,7 @@ static int spufs_check_valid_dma(struct mfc_dma_command *cmd)
}
 if ((cmd->lsa & 0xf) != (cmd->ea & 0xf)) {
- pr_debug("invalid DMA alignment, ea %lx lsa %x\n",
+ pr_debug("invalid DMA alignment, ea %llx lsa %x\n",
cmd->ea, cmd->lsa);
return -EIO;
}
@@ -1309,24 +1761,34 @@ static ssize_t spufs_mfc_write(struct file *file, const char __user *buffer,
if (ret)
goto out;
- spu_acquire_runnable(ctx, 0);
+ ret = spu_acquire(ctx);
+ if (ret)
+ goto out;
+
+ ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+ if (ret)
+ goto out;
+
if (file->f_flags & O_NONBLOCK) {
ret = ctx->ops->send_mfc_command(ctx, &cmd);
} else {
int status;
ret = spufs_wait(ctx->mfc_wq,
spu_send_mfc_command(ctx, cmd, &status));
+ if (ret)
+ goto out;
if (status)
ret = status;
}
- spu_release(ctx);
if (ret)
- goto out;
+ goto out_unlock;
ctx->tagwait |= 1 << cmd.tag;
ret = size;
+out_unlock:
+ spu_release(ctx);
out:
return ret;
}
@@ -1337,21 +1799,25 @@ static unsigned int spufs_mfc_poll(struct file *file,poll_table *wait)
u32 free_elements, tagstatus;
unsigned int mask;
- spu_acquire(ctx);
+ poll_wait(file, &ctx->mfc_wq, wait);
+
+ /*
+ * For now keep this uninterruptible and also ignore the rule
+ * that poll should not sleep. Will be fixed later.
+ */
+ mutex_lock(&ctx->state_mutex);
ctx->ops->set_mfc_query(ctx, ctx->tagwait, 2);
free_elements = ctx->ops->get_mfc_free_elements(ctx);
tagstatus = ctx->ops->read_mfc_tagstatus(ctx);
spu_release(ctx);
- poll_wait(file, &ctx->mfc_wq, wait);
-
mask = 0;
if (free_elements & 0xffff)
mask |= POLLOUT | POLLWRNORM;
if (tagstatus & ctx->tagwait)
mask |= POLLIN | POLLRDNORM;
- pr_debug("%s: free %d tagstatus %d tagwait %d\n", __FUNCTION__,
+ pr_debug("%s: free %d tagstatus %d tagwait %d\n", __func__,
free_elements, tagstatus, ctx->tagwait);
return mask;
@@ -1362,7 +1828,9 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id)
struct spu_context *ctx = file->private_data;
int ret;
- spu_acquire(ctx);
+ ret = spu_acquire(ctx);
+ if (ret)
+ goto out;
#if 0
/* this currently hangs */
ret = spufs_wait(ctx->mfc_wq,
@@ -1371,19 +1839,26 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id)
goto out;
ret = spufs_wait(ctx->mfc_wq,
ctx->ops->read_mfc_tagstatus(ctx) == ctx->tagwait);
-out:
+ if (ret)
+ goto out;
#else
ret = 0;
#endif
spu_release(ctx);
-
+out:
return ret;
}
-static int spufs_mfc_fsync(struct file *file, struct dentry *dentry,
- int datasync)
+static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- return spufs_mfc_flush(file, NULL);
+ struct inode *inode = file_inode(file);
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (!err) {
+ mutex_lock(&inode->i_mutex);
+ err = spufs_mfc_flush(file, NULL);
+ mutex_unlock(&inode->i_mutex);
+ }
+ return err;
}
static int spufs_mfc_fasync(int fd, struct file *file, int on)
@@ -1395,6 +1870,7 @@ static int spufs_mfc_fasync(int fd, struct file *file, int on)
static const struct file_operations spufs_mfc_fops = {
.open = spufs_mfc_open,
+ .release = spufs_mfc_release,
.read = spufs_mfc_read,
.write = spufs_mfc_write,
.poll = spufs_mfc_poll,
@@ -1402,115 +1878,108 @@ static const struct file_operations spufs_mfc_fops = {
.fsync = spufs_mfc_fsync,
.fasync = spufs_mfc_fasync,
.mmap = spufs_mfc_mmap,
+ .llseek = no_llseek,
};
-static void spufs_npc_set(void *data, u64 val)
+static int spufs_npc_set(void *data, u64 val)
{
struct spu_context *ctx = data;
- spu_acquire(ctx);
+ int ret;
+
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
ctx->ops->npc_write(ctx, val);
spu_release(ctx);
+
+ return 0;
}
-static u64 spufs_npc_get(void *data)
+static u64 spufs_npc_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
- u64 ret;
- spu_acquire(ctx);
- ret = ctx->ops->npc_read(ctx);
- spu_release(ctx);
- return ret;
+ return ctx->ops->npc_read(ctx);
}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set,
- "0x%llx\n")
+DEFINE_SPUFS_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set,
+ "0x%llx\n", SPU_ATTR_ACQUIRE);
-static void spufs_decr_set(void *data, u64 val)
+static int spufs_decr_set(void *data, u64 val)
{
struct spu_context *ctx = data;
struct spu_lscsa *lscsa = ctx->csa.lscsa;
- spu_acquire_saved(ctx);
+ int ret;
+
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
lscsa->decr.slot[0] = (u32) val;
- spu_release(ctx);
+ spu_release_saved(ctx);
+
+ return 0;
}
-static u64 __spufs_decr_get(void *data)
+static u64 spufs_decr_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
struct spu_lscsa *lscsa = ctx->csa.lscsa;
return lscsa->decr.slot[0];
}
+DEFINE_SPUFS_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
+ "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED);
-static u64 spufs_decr_get(void *data)
+static int spufs_decr_status_set(void *data, u64 val)
{
struct spu_context *ctx = data;
- u64 ret;
- spu_acquire_saved(ctx);
- ret = __spufs_decr_get(data);
- spu_release(ctx);
- return ret;
-}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
- "0x%llx\n")
+ int ret;
-static void spufs_decr_status_set(void *data, u64 val)
-{
- struct spu_context *ctx = data;
- struct spu_lscsa *lscsa = ctx->csa.lscsa;
- spu_acquire_saved(ctx);
- lscsa->decr_status.slot[0] = (u32) val;
- spu_release(ctx);
-}
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
+ if (val)
+ ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
+ else
+ ctx->csa.priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
+ spu_release_saved(ctx);
-static u64 __spufs_decr_status_get(void *data)
-{
- struct spu_context *ctx = data;
- struct spu_lscsa *lscsa = ctx->csa.lscsa;
- return lscsa->decr_status.slot[0];
+ return 0;
}
-static u64 spufs_decr_status_get(void *data)
+static u64 spufs_decr_status_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
- u64 ret;
- spu_acquire_saved(ctx);
- ret = __spufs_decr_status_get(data);
- spu_release(ctx);
- return ret;
+ if (ctx->csa.priv2.mfc_control_RW & MFC_CNTL_DECREMENTER_RUNNING)
+ return SPU_DECR_STATUS_RUNNING;
+ else
+ return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
- spufs_decr_status_set, "0x%llx\n")
+DEFINE_SPUFS_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
+ spufs_decr_status_set, "0x%llx\n",
+ SPU_ATTR_ACQUIRE_SAVED);
-static void spufs_event_mask_set(void *data, u64 val)
+static int spufs_event_mask_set(void *data, u64 val)
{
struct spu_context *ctx = data;
struct spu_lscsa *lscsa = ctx->csa.lscsa;
- spu_acquire_saved(ctx);
+ int ret;
+
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
lscsa->event_mask.slot[0] = (u32) val;
- spu_release(ctx);
+ spu_release_saved(ctx);
+
+ return 0;
}
-static u64 __spufs_event_mask_get(void *data)
+static u64 spufs_event_mask_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
struct spu_lscsa *lscsa = ctx->csa.lscsa;
return lscsa->event_mask.slot[0];
}
-static u64 spufs_event_mask_get(void *data)
-{
- struct spu_context *ctx = data;
- u64 ret;
- spu_acquire_saved(ctx);
- ret = __spufs_event_mask_get(data);
- spu_release(ctx);
- return ret;
-}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get,
- spufs_event_mask_set, "0x%llx\n")
+DEFINE_SPUFS_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get,
+ spufs_event_mask_set, "0x%llx\n",
+ SPU_ATTR_ACQUIRE_SAVED);
-static u64 __spufs_event_status_get(void *data)
+static u64 spufs_event_status_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
struct spu_state *state = &ctx->csa;
u64 stat;
stat = state->spu_chnlcnt_RW[0];
@@ -1518,97 +1987,69 @@ static u64 __spufs_event_status_get(void *data)
return state->spu_chnldata_RW[0];
return 0;
}
+DEFINE_SPUFS_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
+ NULL, "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED)
-static u64 spufs_event_status_get(void *data)
-{
- struct spu_context *ctx = data;
- u64 ret = 0;
-
- spu_acquire_saved(ctx);
- ret = __spufs_event_status_get(data);
- spu_release(ctx);
- return ret;
-}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
- NULL, "0x%llx\n")
-
-static void spufs_srr0_set(void *data, u64 val)
+static int spufs_srr0_set(void *data, u64 val)
{
struct spu_context *ctx = data;
struct spu_lscsa *lscsa = ctx->csa.lscsa;
- spu_acquire_saved(ctx);
+ int ret;
+
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
lscsa->srr0.slot[0] = (u32) val;
- spu_release(ctx);
+ spu_release_saved(ctx);
+
+ return 0;
}
-static u64 spufs_srr0_get(void *data)
+static u64 spufs_srr0_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
struct spu_lscsa *lscsa = ctx->csa.lscsa;
- u64 ret;
- spu_acquire_saved(ctx);
- ret = lscsa->srr0.slot[0];
- spu_release(ctx);
- return ret;
+ return lscsa->srr0.slot[0];
}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_srr0_ops, spufs_srr0_get, spufs_srr0_set,
- "0x%llx\n")
+DEFINE_SPUFS_ATTRIBUTE(spufs_srr0_ops, spufs_srr0_get, spufs_srr0_set,
+ "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED)
-static u64 spufs_id_get(void *data)
+static u64 spufs_id_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
u64 num;
- spu_acquire(ctx);
if (ctx->state == SPU_STATE_RUNNABLE)
num = ctx->spu->number;
else
num = (unsigned int)-1;
- spu_release(ctx);
return num;
}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_id_ops, spufs_id_get, NULL, "0x%llx\n")
+DEFINE_SPUFS_ATTRIBUTE(spufs_id_ops, spufs_id_get, NULL, "0x%llx\n",
+ SPU_ATTR_ACQUIRE)
-static u64 __spufs_object_id_get(void *data)
-{
- struct spu_context *ctx = data;
- return ctx->object_id;
-}
-
-static u64 spufs_object_id_get(void *data)
+static u64 spufs_object_id_get(struct spu_context *ctx)
{
/* FIXME: Should there really be no locking here? */
- return __spufs_object_id_get(data);
+ return ctx->object_id;
}
-static void spufs_object_id_set(void *data, u64 id)
+static int spufs_object_id_set(void *data, u64 id)
{
struct spu_context *ctx = data;
ctx->object_id = id;
+
+ return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_object_id_ops, spufs_object_id_get,
- spufs_object_id_set, "0x%llx\n");
+DEFINE_SPUFS_ATTRIBUTE(spufs_object_id_ops, spufs_object_id_get,
+ spufs_object_id_set, "0x%llx\n", SPU_ATTR_NOACQUIRE);
-static u64 __spufs_lslr_get(void *data)
+static u64 spufs_lslr_get(struct spu_context *ctx)
{
- struct spu_context *ctx = data;
return ctx->csa.priv2.spu_lslr_RW;
}
-
-static u64 spufs_lslr_get(void *data)
-{
- struct spu_context *ctx = data;
- u64 ret;
-
- spu_acquire_saved(ctx);
- ret = __spufs_lslr_get(data);
- spu_release(ctx);
-
- return ret;
-}
-DEFINE_SIMPLE_ATTRIBUTE(spufs_lslr_ops, spufs_lslr_get, NULL, "0x%llx\n")
+DEFINE_SPUFS_ATTRIBUTE(spufs_lslr_ops, spufs_lslr_get, NULL, "0x%llx\n",
+ SPU_ATTR_ACQUIRE_SAVED);
static int spufs_info_open(struct inode *inode, struct file *file)
{
@@ -1618,16 +2059,39 @@ static int spufs_info_open(struct inode *inode, struct file *file)
return 0;
}
+static int spufs_caps_show(struct seq_file *s, void *private)
+{
+ struct spu_context *ctx = s->private;
+
+ if (!(ctx->flags & SPU_CREATE_NOSCHED))
+ seq_puts(s, "sched\n");
+ if (!(ctx->flags & SPU_CREATE_ISOLATE))
+ seq_puts(s, "step\n");
+ return 0;
+}
+
+static int spufs_caps_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, spufs_caps_show, SPUFS_I(inode)->i_ctx);
+}
+
+static const struct file_operations spufs_caps_fops = {
+ .open = spufs_caps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
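[Editor's note: the new "capabilities" file lists what a context supports, one word per line. A context created without SPU_CREATE_NOSCHED or SPU_CREATE_ISOLATE reads back as:

	sched
	step

A NOSCHED context omits the "sched" line, and an isolated context omits "step".]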
static ssize_t __spufs_mbox_info_read(struct spu_context *ctx,
char __user *buf, size_t len, loff_t *pos)
{
- u32 mbox_stat;
u32 data;
- mbox_stat = ctx->csa.prob.mb_stat_R;
- if (mbox_stat & 0x0000ff) {
- data = ctx->csa.prob.pu_mb_R;
- }
+ /* EOF if there's no entry in the mbox */
+ if (!(ctx->csa.prob.mb_stat_R & 0x0000ff))
+ return 0;
+
+ data = ctx->csa.prob.pu_mb_R;
return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
}
@@ -1641,11 +2105,13 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
spin_lock(&ctx->csa.register_lock);
ret = __spufs_mbox_info_read(ctx, buf, len, pos);
spin_unlock(&ctx->csa.register_lock);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -1659,13 +2125,13 @@ static const struct file_operations spufs_mbox_info_fops = {
static ssize_t __spufs_ibox_info_read(struct spu_context *ctx,
char __user *buf, size_t len, loff_t *pos)
{
- u32 ibox_stat;
u32 data;
- ibox_stat = ctx->csa.prob.mb_stat_R;
- if (ibox_stat & 0xff0000) {
- data = ctx->csa.priv2.puint_mb_R;
- }
+ /* EOF if there's no entry in the ibox */
+ if (!(ctx->csa.prob.mb_stat_R & 0xff0000))
+ return 0;
+
+ data = ctx->csa.priv2.puint_mb_R;
return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
}
@@ -1679,11 +2145,13 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
spin_lock(&ctx->csa.register_lock);
ret = __spufs_ibox_info_read(ctx, buf, len, pos);
spin_unlock(&ctx->csa.register_lock);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -1720,11 +2188,13 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
spin_lock(&ctx->csa.register_lock);
ret = __spufs_wbox_info_read(ctx, buf, len, pos);
spin_unlock(&ctx->csa.register_lock);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -1770,11 +2240,13 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
spin_lock(&ctx->csa.register_lock);
ret = __spufs_dma_info_read(ctx, buf, len, pos);
spin_unlock(&ctx->csa.register_lock);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -1782,6 +2254,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
static const struct file_operations spufs_dma_info_fops = {
.open = spufs_info_open,
.read = spufs_dma_info_read,
+ .llseek = no_llseek,
};
static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx,
@@ -1821,11 +2294,13 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
struct spu_context *ctx = file->private_data;
int ret;
- spu_acquire_saved(ctx);
+ ret = spu_acquire_saved(ctx);
+ if (ret)
+ return ret;
spin_lock(&ctx->csa.register_lock);
ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
spin_unlock(&ctx->csa.register_lock);
- spu_release(ctx);
+ spu_release_saved(ctx);
return ret;
}
@@ -1833,23 +2308,391 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
static const struct file_operations spufs_proxydma_info_fops = {
.open = spufs_info_open,
.read = spufs_proxydma_info_read,
+ .llseek = no_llseek,
};
-struct tree_descr spufs_dir_contents[] = {
- { "mem", &spufs_mem_fops, 0666, },
- { "regs", &spufs_regs_fops, 0666, },
+static int spufs_show_tid(struct seq_file *s, void *private)
+{
+ struct spu_context *ctx = s->private;
+
+ seq_printf(s, "%d\n", ctx->tid);
+ return 0;
+}
+
+static int spufs_tid_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, spufs_show_tid, SPUFS_I(inode)->i_ctx);
+}
+
+static const struct file_operations spufs_tid_fops = {
+ .open = spufs_tid_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const char *ctx_state_names[] = {
+ "user", "system", "iowait", "loaded"
+};
+
+static unsigned long long spufs_acct_time(struct spu_context *ctx,
+ enum spu_utilization_state state)
+{
+ struct timespec ts;
+ unsigned long long time = ctx->stats.times[state];
+
+ /*
+ * In general, utilization statistics are updated by the controlling
+ * thread as the spu context moves through various well defined
+ * state transitions, but if the context is lazily loaded its
+ * utilization statistics are not updated as the controlling thread
+ * is not tightly coupled with the execution of the spu context. We
+ * calculate and apply the time delta from the last recorded state
+ * of the spu context.
+ */
+ if (ctx->spu && ctx->stats.util_state == state) {
+ ktime_get_ts(&ts);
+ time += timespec_to_ns(&ts) - ctx->stats.tstamp;
+ }
+
+ return time / NSEC_PER_MSEC;
+}
+
+static unsigned long long spufs_slb_flts(struct spu_context *ctx)
+{
+ unsigned long long slb_flts = ctx->stats.slb_flt;
+
+ if (ctx->state == SPU_STATE_RUNNABLE) {
+ slb_flts += (ctx->spu->stats.slb_flt -
+ ctx->stats.slb_flt_base);
+ }
+
+ return slb_flts;
+}
+
+static unsigned long long spufs_class2_intrs(struct spu_context *ctx)
+{
+ unsigned long long class2_intrs = ctx->stats.class2_intr;
+
+ if (ctx->state == SPU_STATE_RUNNABLE) {
+ class2_intrs += (ctx->spu->stats.class2_intr -
+ ctx->stats.class2_intr_base);
+ }
+
+ return class2_intrs;
+}
+
+static int spufs_show_stat(struct seq_file *s, void *private)
+{
+ struct spu_context *ctx = s->private;
+ int ret;
+
+ ret = spu_acquire(ctx);
+ if (ret)
+ return ret;
+
+ seq_printf(s, "%s %llu %llu %llu %llu "
+ "%llu %llu %llu %llu %llu %llu %llu %llu\n",
+ ctx_state_names[ctx->stats.util_state],
+ spufs_acct_time(ctx, SPU_UTIL_USER),
+ spufs_acct_time(ctx, SPU_UTIL_SYSTEM),
+ spufs_acct_time(ctx, SPU_UTIL_IOWAIT),
+ spufs_acct_time(ctx, SPU_UTIL_IDLE_LOADED),
+ ctx->stats.vol_ctx_switch,
+ ctx->stats.invol_ctx_switch,
+ spufs_slb_flts(ctx),
+ ctx->stats.hash_flt,
+ ctx->stats.min_flt,
+ ctx->stats.maj_flt,
+ spufs_class2_intrs(ctx),
+ ctx->stats.libassist);
+ spu_release(ctx);
+ return 0;
+}
+
+static int spufs_stat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, spufs_show_stat, SPUFS_I(inode)->i_ctx);
+}
+
+static const struct file_operations spufs_stat_fops = {
+ .open = spufs_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
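[Editor's note: a hypothetical userspace reader for the new per-context "stat" file, matching the field order printed by spufs_show_stat() above; the times are in milliseconds (see spufs_acct_time) and the mount point is illustrative:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	char state[16];
	uint64_t t[4], vol, invol, slb, hash, minflt, majflt, cl2, lib;
	FILE *f = fopen("/spu/myctx/stat", "r");	/* assumed path */

	if (!f)
		return 1;
	if (fscanf(f, "%15s %" SCNu64 " %" SCNu64 " %" SCNu64 " %" SCNu64
		   " %" SCNu64 " %" SCNu64 " %" SCNu64 " %" SCNu64
		   " %" SCNu64 " %" SCNu64 " %" SCNu64 " %" SCNu64,
		   state, &t[0], &t[1], &t[2], &t[3], &vol, &invol,
		   &slb, &hash, &minflt, &majflt, &cl2, &lib) == 13)
		printf("%s: %" PRIu64 " ms of user time\n", state, t[0]);
	fclose(f);
	return 0;
}
]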
+static inline int spufs_switch_log_used(struct spu_context *ctx)
+{
+ return (ctx->switch_log->head - ctx->switch_log->tail) %
+ SWITCH_LOG_BUFSIZE;
+}
+
+static inline int spufs_switch_log_avail(struct spu_context *ctx)
+{
+ return SWITCH_LOG_BUFSIZE - spufs_switch_log_used(ctx);
+}
+
+static int spufs_switch_log_open(struct inode *inode, struct file *file)
+{
+ struct spu_context *ctx = SPUFS_I(inode)->i_ctx;
+ int rc;
+
+ rc = spu_acquire(ctx);
+ if (rc)
+ return rc;
+
+ if (ctx->switch_log) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ ctx->switch_log = kmalloc(sizeof(struct switch_log) +
+ SWITCH_LOG_BUFSIZE * sizeof(struct switch_log_entry),
+ GFP_KERNEL);
+
+ if (!ctx->switch_log) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ ctx->switch_log->head = ctx->switch_log->tail = 0;
+ init_waitqueue_head(&ctx->switch_log->wait);
+ rc = 0;
+
+out:
+ spu_release(ctx);
+ return rc;
+}
+
+static int spufs_switch_log_release(struct inode *inode, struct file *file)
+{
+ struct spu_context *ctx = SPUFS_I(inode)->i_ctx;
+ int rc;
+
+ rc = spu_acquire(ctx);
+ if (rc)
+ return rc;
+
+ kfree(ctx->switch_log);
+ ctx->switch_log = NULL;
+ spu_release(ctx);
+
+ return 0;
+}
+
+static int switch_log_sprint(struct spu_context *ctx, char *tbuf, int n)
+{
+ struct switch_log_entry *p;
+
+ p = ctx->switch_log->log + ctx->switch_log->tail % SWITCH_LOG_BUFSIZE;
+
+ return snprintf(tbuf, n, "%u.%09u %d %u %u %llu\n",
+ (unsigned int) p->tstamp.tv_sec,
+ (unsigned int) p->tstamp.tv_nsec,
+ p->spu_id,
+ (unsigned int) p->type,
+ (unsigned int) p->val,
+ (unsigned long long) p->timebase);
+}
+
+static ssize_t spufs_switch_log_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ struct spu_context *ctx = SPUFS_I(inode)->i_ctx;
+ int error = 0, cnt = 0;
+
+ if (!buf)
+ return -EINVAL;
+
+ error = spu_acquire(ctx);
+ if (error)
+ return error;
+
+ while (cnt < len) {
+ char tbuf[128];
+ int width;
+
+ if (spufs_switch_log_used(ctx) == 0) {
+ if (cnt > 0) {
+ /* If there's data ready to go, we can
+ * just return straight away */
+ break;
+
+ } else if (file->f_flags & O_NONBLOCK) {
+ error = -EAGAIN;
+ break;
+
+ } else {
+ /* spufs_wait will drop the mutex and
+ * re-acquire, but since we're in read(), the
+ * file cannot be _released (and so
+ * ctx->switch_log is stable).
+ */
+ error = spufs_wait(ctx->switch_log->wait,
+ spufs_switch_log_used(ctx) > 0);
+
+ /* On error, spufs_wait returns without the
+ * state mutex held */
+ if (error)
+ return error;
+
+ /* We may have had entries read from underneath
+ * us while we dropped the mutex in spufs_wait,
+ * so re-check */
+ if (spufs_switch_log_used(ctx) == 0)
+ continue;
+ }
+ }
+
+ width = switch_log_sprint(ctx, tbuf, sizeof(tbuf));
+ if (width < len)
+ ctx->switch_log->tail =
+ (ctx->switch_log->tail + 1) %
+ SWITCH_LOG_BUFSIZE;
+ else
+ /* If the record is larger than the space available,
+ * return the partial buffer read so far */
+ break;
+
+ error = copy_to_user(buf + cnt, tbuf, width);
+ if (error)
+ break;
+ cnt += width;
+ }
+
+ spu_release(ctx);
+
+ return cnt == 0 ? error : cnt;
+}
+
+static unsigned int spufs_switch_log_poll(struct file *file, poll_table *wait)
+{
+ struct inode *inode = file_inode(file);
+ struct spu_context *ctx = SPUFS_I(inode)->i_ctx;
+ unsigned int mask = 0;
+ int rc;
+
+ poll_wait(file, &ctx->switch_log->wait, wait);
+
+ rc = spu_acquire(ctx);
+ if (rc)
+ return rc;
+
+ if (spufs_switch_log_used(ctx) > 0)
+ mask |= POLLIN;
+
+ spu_release(ctx);
+
+ return mask;
+}
+
+static const struct file_operations spufs_switch_log_fops = {
+ .open = spufs_switch_log_open,
+ .read = spufs_switch_log_read,
+ .poll = spufs_switch_log_poll,
+ .release = spufs_switch_log_release,
+ .llseek = no_llseek,
+};
+
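[Editor's note: a hypothetical reader for the new "switch_log" file; each record is one line in the format produced by switch_log_sprint() above. Reads block until an entry is logged unless the file is opened O_NONBLOCK; the path is illustrative:

#include <stdio.h>

int main(void)
{
	unsigned int sec, nsec, type, val;
	int spu_id;
	unsigned long long timebase;
	FILE *f = fopen("/spu/myctx/switch_log", "r");	/* assumed path */

	if (!f)
		return 1;
	while (fscanf(f, "%u.%u %d %u %u %llu", &sec, &nsec, &spu_id,
		      &type, &val, &timebase) == 6)
		printf("spu %d: event %u/%u at %u.%09u (tb %llu)\n",
		       spu_id, type, val, sec, nsec, timebase);
	fclose(f);
	return 0;
}
]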
+/**
+ * Log a context switch event to a switch log reader.
+ *
+ * Must be called with ctx->state_mutex held.
+ */
+void spu_switch_log_notify(struct spu *spu, struct spu_context *ctx,
+ u32 type, u32 val)
+{
+ if (!ctx->switch_log)
+ return;
+
+ if (spufs_switch_log_avail(ctx) > 1) {
+ struct switch_log_entry *p;
+
+ p = ctx->switch_log->log + ctx->switch_log->head;
+ ktime_get_ts(&p->tstamp);
+ p->timebase = get_tb();
+ p->spu_id = spu ? spu->number : -1;
+ p->type = type;
+ p->val = val;
+
+ ctx->switch_log->head =
+ (ctx->switch_log->head + 1) % SWITCH_LOG_BUFSIZE;
+ }
+
+ wake_up(&ctx->switch_log->wait);
+}
+
+static int spufs_show_ctx(struct seq_file *s, void *private)
+{
+ struct spu_context *ctx = s->private;
+ u64 mfc_control_RW;
+
+ mutex_lock(&ctx->state_mutex);
+ if (ctx->spu) {
+ struct spu *spu = ctx->spu;
+ struct spu_priv2 __iomem *priv2 = spu->priv2;
+
+ spin_lock_irq(&spu->register_lock);
+ mfc_control_RW = in_be64(&priv2->mfc_control_RW);
+ spin_unlock_irq(&spu->register_lock);
+ } else {
+ struct spu_state *csa = &ctx->csa;
+
+ mfc_control_RW = csa->priv2.mfc_control_RW;
+ }
+
+ seq_printf(s, "%c flgs(%lx) sflgs(%lx) pri(%d) ts(%d) spu(%02d)"
+ " %c %llx %llx %llx %llx %x %x\n",
+ ctx->state == SPU_STATE_SAVED ? 'S' : 'R',
+ ctx->flags,
+ ctx->sched_flags,
+ ctx->prio,
+ ctx->time_slice,
+ ctx->spu ? ctx->spu->number : -1,
+ !list_empty(&ctx->rq) ? 'q' : ' ',
+ ctx->csa.class_0_pending,
+ ctx->csa.class_0_dar,
+ ctx->csa.class_1_dsisr,
+ mfc_control_RW,
+ ctx->ops->runcntl_read(ctx),
+ ctx->ops->status_read(ctx));
+
+ mutex_unlock(&ctx->state_mutex);
+
+ return 0;
+}
+
+static int spufs_ctx_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, spufs_show_ctx, SPUFS_I(inode)->i_ctx);
+}
+
+static const struct file_operations spufs_ctx_fops = {
+ .open = spufs_ctx_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
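[Editor's note: the directory tables below now carry a per-file size, which spufs_new_file() in inode.c uses to set i_size and the coredump code uses to size its notes. The extended descriptor is presumably declared in spufs.h along these lines (sketch; exact types may differ):

struct spufs_tree_descr {
	const char *name;
	const struct file_operations *ops;
	umode_t mode;
	size_t size;		/* 0: no meaningful fixed size */
};
]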
+const struct spufs_tree_descr spufs_dir_contents[] = {
+ { "capabilities", &spufs_caps_fops, 0444, },
+ { "mem", &spufs_mem_fops, 0666, LS_SIZE, },
+ { "regs", &spufs_regs_fops, 0666, sizeof(struct spu_reg128[128]), },
{ "mbox", &spufs_mbox_fops, 0444, },
{ "ibox", &spufs_ibox_fops, 0444, },
{ "wbox", &spufs_wbox_fops, 0222, },
- { "mbox_stat", &spufs_mbox_stat_fops, 0444, },
- { "ibox_stat", &spufs_ibox_stat_fops, 0444, },
- { "wbox_stat", &spufs_wbox_stat_fops, 0444, },
+ { "mbox_stat", &spufs_mbox_stat_fops, 0444, sizeof(u32), },
+ { "ibox_stat", &spufs_ibox_stat_fops, 0444, sizeof(u32), },
+ { "wbox_stat", &spufs_wbox_stat_fops, 0444, sizeof(u32), },
{ "signal1", &spufs_signal1_fops, 0666, },
{ "signal2", &spufs_signal2_fops, 0666, },
{ "signal1_type", &spufs_signal1_type, 0666, },
{ "signal2_type", &spufs_signal2_type, 0666, },
{ "cntl", &spufs_cntl_fops, 0666, },
- { "fpcr", &spufs_fpcr_fops, 0666, },
+ { "fpcr", &spufs_fpcr_fops, 0666, sizeof(struct spu_reg128), },
{ "lslr", &spufs_lslr_ops, 0444, },
{ "mfc", &spufs_mfc_fops, 0666, },
{ "mss", &spufs_mss_fops, 0666, },
@@ -1859,59 +2702,72 @@ struct tree_descr spufs_dir_contents[] = {
{ "decr_status", &spufs_decr_status_ops, 0666, },
{ "event_mask", &spufs_event_mask_ops, 0666, },
{ "event_status", &spufs_event_status_ops, 0444, },
- { "psmap", &spufs_psmap_fops, 0666, },
+ { "psmap", &spufs_psmap_fops, 0666, SPUFS_PS_MAP_SIZE, },
{ "phys-id", &spufs_id_ops, 0666, },
{ "object-id", &spufs_object_id_ops, 0666, },
- { "mbox_info", &spufs_mbox_info_fops, 0444, },
- { "ibox_info", &spufs_ibox_info_fops, 0444, },
- { "wbox_info", &spufs_wbox_info_fops, 0444, },
- { "dma_info", &spufs_dma_info_fops, 0444, },
- { "proxydma_info", &spufs_proxydma_info_fops, 0444, },
+ { "mbox_info", &spufs_mbox_info_fops, 0444, sizeof(u32), },
+ { "ibox_info", &spufs_ibox_info_fops, 0444, sizeof(u32), },
+ { "wbox_info", &spufs_wbox_info_fops, 0444, sizeof(u32), },
+ { "dma_info", &spufs_dma_info_fops, 0444,
+ sizeof(struct spu_dma_info), },
+ { "proxydma_info", &spufs_proxydma_info_fops, 0444,
+ sizeof(struct spu_proxydma_info)},
+ { "tid", &spufs_tid_fops, 0444, },
+ { "stat", &spufs_stat_fops, 0444, },
+ { "switch_log", &spufs_switch_log_fops, 0444 },
{},
};
-struct tree_descr spufs_dir_nosched_contents[] = {
- { "mem", &spufs_mem_fops, 0666, },
+const struct spufs_tree_descr spufs_dir_nosched_contents[] = {
+ { "capabilities", &spufs_caps_fops, 0444, },
+ { "mem", &spufs_mem_fops, 0666, LS_SIZE, },
{ "mbox", &spufs_mbox_fops, 0444, },
{ "ibox", &spufs_ibox_fops, 0444, },
{ "wbox", &spufs_wbox_fops, 0222, },
- { "mbox_stat", &spufs_mbox_stat_fops, 0444, },
- { "ibox_stat", &spufs_ibox_stat_fops, 0444, },
- { "wbox_stat", &spufs_wbox_stat_fops, 0444, },
- { "signal1", &spufs_signal1_fops, 0666, },
- { "signal2", &spufs_signal2_fops, 0666, },
+ { "mbox_stat", &spufs_mbox_stat_fops, 0444, sizeof(u32), },
+ { "ibox_stat", &spufs_ibox_stat_fops, 0444, sizeof(u32), },
+ { "wbox_stat", &spufs_wbox_stat_fops, 0444, sizeof(u32), },
+ { "signal1", &spufs_signal1_nosched_fops, 0222, },
+ { "signal2", &spufs_signal2_nosched_fops, 0222, },
{ "signal1_type", &spufs_signal1_type, 0666, },
{ "signal2_type", &spufs_signal2_type, 0666, },
{ "mss", &spufs_mss_fops, 0666, },
{ "mfc", &spufs_mfc_fops, 0666, },
{ "cntl", &spufs_cntl_fops, 0666, },
{ "npc", &spufs_npc_ops, 0666, },
- { "psmap", &spufs_psmap_fops, 0666, },
+ { "psmap", &spufs_psmap_fops, 0666, SPUFS_PS_MAP_SIZE, },
{ "phys-id", &spufs_id_ops, 0666, },
{ "object-id", &spufs_object_id_ops, 0666, },
+ { "tid", &spufs_tid_fops, 0444, },
+ { "stat", &spufs_stat_fops, 0444, },
{},
};
-struct spufs_coredump_reader spufs_coredump_read[] = {
- { "regs", __spufs_regs_read, NULL, 128 * 16 },
- { "fpcr", __spufs_fpcr_read, NULL, 16 },
- { "lslr", NULL, __spufs_lslr_get, 11 },
- { "decr", NULL, __spufs_decr_get, 11 },
- { "decr_status", NULL, __spufs_decr_status_get, 11 },
- { "mem", __spufs_mem_read, NULL, 256 * 1024, },
- { "signal1", __spufs_signal1_read, NULL, 4 },
- { "signal1_type", NULL, __spufs_signal1_type_get, 2 },
- { "signal2", __spufs_signal2_read, NULL, 4 },
- { "signal2_type", NULL, __spufs_signal2_type_get, 2 },
- { "event_mask", NULL, __spufs_event_mask_get, 8 },
- { "event_status", NULL, __spufs_event_status_get, 8 },
- { "mbox_info", __spufs_mbox_info_read, NULL, 4 },
- { "ibox_info", __spufs_ibox_info_read, NULL, 4 },
- { "wbox_info", __spufs_wbox_info_read, NULL, 16 },
- { "dma_info", __spufs_dma_info_read, NULL, 69 * 8 },
- { "proxydma_info", __spufs_proxydma_info_read, NULL, 35 * 8 },
- { "object-id", NULL, __spufs_object_id_get, 19 },
- { },
+const struct spufs_tree_descr spufs_dir_debug_contents[] = {
+ { ".ctx", &spufs_ctx_fops, 0444, },
+ {},
};
-int spufs_coredump_num_notes = ARRAY_SIZE(spufs_coredump_read) - 1;
+const struct spufs_coredump_reader spufs_coredump_read[] = {
+ { "regs", __spufs_regs_read, NULL, sizeof(struct spu_reg128[128])},
+ { "fpcr", __spufs_fpcr_read, NULL, sizeof(struct spu_reg128) },
+ { "lslr", NULL, spufs_lslr_get, 19 },
+ { "decr", NULL, spufs_decr_get, 19 },
+ { "decr_status", NULL, spufs_decr_status_get, 19 },
+ { "mem", __spufs_mem_read, NULL, LS_SIZE, },
+ { "signal1", __spufs_signal1_read, NULL, sizeof(u32) },
+ { "signal1_type", NULL, spufs_signal1_type_get, 19 },
+ { "signal2", __spufs_signal2_read, NULL, sizeof(u32) },
+ { "signal2_type", NULL, spufs_signal2_type_get, 19 },
+ { "event_mask", NULL, spufs_event_mask_get, 19 },
+ { "event_status", NULL, spufs_event_status_get, 19 },
+ { "mbox_info", __spufs_mbox_info_read, NULL, sizeof(u32) },
+ { "ibox_info", __spufs_ibox_info_read, NULL, sizeof(u32) },
+ { "wbox_info", __spufs_wbox_info_read, NULL, 4 * sizeof(u32)},
+ { "dma_info", __spufs_dma_info_read, NULL, sizeof(struct spu_dma_info)},
+ { "proxydma_info", __spufs_proxydma_info_read,
+ NULL, sizeof(struct spu_proxydma_info)},
+ { "object-id", NULL, spufs_object_id_get, 19 },
+ { "npc", NULL, spufs_npc_get, 19 },
+ { NULL },
+};
diff --git a/arch/powerpc/platforms/cell/spufs/gang.c b/arch/powerpc/platforms/cell/spufs/gang.c
index 212ea78f905..71a44325302 100644
--- a/arch/powerpc/platforms/cell/spufs/gang.c
+++ b/arch/powerpc/platforms/cell/spufs/gang.c
@@ -35,7 +35,9 @@ struct spu_gang *alloc_spu_gang(void)
kref_init(&gang->kref);
mutex_init(&gang->mutex);
+ mutex_init(&gang->aff_mutex);
INIT_LIST_HEAD(&gang->list);
+ INIT_LIST_HEAD(&gang->aff_list_head);
out:
return gang;
@@ -73,6 +75,10 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx)
{
mutex_lock(&gang->mutex);
WARN_ON(ctx->gang != gang);
+ if (!list_empty(&ctx->aff_list)) {
+ list_del_init(&ctx->aff_list);
+ gang->aff_flags &= ~AFF_OFFSETS_SET;
+ }
list_del_init(&ctx->gang_list);
gang->contexts--;
mutex_unlock(&gang->mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/hw_ops.c b/arch/powerpc/platforms/cell/spufs/hw_ops.c
index ae42e03b8c8..8655c4cbefc 100644
--- a/arch/powerpc/platforms/cell/spufs/hw_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/hw_ops.c
@@ -18,14 +18,12 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
@@ -77,16 +75,18 @@ static unsigned int spu_hw_mbox_stat_poll(struct spu_context *ctx,
if (stat & 0xff0000)
ret |= POLLIN | POLLRDNORM;
else {
- spu_int_stat_clear(spu, 2, 0x1);
- spu_int_mask_or(spu, 2, 0x1);
+ spu_int_stat_clear(spu, 2, CLASS2_MAILBOX_INTR);
+ spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
}
}
if (events & (POLLOUT | POLLWRNORM)) {
if (stat & 0x00ff00)
ret = POLLOUT | POLLWRNORM;
else {
- spu_int_stat_clear(spu, 2, 0x10);
- spu_int_mask_or(spu, 2, 0x10);
+ spu_int_stat_clear(spu, 2,
+ CLASS2_MAILBOX_THRESHOLD_INTR);
+ spu_int_mask_or(spu, 2,
+ CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR);
}
}
spin_unlock_irq(&spu->register_lock);
@@ -107,7 +107,7 @@ static int spu_hw_ibox_read(struct spu_context *ctx, u32 * data)
ret = 4;
} else {
/* make sure we get woken up by the interrupt */
- spu_int_mask_or(spu, 2, 0x1);
+ spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
ret = 0;
}
spin_unlock_irq(&spu->register_lock);
@@ -128,7 +128,7 @@ static int spu_hw_wbox_write(struct spu_context *ctx, u32 data)
} else {
/* make sure we get woken up by the interrupt when space
becomes available */
- spu_int_mask_or(spu, 2, 0x10);
+ spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR);
ret = 0;
}
spin_unlock_irq(&spu->register_lock);
@@ -207,6 +207,11 @@ static char *spu_hw_get_ls(struct spu_context *ctx)
return ctx->spu->local_store;
}
+static void spu_hw_privcntl_write(struct spu_context *ctx, u64 val)
+{
+ out_be64(&ctx->spu->priv2->spu_privcntl_RW, val);
+}
+
static u32 spu_hw_runcntl_read(struct spu_context *ctx)
{
return in_be32(&ctx->spu->problem->spu_runcntl_RW);
@@ -216,11 +221,21 @@ static void spu_hw_runcntl_write(struct spu_context *ctx, u32 val)
{
spin_lock_irq(&ctx->spu->register_lock);
if (val & SPU_RUNCNTL_ISOLATE)
- out_be64(&ctx->spu->priv2->spu_privcntl_RW, 4LL);
+ spu_hw_privcntl_write(ctx,
+ SPU_PRIVCNT_LOAD_REQUEST_ENABLE_MASK);
out_be32(&ctx->spu->problem->spu_runcntl_RW, val);
spin_unlock_irq(&ctx->spu->register_lock);
}
+static void spu_hw_runcntl_stop(struct spu_context *ctx)
+{
+ spin_lock_irq(&ctx->spu->register_lock);
+ out_be32(&ctx->spu->problem->spu_runcntl_RW, SPU_RUNCNTL_STOP);
+ while (in_be32(&ctx->spu->problem->spu_status_R) & SPU_STATUS_RUNNING)
+ cpu_relax();
+ spin_unlock_irq(&ctx->spu->register_lock);
+}
+
static void spu_hw_master_start(struct spu_context *ctx)
{
struct spu *spu = ctx->spu;
@@ -296,6 +311,14 @@ static int spu_hw_send_mfc_command(struct spu_context *ctx,
}
}
+static void spu_hw_restart_dma(struct spu_context *ctx)
+{
+ struct spu_priv2 __iomem *priv2 = ctx->spu->priv2;
+
+ if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &ctx->spu->flags))
+ out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
+}
+
struct spu_context_ops spu_hw_ops = {
.mbox_read = spu_hw_mbox_read,
.mbox_stat_read = spu_hw_mbox_stat_read,
@@ -312,12 +335,15 @@ struct spu_context_ops spu_hw_ops = {
.npc_write = spu_hw_npc_write,
.status_read = spu_hw_status_read,
.get_ls = spu_hw_get_ls,
+ .privcntl_write = spu_hw_privcntl_write,
.runcntl_read = spu_hw_runcntl_read,
.runcntl_write = spu_hw_runcntl_write,
+ .runcntl_stop = spu_hw_runcntl_stop,
.master_start = spu_hw_master_start,
.master_stop = spu_hw_master_stop,
.set_mfc_query = spu_hw_set_mfc_query,
.read_mfc_tagstatus = spu_hw_read_mfc_tagstatus,
.get_mfc_free_elements = spu_hw_get_mfc_free_elements,
.send_mfc_command = spu_hw_send_mfc_command,
+ .restart_dma = spu_hw_restart_dma,
};
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 8079983ef94..87ba7cf99cd 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -1,3 +1,4 @@
+
/*
* SPU file system
*
@@ -22,6 +23,7 @@
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fsnotify.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/ioctl.h>
@@ -34,14 +36,24 @@
#include <linux/parser.h>
#include <asm/prom.h>
-#include <asm/semaphore.h>
#include <asm/spu.h>
+#include <asm/spu_priv1.h>
#include <asm/uaccess.h>
#include "spufs.h"
+struct spufs_sb_info {
+ int debug;
+};
+
static struct kmem_cache *spufs_inode_cache;
char *isolated_loader;
+static int isolated_loader_size;
+
+static struct spufs_sb_info *spufs_get_sb_info(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
static struct inode *
spufs_alloc_inode(struct super_block *sb)
@@ -54,29 +66,32 @@ spufs_alloc_inode(struct super_block *sb)
ei->i_gang = NULL;
ei->i_ctx = NULL;
+ ei->i_openers = 0;
return &ei->vfs_inode;
}
-static void
-spufs_destroy_inode(struct inode *inode)
+static void spufs_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(spufs_inode_cache, SPUFS_I(inode));
}
+static void spufs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, spufs_i_callback);
+}
+
static void
-spufs_init_once(void *p, struct kmem_cache * cachep, unsigned long flags)
+spufs_init_once(void *p)
{
struct spufs_inode_info *ei = p;
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- inode_init_once(&ei->vfs_inode);
- }
+ inode_init_once(&ei->vfs_inode);
}
static struct inode *
-spufs_new_inode(struct super_block *sb, int mode)
+spufs_new_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode;
@@ -84,10 +99,10 @@ spufs_new_inode(struct super_block *sb, int mode)
if (!inode)
goto out;
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
- inode->i_blocks = 0;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
out:
return inode;
@@ -101,16 +116,18 @@ spufs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
(attr->ia_size != inode->i_size))
return -EINVAL;
- return inode_setattr(inode, attr);
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
}
static int
spufs_new_file(struct super_block *sb, struct dentry *dentry,
- const struct file_operations *fops, int mode,
- struct spu_context *ctx)
+ const struct file_operations *fops, umode_t mode,
+ size_t size, struct spu_context *ctx)
{
- static struct inode_operations spufs_file_iops = {
+ static const struct inode_operations spufs_file_iops = {
.setattr = spufs_setattr,
};
struct inode *inode;
@@ -124,6 +141,7 @@ spufs_new_file(struct super_block *sb, struct dentry *dentry,
ret = 0;
inode->i_op = &spufs_file_iops;
inode->i_fop = fops;
+ inode->i_size = size;
inode->i_private = SPUFS_I(inode)->i_ctx = get_spu_context(ctx);
d_add(dentry, inode);
out:
@@ -131,15 +149,14 @@ out:
}
static void
-spufs_delete_inode(struct inode *inode)
+spufs_evict_inode(struct inode *inode)
{
struct spufs_inode_info *ei = SPUFS_I(inode);
-
+ clear_inode(inode);
if (ei->i_ctx)
put_spu_context(ei->i_ctx);
if (ei->i_gang)
put_spu_gang(ei->i_gang);
- clear_inode(inode);
}
static void spufs_prune_dir(struct dentry *dir)
@@ -148,18 +165,18 @@ static void spufs_prune_dir(struct dentry *dir)
mutex_lock(&dir->d_inode->i_mutex);
list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_u.d_child) {
- spin_lock(&dcache_lock);
spin_lock(&dentry->d_lock);
if (!(d_unhashed(dentry)) && dentry->d_inode) {
- dget_locked(dentry);
+ dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
simple_unlink(dir->d_inode, dentry);
- spin_unlock(&dcache_lock);
+ /* XXX: what was dcache_lock protecting here? Other
+ * filesystems (IB, configfs) release dcache_lock
+ * before unlink */
dput(dentry);
} else {
spin_unlock(&dentry->d_lock);
- spin_unlock(&dcache_lock);
}
}
shrink_dcache_parent(dir);
@@ -170,32 +187,31 @@ static void spufs_prune_dir(struct dentry *dir)
static int spufs_rmdir(struct inode *parent, struct dentry *dir)
{
/* remove all entries */
+ int res;
spufs_prune_dir(dir);
-
- return simple_rmdir(parent, dir);
+ d_drop(dir);
+ res = simple_rmdir(parent, dir);
+ /* We have to give up the mm_struct */
+ spu_forget(SPUFS_I(dir->d_inode)->i_ctx);
+ return res;
}
-static int spufs_fill_dir(struct dentry *dir, struct tree_descr *files,
- int mode, struct spu_context *ctx)
+static int spufs_fill_dir(struct dentry *dir,
+ const struct spufs_tree_descr *files, umode_t mode,
+ struct spu_context *ctx)
{
- struct dentry *dentry;
- int ret;
-
while (files->name && files->name[0]) {
- ret = -ENOMEM;
- dentry = d_alloc_name(dir, files->name);
+ int ret;
+ struct dentry *dentry = d_alloc_name(dir, files->name);
if (!dentry)
- goto out;
+ return -ENOMEM;
ret = spufs_new_file(dir->d_sb, dentry, files->ops,
- files->mode & mode, ctx);
+ files->mode & mode, files->size, ctx);
if (ret)
- goto out;
+ return ret;
files++;
}
return 0;
-out:
- spufs_prune_dir(dir);
- return ret;
}
static int spufs_dir_close(struct inode *inode, struct file *file)
@@ -209,43 +225,35 @@ static int spufs_dir_close(struct inode *inode, struct file *file)
parent = dir->d_parent->d_inode;
ctx = SPUFS_I(dir->d_inode)->i_ctx;
- mutex_lock(&parent->i_mutex);
+ mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
ret = spufs_rmdir(parent, dir);
mutex_unlock(&parent->i_mutex);
WARN_ON(ret);
- /* We have to give up the mm_struct */
- spu_forget(ctx);
-
return dcache_dir_close(inode, file);
}
-const struct inode_operations spufs_dir_inode_operations = {
- .lookup = simple_lookup,
-};
-
const struct file_operations spufs_context_fops = {
.open = dcache_dir_open,
.release = spufs_dir_close,
.llseek = dcache_dir_lseek,
.read = generic_read_dir,
- .readdir = dcache_readdir,
- .fsync = simple_sync_file,
+ .iterate = dcache_readdir,
+ .fsync = noop_fsync,
};
EXPORT_SYMBOL_GPL(spufs_context_fops);
static int
spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
- int mode)
+ umode_t mode)
{
int ret;
struct inode *inode;
struct spu_context *ctx;
- ret = -ENOSPC;
inode = spufs_new_inode(dir->i_sb, mode | S_IFDIR);
if (!inode)
- goto out;
+ return -ENOSPC;
if (dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
@@ -253,136 +261,223 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
}
ctx = alloc_spu_context(SPUFS_I(dir)->i_gang); /* XXX gang */
SPUFS_I(inode)->i_ctx = ctx;
- if (!ctx)
- goto out_iput;
+ if (!ctx) {
+ iput(inode);
+ return -ENOSPC;
+ }
ctx->flags = flags;
- inode->i_op = &spufs_dir_inode_operations;
+ inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
+
+ mutex_lock(&inode->i_mutex);
+
+ dget(dentry);
+ inc_nlink(dir);
+ inc_nlink(inode);
+
+ d_instantiate(dentry, inode);
+
if (flags & SPU_CREATE_NOSCHED)
ret = spufs_fill_dir(dentry, spufs_dir_nosched_contents,
mode, ctx);
else
ret = spufs_fill_dir(dentry, spufs_dir_contents, mode, ctx);
+ if (!ret && spufs_get_sb_info(dir->i_sb)->debug)
+ ret = spufs_fill_dir(dentry, spufs_dir_debug_contents,
+ mode, ctx);
+
if (ret)
- goto out_free_ctx;
+ spufs_rmdir(dir, dentry);
- d_instantiate(dentry, inode);
- dget(dentry);
- dir->i_nlink++;
- dentry->d_inode->i_nlink++;
- goto out;
+ mutex_unlock(&inode->i_mutex);
-out_free_ctx:
- put_spu_context(ctx);
-out_iput:
- iput(inode);
-out:
return ret;
}
-static int spufs_context_open(struct dentry *dentry, struct vfsmount *mnt)
+static int spufs_context_open(struct path *path)
{
int ret;
struct file *filp;
ret = get_unused_fd();
- if (ret < 0) {
- dput(dentry);
- mntput(mnt);
- goto out;
- }
+ if (ret < 0)
+ return ret;
- filp = dentry_open(dentry, mnt, O_RDONLY);
+ filp = dentry_open(path, O_RDONLY, current_cred());
if (IS_ERR(filp)) {
put_unused_fd(ret);
- ret = PTR_ERR(filp);
- goto out;
+ return PTR_ERR(filp);
}
filp->f_op = &spufs_context_fops;
fd_install(ret, filp);
-out:
return ret;
}
-static int spufs_create_context(struct inode *inode,
- struct dentry *dentry,
- struct vfsmount *mnt, int flags, int mode)
+static struct spu_context *
+spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
+ struct file *filp)
{
- int ret;
+ struct spu_context *tmp, *neighbor, *err;
+ int count, node;
+ int aff_supp;
- ret = -EPERM;
- if ((flags & SPU_CREATE_NOSCHED) &&
- !capable(CAP_SYS_NICE))
- goto out_unlock;
+ aff_supp = !list_empty(&(list_entry(cbe_spu_info[0].spus.next,
+ struct spu, cbe_list))->aff_list);
- ret = -EINVAL;
- if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE))
- == SPU_CREATE_ISOLATE)
- goto out_unlock;
+ if (!aff_supp)
+ return ERR_PTR(-EINVAL);
- ret = -ENODEV;
- if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
- goto out_unlock;
+ if (flags & SPU_CREATE_GANG)
+ return ERR_PTR(-EINVAL);
+
+ if (flags & SPU_CREATE_AFFINITY_MEM &&
+ gang->aff_ref_ctx &&
+ gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM)
+ return ERR_PTR(-EEXIST);
+
+ if (gang->aff_flags & AFF_MERGED)
+ return ERR_PTR(-EBUSY);
+
+ neighbor = NULL;
+ if (flags & SPU_CREATE_AFFINITY_SPU) {
+ if (!filp || filp->f_op != &spufs_context_fops)
+ return ERR_PTR(-EINVAL);
+
+ neighbor = get_spu_context(
+ SPUFS_I(file_inode(filp))->i_ctx);
+
+ if (!list_empty(&neighbor->aff_list) && !(neighbor->aff_head) &&
+ !list_is_last(&neighbor->aff_list, &gang->aff_list_head) &&
+ !list_entry(neighbor->aff_list.next, struct spu_context,
+ aff_list)->aff_head) {
+ err = ERR_PTR(-EEXIST);
+ goto out_put_neighbor;
+ }
- ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO);
- if (ret)
- goto out_unlock;
+ if (gang != neighbor->gang) {
+ err = ERR_PTR(-EINVAL);
+ goto out_put_neighbor;
+ }
- /*
- * get references for dget and mntget, will be released
- * in error path of *_open().
- */
- ret = spufs_context_open(dget(dentry), mntget(mnt));
- if (ret < 0) {
- WARN_ON(spufs_rmdir(inode, dentry));
- mutex_unlock(&inode->i_mutex);
- spu_forget(SPUFS_I(dentry->d_inode)->i_ctx);
- goto out;
+ count = 1;
+ list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+ count++;
+ if (list_empty(&neighbor->aff_list))
+ count++;
+
+ for (node = 0; node < MAX_NUMNODES; node++) {
+ if ((cbe_spu_info[node].n_spus - atomic_read(
+ &cbe_spu_info[node].reserved_spus)) >= count)
+ break;
+ }
+
+ if (node == MAX_NUMNODES) {
+ err = ERR_PTR(-EEXIST);
+ goto out_put_neighbor;
+ }
}
-out_unlock:
- mutex_unlock(&inode->i_mutex);
-out:
- dput(dentry);
- return ret;
+ return neighbor;
+
+out_put_neighbor:
+ put_spu_context(neighbor);
+ return err;
}
-static int spufs_rmgang(struct inode *root, struct dentry *dir)
+static void
+spufs_set_affinity(unsigned int flags, struct spu_context *ctx,
+ struct spu_context *neighbor)
{
- /* FIXME: this fails if the dir is not empty,
- which causes a leak of gangs. */
- return simple_rmdir(root, dir);
+ if (flags & SPU_CREATE_AFFINITY_MEM)
+ ctx->gang->aff_ref_ctx = ctx;
+
+ if (flags & SPU_CREATE_AFFINITY_SPU) {
+ if (list_empty(&neighbor->aff_list)) {
+ list_add_tail(&neighbor->aff_list,
+ &ctx->gang->aff_list_head);
+ neighbor->aff_head = 1;
+ }
+
+ if (list_is_last(&neighbor->aff_list, &ctx->gang->aff_list_head)
+ || list_entry(neighbor->aff_list.next, struct spu_context,
+ aff_list)->aff_head) {
+ list_add(&ctx->aff_list, &neighbor->aff_list);
+ } else {
+ list_add_tail(&ctx->aff_list, &neighbor->aff_list);
+ if (neighbor->aff_head) {
+ neighbor->aff_head = 0;
+ ctx->aff_head = 1;
+ }
+ }
+
+ if (!ctx->gang->aff_ref_ctx)
+ ctx->gang->aff_ref_ctx = ctx;
+ }
}
-static int spufs_gang_close(struct inode *inode, struct file *file)
+static int
+spufs_create_context(struct inode *inode, struct dentry *dentry,
+ struct vfsmount *mnt, int flags, umode_t mode,
+ struct file *aff_filp)
{
- struct inode *parent;
- struct dentry *dir;
int ret;
+ int affinity;
+ struct spu_gang *gang;
+ struct spu_context *neighbor;
+ struct path path = {.mnt = mnt, .dentry = dentry};
- dir = file->f_path.dentry;
- parent = dir->d_parent->d_inode;
+ if ((flags & SPU_CREATE_NOSCHED) &&
+ !capable(CAP_SYS_NICE))
+ return -EPERM;
- ret = spufs_rmgang(parent, dir);
- WARN_ON(ret);
+ if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE))
+ == SPU_CREATE_ISOLATE)
+ return -EINVAL;
- return dcache_dir_close(inode, file);
-}
+ if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
+ return -ENODEV;
+
+ gang = NULL;
+ neighbor = NULL;
+ affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU);
+ if (affinity) {
+ gang = SPUFS_I(inode)->i_gang;
+ if (!gang)
+ return -EINVAL;
+ mutex_lock(&gang->aff_mutex);
+ neighbor = spufs_assert_affinity(flags, gang, aff_filp);
+ if (IS_ERR(neighbor)) {
+ ret = PTR_ERR(neighbor);
+ goto out_aff_unlock;
+ }
+ }
-const struct file_operations spufs_gang_fops = {
- .open = dcache_dir_open,
- .release = spufs_gang_close,
- .llseek = dcache_dir_lseek,
- .read = generic_read_dir,
- .readdir = dcache_readdir,
- .fsync = simple_sync_file,
-};
+ ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO);
+ if (ret)
+ goto out_aff_unlock;
+
+ if (affinity) {
+ spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx,
+ neighbor);
+ if (neighbor)
+ put_spu_context(neighbor);
+ }
+
+ ret = spufs_context_open(&path);
+ if (ret < 0)
+ WARN_ON(spufs_rmdir(inode, dentry));
+
+out_aff_unlock:
+ if (affinity)
+ mutex_unlock(&gang->aff_mutex);
+ return ret;
+}
static int
-spufs_mkgang(struct inode *dir, struct dentry *dentry, int mode)
+spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret;
struct inode *inode;
@@ -404,13 +499,12 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, int mode)
if (!gang)
goto out_iput;
- inode->i_op = &spufs_dir_inode_operations;
+ inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
d_instantiate(dentry, inode);
- dget(dentry);
- dir->i_nlink++;
- dentry->d_inode->i_nlink++;
+ inc_nlink(dir);
+ inc_nlink(dentry->d_inode);
return ret;
out_iput:
@@ -419,118 +513,98 @@ out:
return ret;
}
-static int spufs_gang_open(struct dentry *dentry, struct vfsmount *mnt)
+static int spufs_gang_open(struct path *path)
{
int ret;
struct file *filp;
ret = get_unused_fd();
- if (ret < 0) {
- dput(dentry);
- mntput(mnt);
- goto out;
- }
+ if (ret < 0)
+ return ret;
- filp = dentry_open(dentry, mnt, O_RDONLY);
+ /*
+ * get references for dget and mntget, will be released
+ * in error path of *_open().
+ */
+ filp = dentry_open(path, O_RDONLY, current_cred());
if (IS_ERR(filp)) {
put_unused_fd(ret);
- ret = PTR_ERR(filp);
- goto out;
+ return PTR_ERR(filp);
}
- filp->f_op = &spufs_gang_fops;
+ filp->f_op = &simple_dir_operations;
fd_install(ret, filp);
-out:
return ret;
}
static int spufs_create_gang(struct inode *inode,
struct dentry *dentry,
- struct vfsmount *mnt, int mode)
+ struct vfsmount *mnt, umode_t mode)
{
+ struct path path = {.mnt = mnt, .dentry = dentry};
int ret;
ret = spufs_mkgang(inode, dentry, mode & S_IRWXUGO);
- if (ret)
- goto out;
-
- /*
- * get references for dget and mntget, will be released
- * in error path of *_open().
- */
- ret = spufs_gang_open(dget(dentry), mntget(mnt));
- if (ret < 0)
- WARN_ON(spufs_rmgang(inode, dentry));
-
-out:
- mutex_unlock(&inode->i_mutex);
- dput(dentry);
+ if (!ret) {
+ ret = spufs_gang_open(&path);
+ if (ret < 0) {
+ int err = simple_rmdir(inode, dentry);
+ WARN_ON(err);
+ }
+ }
return ret;
}
static struct file_system_type spufs_type;
-long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
+long spufs_create(struct path *path, struct dentry *dentry,
+ unsigned int flags, umode_t mode, struct file *filp)
{
- struct dentry *dentry;
+ struct inode *dir = path->dentry->d_inode;
int ret;
- ret = -EINVAL;
/* check if we are on spufs */
- if (nd->dentry->d_sb->s_type != &spufs_type)
- goto out;
+ if (path->dentry->d_sb->s_type != &spufs_type)
+ return -EINVAL;
/* don't accept undefined flags */
if (flags & (~SPU_CREATE_FLAG_ALL))
- goto out;
+ return -EINVAL;
/* only threads can be underneath a gang */
- if (nd->dentry != nd->dentry->d_sb->s_root) {
- if ((flags & SPU_CREATE_GANG) ||
- !SPUFS_I(nd->dentry->d_inode)->i_gang)
- goto out;
- }
-
- dentry = lookup_create(nd, 1);
- ret = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- goto out_dir;
+ if (path->dentry != path->dentry->d_sb->s_root)
+ if ((flags & SPU_CREATE_GANG) || !SPUFS_I(dir)->i_gang)
+ return -EINVAL;
- ret = -EEXIST;
- if (dentry->d_inode)
- goto out_dput;
-
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
if (flags & SPU_CREATE_GANG)
- return spufs_create_gang(nd->dentry->d_inode,
- dentry, nd->mnt, mode);
+ ret = spufs_create_gang(dir, dentry, path->mnt, mode);
else
- return spufs_create_context(nd->dentry->d_inode,
- dentry, nd->mnt, flags, mode);
+ ret = spufs_create_context(dir, dentry, path->mnt, flags, mode,
+ filp);
+ if (ret >= 0)
+ fsnotify_mkdir(dir, dentry);
-out_dput:
- dput(dentry);
-out_dir:
- mutex_unlock(&nd->dentry->d_inode->i_mutex);
-out:
return ret;
}
/* File system initialization */
enum {
- Opt_uid, Opt_gid, Opt_err,
+ Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err,
};
-static match_table_t spufs_tokens = {
- { Opt_uid, "uid=%d" },
- { Opt_gid, "gid=%d" },
- { Opt_err, NULL },
+static const match_table_t spufs_tokens = {
+ { Opt_uid, "uid=%d" },
+ { Opt_gid, "gid=%d" },
+ { Opt_mode, "mode=%o" },
+ { Opt_debug, "debug" },
+ { Opt_err, NULL },
};
static int
-spufs_parse_options(char *options, struct inode *root)
+spufs_parse_options(struct super_block *sb, char *options, struct inode *root)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -546,12 +620,24 @@ spufs_parse_options(char *options, struct inode *root)
case Opt_uid:
if (match_int(&args[0], &option))
return 0;
- root->i_uid = option;
+ root->i_uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(root->i_uid))
+ return 0;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return 0;
- root->i_gid = option;
+ root->i_gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(root->i_gid))
+ return 0;
+ break;
+ case Opt_mode:
+ if (match_octal(&args[0], &option))
+ return 0;
+ root->i_mode = option | S_IFDIR;
+ break;
+ case Opt_debug:
+ spufs_get_sb_info(sb)->debug = 1;
break;
default:
return 0;
@@ -560,6 +646,12 @@ spufs_parse_options(char *options, struct inode *root)
return 1;
}
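
Taken together, the extended option table means a mount can now set ownership, the root directory mode, and the debug flag in one go. A hedged sketch of the corresponding mount(2) call from userspace (mount point and ids are arbitrary):

    #include <sys/mount.h>

    static int mount_spufs(void)
    {
            /* "debug" sets spufs_sb_info->debug; uid/gid/mode are examples */
            return mount("none", "/spu", "spufs", 0,
                         "uid=1000,gid=1000,mode=0775,debug");
    }
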
+static void spufs_exit_isolated_loader(void)
+{
+ free_pages((unsigned long) isolated_loader,
+ get_order(isolated_loader_size));
+}
+
static void
spufs_init_isolated_loader(void)
{
@@ -571,15 +663,16 @@ spufs_init_isolated_loader(void)
if (!dn)
return;
- loader = get_property(dn, "loader", &size);
+ loader = of_get_property(dn, "loader", &size);
if (!loader)
return;
- /* kmalloc should align on a 16 byte boundary..* */
- isolated_loader = kmalloc(size, GFP_KERNEL);
+ /* the loader must be aligned on a 16-byte boundary */
+ isolated_loader = (char *)__get_free_pages(GFP_KERNEL, get_order(size));
if (!isolated_loader)
return;
+ isolated_loader_size = size;
memcpy(isolated_loader, loader, size);
printk(KERN_INFO "spufs: SPU isolation mode enabled\n");
}
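
The kmalloc() to __get_free_pages() switch is what actually provides the alignment the comment asks for: a page-order allocation is naturally aligned to at least PAGE_SIZE, so 16-byte alignment follows for free, whereas kmalloc() only guaranteed the architecture's minimum alignment. A sketch of the invariant, assuming 4K base pages:

    unsigned long buf = __get_free_pages(GFP_KERNEL, get_order(size));
    if (buf)
            WARN_ON(buf & (PAGE_SIZE - 1)); /* never fires: page aligned */
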
@@ -590,23 +683,28 @@ spufs_create_root(struct super_block *sb, void *data)
struct inode *inode;
int ret;
+ ret = -ENODEV;
+ if (!spu_management_ops)
+ goto out;
+
ret = -ENOMEM;
inode = spufs_new_inode(sb, S_IFDIR | 0775);
if (!inode)
goto out;
- inode->i_op = &spufs_dir_inode_operations;
+ inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
SPUFS_I(inode)->i_ctx = NULL;
+ inc_nlink(inode);
ret = -EINVAL;
- if (!spufs_parse_options(data, inode))
+ if (!spufs_parse_options(sb, data, inode))
goto out_iput;
ret = -ENOMEM;
- sb->s_root = d_alloc_root(inode);
+ sb->s_root = d_make_root(inode);
if (!sb->s_root)
- goto out_iput;
+ goto out;
return 0;
out_iput:
@@ -618,67 +716,79 @@ out:
static int
spufs_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct super_operations s_ops = {
+ struct spufs_sb_info *info;
+ static const struct super_operations s_ops = {
.alloc_inode = spufs_alloc_inode,
.destroy_inode = spufs_destroy_inode,
.statfs = simple_statfs,
- .delete_inode = spufs_delete_inode,
- .drop_inode = generic_delete_inode,
+ .evict_inode = spufs_evict_inode,
+ .show_options = generic_show_options,
};
+ save_mount_options(sb, data);
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = SPUFS_MAGIC;
sb->s_op = &s_ops;
+ sb->s_fs_info = info;
return spufs_create_root(sb, data);
}
-static int
-spufs_get_sb(struct file_system_type *fstype, int flags,
- const char *name, void *data, struct vfsmount *mnt)
+static struct dentry *
+spufs_mount(struct file_system_type *fstype, int flags,
+ const char *name, void *data)
{
- return get_sb_single(fstype, flags, data, spufs_fill_super, mnt);
+ return mount_single(fstype, flags, data, spufs_fill_super);
}
static struct file_system_type spufs_type = {
.owner = THIS_MODULE,
.name = "spufs",
- .get_sb = spufs_get_sb,
+ .mount = spufs_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("spufs");
static int __init spufs_init(void)
{
int ret;
+ ret = -ENODEV;
+ if (!spu_management_ops)
+ goto out;
+
ret = -ENOMEM;
spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
sizeof(struct spufs_inode_info), 0,
- SLAB_HWCACHE_ALIGN, spufs_init_once, NULL);
+ SLAB_HWCACHE_ALIGN, spufs_init_once);
if (!spufs_inode_cache)
goto out;
- if (spu_sched_init() != 0) {
- kmem_cache_destroy(spufs_inode_cache);
- goto out;
- }
- ret = register_filesystem(&spufs_type);
+ ret = spu_sched_init();
if (ret)
goto out_cache;
ret = register_spu_syscalls(&spufs_calls);
if (ret)
- goto out_fs;
- ret = register_arch_coredump_calls(&spufs_coredump_calls);
+ goto out_sched;
+ ret = register_filesystem(&spufs_type);
if (ret)
- goto out_fs;
+ goto out_syscalls;
spufs_init_isolated_loader();
return 0;
-out_fs:
- unregister_filesystem(&spufs_type);
+
+out_syscalls:
+ unregister_spu_syscalls(&spufs_calls);
+out_sched:
+ spu_sched_exit();
out_cache:
kmem_cache_destroy(spufs_inode_cache);
out:
@@ -689,7 +799,7 @@ module_init(spufs_init);
static void __exit spufs_exit(void)
{
spu_sched_exit();
- unregister_arch_coredump_calls(&spufs_coredump_calls);
+ spufs_exit_isolated_loader();
unregister_spu_syscalls(&spufs_calls);
unregister_filesystem(&spufs_type);
kmem_cache_destroy(spufs_inode_cache);
diff --git a/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c b/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
new file mode 100644
index 00000000000..147069938cf
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
@@ -0,0 +1,183 @@
+/*
+ * SPU local store allocation routines
+ *
+ * Copyright 2007 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/spu.h>
+#include <asm/spu_csa.h>
+#include <asm/mmu.h>
+
+#include "spufs.h"
+
+static int spu_alloc_lscsa_std(struct spu_state *csa)
+{
+ struct spu_lscsa *lscsa;
+ unsigned char *p;
+
+ lscsa = vzalloc(sizeof(struct spu_lscsa));
+ if (!lscsa)
+ return -ENOMEM;
+ csa->lscsa = lscsa;
+
+ /* Set LS pages reserved to allow for user-space mapping. */
+ for (p = lscsa->ls; p < lscsa->ls + LS_SIZE; p += PAGE_SIZE)
+ SetPageReserved(vmalloc_to_page(p));
+
+ return 0;
+}
+
+static void spu_free_lscsa_std(struct spu_state *csa)
+{
+ /* Clear reserved bit before vfree. */
+ unsigned char *p;
+
+ if (csa->lscsa == NULL)
+ return;
+
+ for (p = csa->lscsa->ls; p < csa->lscsa->ls + LS_SIZE; p += PAGE_SIZE)
+ ClearPageReserved(vmalloc_to_page(p));
+
+ vfree(csa->lscsa);
+}
+
+#ifdef CONFIG_SPU_FS_64K_LS
+
+#define SPU_64K_PAGE_SHIFT 16
+#define SPU_64K_PAGE_ORDER (SPU_64K_PAGE_SHIFT - PAGE_SHIFT)
+#define SPU_64K_PAGE_COUNT (1ul << SPU_64K_PAGE_ORDER)
+
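
Spelled out under the usual assumption of 4K base pages (PAGE_SHIFT == 12), the macros above work out as follows:

    /* with PAGE_SHIFT == 12:
     *   SPU_64K_PAGE_ORDER = 16 - 12 = 4
     *   SPU_64K_PAGE_COUNT = 1 << 4  = 16  (4K subpages per 64K page)
     * so the SPU_LSCSA_NUM_BIG_PAGES (5) big pages below map onto
     * 5 * 16 = 80 4K struct pages when building the vmap.
     */
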
+int spu_alloc_lscsa(struct spu_state *csa)
+{
+ struct page **pgarray;
+ unsigned char *p;
+ int i, j, n_4k;
+
+ /* Check availability of 64K pages */
+ if (!spu_64k_pages_available())
+ goto fail;
+
+ csa->use_big_pages = 1;
+
+ pr_debug("spu_alloc_lscsa(csa=0x%p), trying to allocate 64K pages\n",
+ csa);
+
+ /* First try to allocate our 64K pages. We need 5 of them
+ * with the current implementation. In the future, we should try
+ * to separate the lscsa with the actual local store image, thus
+ * allowing us to require only 4 64K pages per context
+ */
+ for (i = 0; i < SPU_LSCSA_NUM_BIG_PAGES; i++) {
+ /* XXX This is likely to fail, we should use a special pool
+ * similar to what hugetlbfs does.
+ */
+ csa->lscsa_pages[i] = alloc_pages(GFP_KERNEL,
+ SPU_64K_PAGE_ORDER);
+ if (csa->lscsa_pages[i] == NULL)
+ goto fail;
+ }
+
+ pr_debug(" success ! creating vmap...\n");
+
+ /* Now we need to create a vmalloc mapping of these for the kernel
+ * and SPU context switch code to use. Currently, we stick to a
+ * normal kernel vmalloc mapping, which in our case will be 4K
+ */
+ n_4k = SPU_64K_PAGE_COUNT * SPU_LSCSA_NUM_BIG_PAGES;
+ pgarray = kmalloc(sizeof(struct page *) * n_4k, GFP_KERNEL);
+ if (pgarray == NULL)
+ goto fail;
+ for (i = 0; i < SPU_LSCSA_NUM_BIG_PAGES; i++)
+ for (j = 0; j < SPU_64K_PAGE_COUNT; j++)
+ /* We assume all the struct pages are contiguous,
+ * which should hopefully be the case for an order-4
+ * allocation.
+ */
+ pgarray[i * SPU_64K_PAGE_COUNT + j] =
+ csa->lscsa_pages[i] + j;
+ csa->lscsa = vmap(pgarray, n_4k, VM_USERMAP, PAGE_KERNEL);
+ kfree(pgarray);
+ if (csa->lscsa == NULL)
+ goto fail;
+
+ memset(csa->lscsa, 0, sizeof(struct spu_lscsa));
+
+ /* Set LS pages reserved to allow for user-space mapping.
+ *
+ * XXX isn't that a bit obsolete? I think we should just
+ * make sure the page count is high enough. Anyway, won't harm
+ * for now
+ */
+ for (p = csa->lscsa->ls; p < csa->lscsa->ls + LS_SIZE; p += PAGE_SIZE)
+ SetPageReserved(vmalloc_to_page(p));
+
+ pr_debug(" all good !\n");
+
+ return 0;
+fail:
+ pr_debug("spufs: failed to allocate lscsa 64K pages, falling back\n");
+ spu_free_lscsa(csa);
+ return spu_alloc_lscsa_std(csa);
+}
+
+void spu_free_lscsa(struct spu_state *csa)
+{
+ unsigned char *p;
+ int i;
+
+ if (!csa->use_big_pages) {
+ spu_free_lscsa_std(csa);
+ return;
+ }
+ csa->use_big_pages = 0;
+
+ if (csa->lscsa == NULL)
+ goto free_pages;
+
+ for (p = csa->lscsa->ls; p < csa->lscsa->ls + LS_SIZE; p += PAGE_SIZE)
+ ClearPageReserved(vmalloc_to_page(p));
+
+ vunmap(csa->lscsa);
+ csa->lscsa = NULL;
+
+ free_pages:
+
+ for (i = 0; i < SPU_LSCSA_NUM_BIG_PAGES; i++)
+ if (csa->lscsa_pages[i])
+ __free_pages(csa->lscsa_pages[i], SPU_64K_PAGE_ORDER);
+}
+
+#else /* CONFIG_SPU_FS_64K_LS */
+
+int spu_alloc_lscsa(struct spu_state *csa)
+{
+ return spu_alloc_lscsa_std(csa);
+}
+
+void spu_free_lscsa(struct spu_state *csa)
+{
+ spu_free_lscsa_std(csa);
+}
+
+#endif /* !defined(CONFIG_SPU_FS_64K_LS) */
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 353a8fa07ab..4ddf769a64e 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -11,46 +11,71 @@
#include "spufs.h"
/* interrupt-level stop callback function. */
-void spufs_stop_callback(struct spu *spu)
+void spufs_stop_callback(struct spu *spu, int irq)
{
struct spu_context *ctx = spu->ctx;
- wake_up_all(&ctx->stop_wq);
-}
-
-void spufs_dma_callback(struct spu *spu, int type)
-{
- struct spu_context *ctx = spu->ctx;
-
- if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
- ctx->event_return |= type;
- wake_up_all(&ctx->stop_wq);
- } else {
- switch (type) {
- case SPE_EVENT_DMA_ALIGNMENT:
- case SPE_EVENT_SPE_DATA_STORAGE:
- case SPE_EVENT_INVALID_DMA:
- force_sig(SIGBUS, /* info, */ current);
+ /*
+ * It should be impossible to preempt a context while an exception
+ * is being processed, since the context switch code is specially
+ * coded to deal with interrupts ... But, just in case, sanity check
+ * the context pointer. It is OK to return doing nothing since
+ * the exception will be regenerated when the context is resumed.
+ */
+ if (ctx) {
+ /* Copy exception arguments into module specific structure */
+ switch (irq) {
+ case 0:
+ ctx->csa.class_0_pending = spu->class_0_pending;
+ ctx->csa.class_0_dar = spu->class_0_dar;
+ break;
+ case 1:
+ ctx->csa.class_1_dsisr = spu->class_1_dsisr;
+ ctx->csa.class_1_dar = spu->class_1_dar;
break;
- case SPE_EVENT_SPE_ERROR:
- force_sig(SIGILL, /* info */ current);
+ case 2:
break;
}
+
+ /* ensure that the exception status has hit memory before a
+ * thread waiting on the context's stop queue is woken */
+ smp_wmb();
+
+ wake_up_all(&ctx->stop_wq);
}
}
-static inline int spu_stopped(struct spu_context *ctx, u32 * stat)
+int spu_stopped(struct spu_context *ctx, u32 *stat)
{
- struct spu *spu;
- u64 pte_fault;
+ u64 dsisr;
+ u32 stopped;
+ stopped = SPU_STATUS_INVALID_INSTR | SPU_STATUS_SINGLE_STEP |
+ SPU_STATUS_STOPPED_BY_HALT | SPU_STATUS_STOPPED_BY_STOP;
+
+top:
*stat = ctx->ops->status_read(ctx);
- if (ctx->state != SPU_STATE_RUNNABLE)
+ if (*stat & stopped) {
+ /*
+ * If the spu hasn't finished stopping, we need to
+ * re-read the register to get the stopped value.
+ */
+ if (*stat & SPU_STATUS_RUNNING)
+ goto top;
+ return 1;
+ }
+
+ if (test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
return 1;
- spu = ctx->spu;
- pte_fault = spu->dsisr &
- (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
- return (!(*stat & 0x1) || pte_fault || spu->class_0_pending) ? 1 : 0;
+
+ dsisr = ctx->csa.class_1_dsisr;
+ if (dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))
+ return 1;
+
+ if (ctx->csa.class_0_pending)
+ return 1;
+
+ return 0;
}
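
Once spu_stopped() reports SPU_STATUS_STOPPED_BY_STOP, the stop-and-signal code lives in the upper halfword of the status word; the run loop below compares it against a few well-known values. A hypothetical helper showing the decode:

    /* sketch only: extract the stop-and-signal code from a status word */
    static inline u32 spu_stop_code(u32 status)
    {
            return status >> SPU_STOP_STATUS_SHIFT; /* e.g. 0x2104, 0x3fff */
    }
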
static int spu_setup_isolated(struct spu_context *ctx)
@@ -63,13 +88,18 @@ static int spu_setup_isolated(struct spu_context *ctx)
const u32 status_loading = SPU_STATUS_RUNNING
| SPU_STATUS_ISOLATED_STATE | SPU_STATUS_ISOLATED_LOAD_STATUS;
+ ret = -ENODEV;
if (!isolated_loader)
- return -ENODEV;
-
- ret = spu_acquire_exclusive(ctx);
- if (ret)
goto out;
+ /*
+ * We need to exclude userspace access to the context.
+ *
+ * To protect against memory access we invalidate all ptes
+ * and make sure the pagefault handlers block on the mutex.
+ */
+ spu_unmap_mappings(ctx);
+
mfc_cntl = &ctx->spu->priv2->mfc_control_RW;
/* purge the MFC DMA queue to ensure no spurious accesses before we
@@ -80,13 +110,16 @@ static int spu_setup_isolated(struct spu_context *ctx)
!= MFC_CNTL_PURGE_DMA_COMPLETE) {
if (time_after(jiffies, timeout)) {
printk(KERN_ERR "%s: timeout flushing MFC DMA queue\n",
- __FUNCTION__);
+ __func__);
ret = -EIO;
- goto out_unlock;
+ goto out;
}
cond_resched();
}
+ /* clear purge status */
+ out_be64(mfc_cntl, 0);
+
/* put the SPE in kernel mode to allow access to the loader */
sr1 = spu_mfc_sr1_get(ctx->spu);
sr1 &= ~MFC_STATE1_PROBLEM_STATE_MASK;
@@ -106,7 +139,7 @@ static int spu_setup_isolated(struct spu_context *ctx)
status_loading) {
if (time_after(jiffies, timeout)) {
printk(KERN_ERR "%s: timeout waiting for loader\n",
- __FUNCTION__);
+ __func__);
ret = -EIO;
goto out_drop_priv;
}
@@ -116,15 +149,18 @@ static int spu_setup_isolated(struct spu_context *ctx)
if (!(status & SPU_STATUS_RUNNING)) {
/* If isolated LOAD has failed: run SPU, we will get a stop-and
* signal later. */
- pr_debug("%s: isolated LOAD failed\n", __FUNCTION__);
+ pr_debug("%s: isolated LOAD failed\n", __func__);
ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
ret = -EACCES;
+ goto out_drop_priv;
+ }
- } else if (!(status & SPU_STATUS_ISOLATED_STATE)) {
+ if (!(status & SPU_STATUS_ISOLATED_STATE)) {
/* This isn't allowed by the CBEA, but check anyway */
- pr_debug("%s: SPU fell out of isolated mode?\n", __FUNCTION__);
+ pr_debug("%s: SPU fell out of isolated mode?\n", __func__);
ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_STOP);
ret = -EINVAL;
+ goto out_drop_priv;
}
out_drop_priv:
@@ -132,55 +168,91 @@ out_drop_priv:
sr1 |= MFC_STATE1_PROBLEM_STATE_MASK;
spu_mfc_sr1_set(ctx->spu, sr1);
-out_unlock:
- spu_release(ctx);
out:
return ret;
}
-static inline int spu_run_init(struct spu_context *ctx, u32 * npc)
+static int spu_run_init(struct spu_context *ctx, u32 *npc)
{
- int ret;
unsigned long runcntl = SPU_RUNCNTL_RUNNABLE;
+ int ret;
- ret = spu_acquire_runnable(ctx, SPU_ACTIVATE_NOWAKE);
- if (ret)
- return ret;
+ spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+ /*
+ * NOSCHED is synchronous scheduling with respect to the caller.
+ * The caller waits for the context to be loaded.
+ */
+ if (ctx->flags & SPU_CREATE_NOSCHED) {
+ if (ctx->state == SPU_STATE_SAVED) {
+ ret = spu_activate(ctx, 0);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /*
+ * Apply special setup as required.
+ */
if (ctx->flags & SPU_CREATE_ISOLATE) {
if (!(ctx->ops->status_read(ctx) & SPU_STATUS_ISOLATED_STATE)) {
- /* Need to release ctx, because spu_setup_isolated will
- * acquire it exclusively.
- */
- spu_release(ctx);
ret = spu_setup_isolated(ctx);
- if (!ret)
- ret = spu_acquire_runnable(ctx, SPU_ACTIVATE_NOWAKE);
+ if (ret)
+ return ret;
}
- /* if userspace has set the runcntrl register (eg, to issue an
- * isolated exit), we need to re-set it here */
+ /*
+ * If userspace has set the runcntrl register (eg, to
+ * issue an isolated exit), we need to re-set it here
+ */
runcntl = ctx->ops->runcntl_read(ctx) &
(SPU_RUNCNTL_RUNNABLE | SPU_RUNCNTL_ISOLATE);
if (runcntl == 0)
runcntl = SPU_RUNCNTL_RUNNABLE;
} else {
- spu_start_tick(ctx);
+ unsigned long privcntl;
+
+ if (test_thread_flag(TIF_SINGLESTEP))
+ privcntl = SPU_PRIVCNTL_MODE_SINGLE_STEP;
+ else
+ privcntl = SPU_PRIVCNTL_MODE_NORMAL;
+
+ ctx->ops->privcntl_write(ctx, privcntl);
ctx->ops->npc_write(ctx, *npc);
}
ctx->ops->runcntl_write(ctx, runcntl);
- return ret;
+
+ if (ctx->flags & SPU_CREATE_NOSCHED) {
+ spuctx_switch_state(ctx, SPU_UTIL_USER);
+ } else {
+
+ if (ctx->state == SPU_STATE_SAVED) {
+ ret = spu_activate(ctx, 0);
+ if (ret)
+ return ret;
+ } else {
+ spuctx_switch_state(ctx, SPU_UTIL_USER);
+ }
+ }
+
+ set_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
+ return 0;
}
-static inline int spu_run_fini(struct spu_context *ctx, u32 * npc,
- u32 * status)
+static int spu_run_fini(struct spu_context *ctx, u32 *npc,
+ u32 *status)
{
int ret = 0;
- spu_stop_tick(ctx);
+ spu_del_from_rq(ctx);
+
*status = ctx->ops->status_read(ctx);
*npc = ctx->ops->npc_read(ctx);
+
+ spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+ clear_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
+ spu_switch_log_notify(NULL, ctx, SWITCH_LOG_EXIT, *status);
spu_release(ctx);
if (signal_pending(current))
@@ -189,22 +261,6 @@ static inline int spu_run_fini(struct spu_context *ctx, u32 * npc,
return ret;
}
-static inline int spu_reacquire_runnable(struct spu_context *ctx, u32 *npc,
- u32 *status)
-{
- int ret;
-
- if ((ret = spu_run_fini(ctx, npc, status)) != 0)
- return ret;
- if (*status & (SPU_STATUS_STOPPED_BY_STOP |
- SPU_STATUS_STOPPED_BY_HALT)) {
- return *status;
- }
- if ((ret = spu_run_init(ctx, npc)) != 0)
- return ret;
- return 0;
-}
-
/*
* SPU syscall restarting is tricky because we violate the basic
* assumption that the signal handler is running on the interrupted
@@ -213,7 +269,7 @@ static inline int spu_reacquire_runnable(struct spu_context *ctx, u32 *npc,
* This means we can only do a very rough approximation of POSIX
* signal semantics.
*/
-int spu_handle_restartsys(struct spu_context *ctx, long *spu_ret,
+static int spu_handle_restartsys(struct spu_context *ctx, long *spu_ret,
unsigned int *npc)
{
int ret;
@@ -243,27 +299,27 @@ int spu_handle_restartsys(struct spu_context *ctx, long *spu_ret,
break;
default:
printk(KERN_WARNING "%s: unexpected return code %ld\n",
- __FUNCTION__, *spu_ret);
+ __func__, *spu_ret);
ret = 0;
}
return ret;
}
-int spu_process_callback(struct spu_context *ctx)
+static int spu_process_callback(struct spu_context *ctx)
{
struct spu_syscall_block s;
u32 ls_pointer, npc;
- char *ls;
+ void __iomem *ls;
long spu_ret;
int ret;
/* get syscall block from local store */
- npc = ctx->ops->npc_read(ctx);
- ls = ctx->ops->get_ls(ctx);
- ls_pointer = *(u32*)(ls + npc);
+ npc = ctx->ops->npc_read(ctx) & ~3;
+ ls = (void __iomem *)ctx->ops->get_ls(ctx);
+ ls_pointer = in_be32(ls + npc);
if (ls_pointer > (LS_SIZE - sizeof(s)))
return -EFAULT;
- memcpy(&s, ls + ls_pointer, sizeof (s));
+ memcpy_fromio(&s, ls + ls_pointer, sizeof(s));
/* do actual syscall without pinning the spu */
ret = 0;
@@ -277,52 +333,69 @@ int spu_process_callback(struct spu_context *ctx)
if (spu_ret <= -ERESTARTSYS) {
ret = spu_handle_restartsys(ctx, &spu_ret, &npc);
}
- spu_acquire(ctx);
+ mutex_lock(&ctx->state_mutex);
if (ret == -ERESTARTSYS)
return ret;
}
+ /* need to re-get the ls, as it may have changed when we released the
+ * spu */
+ ls = (void __iomem *)ctx->ops->get_ls(ctx);
+
/* write result, jump over indirect pointer */
- memcpy(ls + ls_pointer, &spu_ret, sizeof (spu_ret));
+ memcpy_toio(ls + ls_pointer, &spu_ret, sizeof(spu_ret));
ctx->ops->npc_write(ctx, npc);
ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
return ret;
}
-static inline int spu_process_events(struct spu_context *ctx)
-{
- struct spu *spu = ctx->spu;
- u64 pte_fault = MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED;
- int ret = 0;
-
- if (spu->dsisr & pte_fault)
- ret = spu_irq_class_1_bottom(spu);
- if (spu->class_0_pending)
- ret = spu_irq_class_0_bottom(spu);
- if (!ret && signal_pending(current))
- ret = -ERESTARTSYS;
- return ret;
-}
-
-long spufs_run_spu(struct file *file, struct spu_context *ctx,
- u32 *npc, u32 *event)
+long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
{
int ret;
+ struct spu *spu;
u32 status;
- if (down_interruptible(&ctx->run_sema))
+ if (mutex_lock_interruptible(&ctx->run_mutex))
return -ERESTARTSYS;
- ctx->ops->master_start(ctx);
ctx->event_return = 0;
- ret = spu_run_init(ctx, npc);
+
+ ret = spu_acquire(ctx);
if (ret)
+ goto out_unlock;
+
+ spu_enable_spu(ctx);
+
+ spu_update_sched_info(ctx);
+
+ ret = spu_run_init(ctx, npc);
+ if (ret) {
+ spu_release(ctx);
goto out;
+ }
do {
ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, &status));
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ /*
+ * This is nasty: we need the state_mutex for all the
+ * bookkeeping even if the syscall was interrupted by
+ * a signal. ewww.
+ */
+ mutex_lock(&ctx->state_mutex);
break;
+ }
+ spu = ctx->spu;
+ if (unlikely(test_and_clear_bit(SPU_SCHED_NOTIFY_ACTIVE,
+ &ctx->sched_flags))) {
+ if (!(status & SPU_STATUS_STOPPED_BY_STOP)) {
+ spu_switch_notify(spu, ctx);
+ continue;
+ }
+ }
+
+ spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
if ((status & SPU_STATUS_STOPPED_BY_STOP) &&
(status >> SPU_STOP_STATUS_SHIFT == 0x2104)) {
ret = spu_process_callback(ctx);
@@ -330,39 +403,52 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
break;
status &= ~SPU_STATUS_STOPPED_BY_STOP;
}
- if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
- ret = spu_reacquire_runnable(ctx, npc, &status);
- if (ret) {
- spu_stop_tick(ctx);
- goto out2;
- }
- continue;
- }
- ret = spu_process_events(ctx);
+ ret = spufs_handle_class1(ctx);
+ if (ret)
+ break;
+ ret = spufs_handle_class0(ctx);
+ if (ret)
+ break;
+
+ if (signal_pending(current))
+ ret = -ERESTARTSYS;
} while (!ret && !(status & (SPU_STATUS_STOPPED_BY_STOP |
- SPU_STATUS_STOPPED_BY_HALT)));
+ SPU_STATUS_STOPPED_BY_HALT |
+ SPU_STATUS_SINGLE_STEP)));
- ctx->ops->master_stop(ctx);
+ spu_disable_spu(ctx);
ret = spu_run_fini(ctx, npc, &status);
spu_yield(ctx);
-out2:
+ if ((status & SPU_STATUS_STOPPED_BY_STOP) &&
+ (((status >> SPU_STOP_STATUS_SHIFT) & 0x3f00) == 0x2100))
+ ctx->stats.libassist++;
+
if ((ret == 0) ||
((ret == -ERESTARTSYS) &&
((status & SPU_STATUS_STOPPED_BY_HALT) ||
+ (status & SPU_STATUS_SINGLE_STEP) ||
((status & SPU_STATUS_STOPPED_BY_STOP) &&
(status >> SPU_STOP_STATUS_SHIFT != 0x2104)))))
ret = status;
- if ((status & SPU_STATUS_STOPPED_BY_STOP)
- && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff) {
+ /* Note: we don't need to force_sig SIGTRAP on single-step
+ * since we have TIF_SINGLESTEP set, thus the kernel will do
+ * it upon return from the syscall anyway
+ */
+ if (unlikely(status & SPU_STATUS_SINGLE_STEP))
+ ret = -ERESTARTSYS;
+
+ else if (unlikely((status & SPU_STATUS_STOPPED_BY_STOP)
+ && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff)) {
force_sig(SIGTRAP, current);
ret = -ERESTARTSYS;
}
out:
*event = ctx->event_return;
- up(&ctx->run_sema);
+out_unlock:
+ mutex_unlock(&ctx->run_mutex);
return ret;
}
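
The magic constants in the loop above are stop-and-signal codes taken from the upper halfword of the status word: 0x2104 marks a PPE-assisted (callback) syscall, anything in the 0x21xx group is counted as a library assist, and 0x3fff acts as a debugger breakpoint. From userspace the whole loop is driven by spu_run(2); a hedged sketch, again via raw syscall since no glibc wrapper exists:

    #include <unistd.h>
    #include <sys/syscall.h>

    /* run the SPU context behind fd until it stops; npc carries the SPU
     * program counter in and out, event returns SPE_EVENT_* bits */
    static long run_ctx(int fd, unsigned int *npc)
    {
            unsigned int event = 0;

            return syscall(SYS_spu_run, fd, npc, &event);
    }
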
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 2f25e68b4ba..4a0a64fe25d 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -22,20 +22,24 @@
#undef DEBUG
-#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/numa.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -43,114 +47,171 @@
#include <asm/spu_csa.h>
#include <asm/spu_priv1.h>
#include "spufs.h"
-
-#define SPU_TIMESLICE (HZ)
+#define CREATE_TRACE_POINTS
+#include "sputrace.h"
struct spu_prio_array {
DECLARE_BITMAP(bitmap, MAX_PRIO);
struct list_head runq[MAX_PRIO];
spinlock_t runq_lock;
- struct list_head active_list[MAX_NUMNODES];
- struct mutex active_mutex[MAX_NUMNODES];
+ int nr_waiting;
};
+static unsigned long spu_avenrun[3];
static struct spu_prio_array *spu_prio;
-static struct workqueue_struct *spu_sched_wq;
+static struct task_struct *spusched_task;
+static struct timer_list spusched_timer;
+static struct timer_list spuloadavg_timer;
-static inline int node_allowed(int node)
-{
- cpumask_t mask;
+/*
+ * Priority of a normal, non-rt, non-niced process (aka nice level 0).
+ */
+#define NORMAL_PRIO 120
- if (!nr_cpus_node(node))
- return 0;
- mask = node_to_cpumask(node);
- if (!cpus_intersects(mask, current->cpus_allowed))
- return 0;
- return 1;
-}
+/*
+ * Frequency of the spu scheduler tick. By default we do one SPU scheduler
+ * tick for every 10 CPU scheduler ticks.
+ */
+#define SPUSCHED_TICK (10)
-void spu_start_tick(struct spu_context *ctx)
-{
- if (ctx->policy == SCHED_RR)
- queue_delayed_work(spu_sched_wq, &ctx->sched_work, SPU_TIMESLICE);
-}
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
+ * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ */
+#define MIN_SPU_TIMESLICE max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
+#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK))
+
+#define SCALE_PRIO(x, prio) \
+ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
-void spu_stop_tick(struct spu_context *ctx)
+/*
+ * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
+ * [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ */
+void spu_set_timeslice(struct spu_context *ctx)
{
- if (ctx->policy == SCHED_RR)
- cancel_delayed_work(&ctx->sched_work);
+ if (ctx->prio < NORMAL_PRIO)
+ ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
+ else
+ ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
}
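
A worked example of the scaling, assuming HZ == 1000 (one scheduler tick per SPUSCHED_TICK == 10 jiffies, i.e. 10 ms), MAX_PRIO == 140 and MAX_USER_PRIO == 40:

    /* DEF_SPU_TIMESLICE = 100 * 1000 / (1000 * 10) = 10 ticks = 100 ms
     *
     *   nice -20 (prio 100): max(40 * (140 - 100) / 20, 1) = 80 ticks = 800 ms
     *   nice   0 (prio 120): max(10 * (140 - 120) / 20, 1) = 10 ticks = 100 ms
     *   nice  19 (prio 139): max(10 * (140 - 139) / 20, 1) =  1 tick  =  10 ms
     */
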
-void spu_sched_tick(struct work_struct *work)
+/*
+ * Update scheduling information from the owning thread.
+ */
+void __spu_update_sched_info(struct spu_context *ctx)
{
- struct spu_context *ctx =
- container_of(work, struct spu_context, sched_work.work);
- struct spu *spu;
- int rearm = 1;
+ /*
+ * assert that the context is not on the runqueue, so it is safe
+ * to change its scheduling parameters.
+ */
+ BUG_ON(!list_empty(&ctx->rq));
- mutex_lock(&ctx->state_mutex);
- spu = ctx->spu;
- if (spu) {
- int best = sched_find_first_bit(spu_prio->bitmap);
- if (best <= ctx->prio) {
- spu_deactivate(ctx);
- rearm = 0;
- }
- }
- mutex_unlock(&ctx->state_mutex);
+ /*
+ * 32-Bit assignments are atomic on powerpc, and we don't care about
+ * memory ordering here because retrieving the controlling thread is
+ * by definition racy.
+ */
+ ctx->tid = current->pid;
- if (rearm)
- spu_start_tick(ctx);
-}
+ /*
+ * We do our own priority calculations, so we normally want
+ * ->static_prio to start with. Unfortunately this field
+ * contains junk for threads with a realtime scheduling
+ * policy so we have to look at ->prio in this case.
+ */
+ if (rt_prio(current->prio))
+ ctx->prio = current->prio;
+ else
+ ctx->prio = current->static_prio;
+ ctx->policy = current->policy;
-/**
- * spu_add_to_active_list - add spu to active list
- * @spu: spu to add to the active list
- */
-static void spu_add_to_active_list(struct spu *spu)
-{
- mutex_lock(&spu_prio->active_mutex[spu->node]);
- list_add_tail(&spu->list, &spu_prio->active_list[spu->node]);
- mutex_unlock(&spu_prio->active_mutex[spu->node]);
+ /*
+ * TODO: the context may be loaded, so we may need to activate
+ * it again on a different node. But it shouldn't hurt anything
+ * to update its parameters, because we know that the scheduler
+ * is not actively looking at this field, since it is not on the
+ * runqueue. The context will be rescheduled on the proper node
+ * if it is timesliced or preempted.
+ */
+ cpumask_copy(&ctx->cpus_allowed, tsk_cpus_allowed(current));
+
+ /* Save the current cpu id for spu interrupt routing. */
+ ctx->last_ran = raw_smp_processor_id();
}
-/**
- * spu_remove_from_active_list - remove spu from active list
- * @spu: spu to remove from the active list
- */
-static void spu_remove_from_active_list(struct spu *spu)
+void spu_update_sched_info(struct spu_context *ctx)
{
- int node = spu->node;
+ int node;
- mutex_lock(&spu_prio->active_mutex[node]);
- list_del_init(&spu->list);
- mutex_unlock(&spu_prio->active_mutex[node]);
+ if (ctx->state == SPU_STATE_RUNNABLE) {
+ node = ctx->spu->node;
+
+ /*
+ * Take list_mutex to sync with find_victim().
+ */
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ __spu_update_sched_info(ctx);
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ } else {
+ __spu_update_sched_info(ctx);
+ }
}
-static inline void mm_needs_global_tlbie(struct mm_struct *mm)
+static int __node_allowed(struct spu_context *ctx, int node)
{
- int nr = (NR_CPUS > 1) ? NR_CPUS : NR_CPUS + 1;
-
- /* Global TLBIE broadcast required with SPEs. */
- __cpus_setall(&mm->cpu_vm_mask, nr);
-}
+ if (nr_cpus_node(node)) {
+ const struct cpumask *mask = cpumask_of_node(node);
-static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
+ if (cpumask_intersects(mask, &ctx->cpus_allowed))
+ return 1;
+ }
-static void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
-{
- blocking_notifier_call_chain(&spu_switch_notifier,
- ctx ? ctx->object_id : 0, spu);
+ return 0;
}
-int spu_switch_event_register(struct notifier_block * n)
+static int node_allowed(struct spu_context *ctx, int node)
{
- return blocking_notifier_chain_register(&spu_switch_notifier, n);
+ int rval;
+
+ spin_lock(&spu_prio->runq_lock);
+ rval = __node_allowed(ctx, node);
+ spin_unlock(&spu_prio->runq_lock);
+
+ return rval;
}
-int spu_switch_event_unregister(struct notifier_block * n)
+void do_notify_spus_active(void)
{
- return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
+ int node;
+
+ /*
+ * Wake up the active spu_contexts.
+ *
+ * When the awakened processes see their "notify_active" flag is set,
+ * they will call spu_switch_notify().
+ */
+ for_each_online_node(node) {
+ struct spu *spu;
+
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+ if (spu->alloc_state != SPU_FREE) {
+ struct spu_context *ctx = spu->ctx;
+ set_bit(SPU_SCHED_NOTIFY_ACTIVE,
+ &ctx->sched_flags);
+ mb();
+ wake_up_all(&ctx->stop_wq);
+ }
+ }
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ }
}
/**
@@ -160,28 +221,211 @@ int spu_switch_event_unregister(struct notifier_block * n)
*/
static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
{
- pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
- spu->number, spu->node);
+ spu_context_trace(spu_bind_context__enter, ctx, spu);
+
+ spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
+ if (ctx->flags & SPU_CREATE_NOSCHED)
+ atomic_inc(&cbe_spu_info[spu->node].reserved_spus);
+
+ ctx->stats.slb_flt_base = spu->stats.slb_flt;
+ ctx->stats.class2_intr_base = spu->stats.class2_intr;
+
+ spu_associate_mm(spu, ctx->owner);
+
+ spin_lock_irq(&spu->register_lock);
spu->ctx = ctx;
spu->flags = 0;
ctx->spu = spu;
ctx->ops = &spu_hw_ops;
spu->pid = current->pid;
- spu->mm = ctx->owner;
- mm_needs_global_tlbie(spu->mm);
+ spu->tgid = current->tgid;
spu->ibox_callback = spufs_ibox_callback;
spu->wbox_callback = spufs_wbox_callback;
spu->stop_callback = spufs_stop_callback;
spu->mfc_callback = spufs_mfc_callback;
- spu->dma_callback = spufs_dma_callback;
- mb();
+ spin_unlock_irq(&spu->register_lock);
+
spu_unmap_mappings(ctx);
+
+ spu_switch_log_notify(spu, ctx, SWITCH_LOG_START, 0);
spu_restore(&ctx->csa, spu);
spu->timestamp = jiffies;
- spu_cpu_affinity_set(spu, raw_smp_processor_id());
spu_switch_notify(spu, ctx);
- spu_add_to_active_list(spu);
ctx->state = SPU_STATE_RUNNABLE;
+
+ spuctx_switch_state(ctx, SPU_UTIL_USER);
+}
+
+/*
+ * Must be used with the list_mutex held.
+ */
+static inline int sched_spu(struct spu *spu)
+{
+ BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));
+
+ return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
+}
+
+static void aff_merge_remaining_ctxs(struct spu_gang *gang)
+{
+ struct spu_context *ctx;
+
+ list_for_each_entry(ctx, &gang->aff_list_head, aff_list) {
+ if (list_empty(&ctx->aff_list))
+ list_add(&ctx->aff_list, &gang->aff_list_head);
+ }
+ gang->aff_flags |= AFF_MERGED;
+}
+
+static void aff_set_offsets(struct spu_gang *gang)
+{
+ struct spu_context *ctx;
+ int offset;
+
+ offset = -1;
+ list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
+ aff_list) {
+ if (&ctx->aff_list == &gang->aff_list_head)
+ break;
+ ctx->aff_offset = offset--;
+ }
+
+ offset = 0;
+ list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) {
+ if (&ctx->aff_list == &gang->aff_list_head)
+ break;
+ ctx->aff_offset = offset++;
+ }
+
+ gang->aff_flags |= AFF_OFFSETS_SET;
+}
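
Concretely: for a gang whose affinity chain is [A, B, REF, C, D] with REF the reference context, the reverse walk assigns B = -1 and A = -2, and the forward walk (which starts at REF itself) assigns REF = 0, C = 1, D = 2. ctx_location() further down walks the physical SPU aff_list by the same signed offset:

    /* illustration of the result, reference context = REF:
     *
     *   aff_list order:   A    B    REF   C    D
     *   ctx->aff_offset: -2   -1    0     1    2
     */
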
+
+static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
+ int group_size, int lowest_offset)
+{
+ struct spu *spu;
+ int node, n;
+
+ /*
+ * TODO: A better algorithm could be used to find a good spu to be
+ * used as reference location for the ctxs chain.
+ */
+ node = cpu_to_node(raw_smp_processor_id());
+ for (n = 0; n < MAX_NUMNODES; n++, node++) {
+ /*
+ * "available_spus" counts how many spus are not potentially
+ * going to be used by other affinity gangs whose reference
+ * context is already in place. Although this code seeks to
+ * avoid having affinity gangs with a summed amount of
+ * contexts bigger than the amount of spus in the node,
+ * this may happen sporadically. In this case, available_spus
+ * becomes negative, which is harmless.
+ */
+ int available_spus;
+
+ node = (node < MAX_NUMNODES) ? node : 0;
+ if (!node_allowed(ctx, node))
+ continue;
+
+ available_spus = 0;
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+ if (spu->ctx && spu->ctx->gang && !spu->ctx->aff_offset
+ && spu->ctx->gang->aff_ref_spu)
+ available_spus -= spu->ctx->gang->contexts;
+ available_spus++;
+ }
+ if (available_spus < ctx->gang->contexts) {
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ continue;
+ }
+
+ list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+ if ((!mem_aff || spu->has_mem_affinity) &&
+ sched_spu(spu)) {
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ return spu;
+ }
+ }
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ }
+ return NULL;
+}
+
+static void aff_set_ref_point_location(struct spu_gang *gang)
+{
+ int mem_aff, gs, lowest_offset;
+ struct spu_context *ctx;
+ struct spu *tmp;
+
+ mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
+ lowest_offset = 0;
+ gs = 0;
+
+ list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+ gs++;
+
+ list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
+ aff_list) {
+ if (&ctx->aff_list == &gang->aff_list_head)
+ break;
+ lowest_offset = ctx->aff_offset;
+ }
+
+ gang->aff_ref_spu = aff_ref_location(gang->aff_ref_ctx, mem_aff, gs,
+ lowest_offset);
+}
+
+static struct spu *ctx_location(struct spu *ref, int offset, int node)
+{
+ struct spu *spu;
+
+ spu = NULL;
+ if (offset >= 0) {
+ list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
+ BUG_ON(spu->node != node);
+ if (offset == 0)
+ break;
+ if (sched_spu(spu))
+ offset--;
+ }
+ } else {
+ list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
+ BUG_ON(spu->node != node);
+ if (offset == 0)
+ break;
+ if (sched_spu(spu))
+ offset++;
+ }
+ }
+
+ return spu;
+}
+
+/*
+ * has_affinity is called each time a context is going to be scheduled.
+ * It returns non-zero if the context must honour SPU affinity, setting
+ * up the gang's reference SPU as a side effect.
+ */
+static int has_affinity(struct spu_context *ctx)
+{
+ struct spu_gang *gang = ctx->gang;
+
+ if (list_empty(&ctx->aff_list))
+ return 0;
+
+ if (atomic_read(&ctx->gang->aff_sched_count) == 0)
+ ctx->gang->aff_ref_spu = NULL;
+
+ if (!gang->aff_ref_spu) {
+ if (!(gang->aff_flags & AFF_MERGED))
+ aff_merge_remaining_ctxs(gang);
+ if (!(gang->aff_flags & AFF_OFFSETS_SET))
+ aff_set_offsets(gang);
+ aff_set_ref_point_location(gang);
+ }
+
+ return gang->aff_ref_spu != NULL;
}
/**
@@ -191,127 +435,188 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
*/
static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
{
- pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
- spu->pid, spu->number, spu->node);
+ u32 status;
+
+ spu_context_trace(spu_unbind_context__enter, ctx, spu);
+
+ spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
+ if (spu->ctx->flags & SPU_CREATE_NOSCHED)
+ atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
+
+ if (ctx->gang)
+ /*
+ * If ctx->gang->aff_sched_count is positive, SPU affinity is
+ * being considered in this gang. Using atomic_dec_if_positive
+ * allows us to skip an explicit affinity check for this gang.
+ */
+ atomic_dec_if_positive(&ctx->gang->aff_sched_count);
- spu_remove_from_active_list(spu);
spu_switch_notify(spu, NULL);
spu_unmap_mappings(ctx);
spu_save(&ctx->csa, spu);
+ spu_switch_log_notify(spu, ctx, SWITCH_LOG_STOP, 0);
+
+ spin_lock_irq(&spu->register_lock);
spu->timestamp = jiffies;
ctx->state = SPU_STATE_SAVED;
spu->ibox_callback = NULL;
spu->wbox_callback = NULL;
spu->stop_callback = NULL;
spu->mfc_callback = NULL;
- spu->dma_callback = NULL;
- spu->mm = NULL;
spu->pid = 0;
+ spu->tgid = 0;
ctx->ops = &spu_backing_ops;
- ctx->spu = NULL;
spu->flags = 0;
spu->ctx = NULL;
+ spin_unlock_irq(&spu->register_lock);
+
+ spu_associate_mm(spu, NULL);
+
+ ctx->stats.slb_flt +=
+ (spu->stats.slb_flt - ctx->stats.slb_flt_base);
+ ctx->stats.class2_intr +=
+ (spu->stats.class2_intr - ctx->stats.class2_intr_base);
+
+ /* This maps the underlying spu state to idle */
+ spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+ ctx->spu = NULL;
+
+ if (spu_stopped(ctx, &status))
+ wake_up_all(&ctx->stop_wq);
}
/**
* spu_add_to_rq - add a context to the runqueue
* @ctx: context to add
*/
-static void spu_add_to_rq(struct spu_context *ctx)
+static void __spu_add_to_rq(struct spu_context *ctx)
{
- spin_lock(&spu_prio->runq_lock);
- list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
- set_bit(ctx->prio, spu_prio->bitmap);
- spin_unlock(&spu_prio->runq_lock);
+ /*
+ * Unfortunately this code path can be called from multiple threads
+ * on behalf of a single context due to the way the problem state
+ * mmap support works.
+ *
+ * Fortunately we need to wake up all these threads at the same time
+ * and can simply skip the runqueue addition for all but the first
+ * thread getting into this codepath.
+ *
+ * It's still quite hacky, and long-term we should proxy all other
+ * threads through the owner thread so that spu_run is in control
+ * of all the scheduling activity for a given context.
+ */
+ if (list_empty(&ctx->rq)) {
+ list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
+ set_bit(ctx->prio, spu_prio->bitmap);
+ if (!spu_prio->nr_waiting++)
+ mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
+ }
}
-/**
- * spu_del_from_rq - remove a context from the runqueue
- * @ctx: context to remove
- */
-static void spu_del_from_rq(struct spu_context *ctx)
+static void spu_add_to_rq(struct spu_context *ctx)
{
spin_lock(&spu_prio->runq_lock);
- list_del_init(&ctx->rq);
- if (list_empty(&spu_prio->runq[ctx->prio]))
- clear_bit(ctx->prio, spu_prio->bitmap);
+ __spu_add_to_rq(ctx);
spin_unlock(&spu_prio->runq_lock);
}
-/**
- * spu_grab_context - remove one context from the runqueue
- * @prio: priority of the context to be removed
- *
- * This function removes one context from the runqueue for priority @prio.
- * If there is more than one context with the given priority the first
- * task on the runqueue will be taken.
- *
- * Returns the spu_context it just removed.
- *
- * Must be called with spu_prio->runq_lock held.
- */
-static struct spu_context *spu_grab_context(int prio)
+static void __spu_del_from_rq(struct spu_context *ctx)
{
- struct list_head *rq = &spu_prio->runq[prio];
+ int prio = ctx->prio;
+
+ if (!list_empty(&ctx->rq)) {
+ if (!--spu_prio->nr_waiting)
+ del_timer(&spusched_timer);
+ list_del_init(&ctx->rq);
- if (list_empty(rq))
- return NULL;
- return list_entry(rq->next, struct spu_context, rq);
+ if (list_empty(&spu_prio->runq[prio]))
+ clear_bit(prio, spu_prio->bitmap);
+ }
+}
+
+void spu_del_from_rq(struct spu_context *ctx)
+{
+ spin_lock(&spu_prio->runq_lock);
+ __spu_del_from_rq(ctx);
+ spin_unlock(&spu_prio->runq_lock);
}
static void spu_prio_wait(struct spu_context *ctx)
{
DEFINE_WAIT(wait);
- set_bit(SPU_SCHED_WAKE, &ctx->sched_flags);
+ /*
+ * The caller must explicitly wait for a context to be loaded
+ * if the nosched flag is set. If NOSCHED is not set, the caller
+ * queues the context and waits for an spu event or error.
+ */
+ BUG_ON(!(ctx->flags & SPU_CREATE_NOSCHED));
+
+ spin_lock(&spu_prio->runq_lock);
prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
if (!signal_pending(current)) {
+ __spu_add_to_rq(ctx);
+ spin_unlock(&spu_prio->runq_lock);
mutex_unlock(&ctx->state_mutex);
schedule();
mutex_lock(&ctx->state_mutex);
+ spin_lock(&spu_prio->runq_lock);
+ __spu_del_from_rq(ctx);
}
+ spin_unlock(&spu_prio->runq_lock);
__set_current_state(TASK_RUNNING);
remove_wait_queue(&ctx->stop_wq, &wait);
- clear_bit(SPU_SCHED_WAKE, &ctx->sched_flags);
}
-/**
- * spu_reschedule - try to find a runnable context for a spu
- * @spu: spu available
- *
- * This function is called whenever a spu becomes idle. It looks for the
- * most suitable runnable spu context and schedules it for execution.
- */
-static void spu_reschedule(struct spu *spu)
+static struct spu *spu_get_idle(struct spu_context *ctx)
{
- int best;
+ struct spu *spu, *aff_ref_spu;
+ int node, n;
- spu_free(spu);
+ spu_context_nospu_trace(spu_get_idle__enter, ctx);
- spin_lock(&spu_prio->runq_lock);
- best = sched_find_first_bit(spu_prio->bitmap);
- if (best < MAX_PRIO) {
- struct spu_context *ctx = spu_grab_context(best);
- if (ctx && test_bit(SPU_SCHED_WAKE, &ctx->sched_flags))
- wake_up(&ctx->stop_wq);
- }
- spin_unlock(&spu_prio->runq_lock);
-}
+ if (ctx->gang) {
+ mutex_lock(&ctx->gang->aff_mutex);
+ if (has_affinity(ctx)) {
+ aff_ref_spu = ctx->gang->aff_ref_spu;
+ atomic_inc(&ctx->gang->aff_sched_count);
+ mutex_unlock(&ctx->gang->aff_mutex);
+ node = aff_ref_spu->node;
-static struct spu *spu_get_idle(struct spu_context *ctx)
-{
- struct spu *spu = NULL;
- int node = cpu_to_node(raw_smp_processor_id());
- int n;
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ spu = ctx_location(aff_ref_spu, ctx->aff_offset, node);
+ if (spu && spu->alloc_state == SPU_FREE)
+ goto found;
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ atomic_dec(&ctx->gang->aff_sched_count);
+ goto not_found;
+ }
+ mutex_unlock(&ctx->gang->aff_mutex);
+ }
+ node = cpu_to_node(raw_smp_processor_id());
for (n = 0; n < MAX_NUMNODES; n++, node++) {
node = (node < MAX_NUMNODES) ? node : 0;
- if (!node_allowed(node))
+ if (!node_allowed(ctx, node))
continue;
- spu = spu_alloc_node(node);
- if (spu)
- break;
+
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+ if (spu->alloc_state == SPU_FREE)
+ goto found;
+ }
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
}
+
+ not_found:
+ spu_context_nospu_trace(spu_get_idle__not_found, ctx);
+ return NULL;
+
+ found:
+ spu->alloc_state = SPU_USED;
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+ spu_context_trace(spu_get_idle__found, ctx, spu);
+ spu_init_channels(spu);
return spu;
}
@@ -327,10 +632,12 @@ static struct spu *find_victim(struct spu_context *ctx)
struct spu *spu;
int node, n;
+ spu_context_nospu_trace(spu_find_victim__enter, ctx);
+
/*
* Look for a possible preemption candidate on the local node first.
* If there is no candidate look at the other nodes. This isn't
- * exactly fair, but so far the whole spu schedule tries to keep
+ * exactly fair, but so far the whole spu scheduler tries to keep
* a strong node affinity. We might want to fine-tune this in
* the future.
*/
@@ -338,18 +645,22 @@ static struct spu *find_victim(struct spu_context *ctx)
node = cpu_to_node(raw_smp_processor_id());
for (n = 0; n < MAX_NUMNODES; n++, node++) {
node = (node < MAX_NUMNODES) ? node : 0;
- if (!node_allowed(node))
+ if (!node_allowed(ctx, node))
continue;
- mutex_lock(&spu_prio->active_mutex[node]);
- list_for_each_entry(spu, &spu_prio->active_list[node], list) {
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
struct spu_context *tmp = spu->ctx;
- if (tmp->rt_priority < ctx->rt_priority &&
- (!victim || tmp->rt_priority < victim->rt_priority))
+ if (tmp && tmp->prio > ctx->prio &&
+ !(tmp->flags & SPU_CREATE_NOSCHED) &&
+ (!victim || tmp->prio > victim->prio)) {
victim = spu->ctx;
+ }
}
- mutex_unlock(&spu_prio->active_mutex[node]);
+ if (victim)
+ get_spu_context(victim);
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
if (victim) {
/*
@@ -357,25 +668,45 @@ static struct spu *find_victim(struct spu_context *ctx)
* higher priority contexts before lower priority
* ones, so this is safe until we introduce
* priority inheritance schemes.
+ *
+ * XXX if the highest priority context is locked,
+ * this can loop a long time. Might be better to
+ * look at another context or give up after X retries.
*/
if (!mutex_trylock(&victim->state_mutex)) {
+ put_spu_context(victim);
victim = NULL;
goto restart;
}
spu = victim->spu;
- if (!spu) {
+ if (!spu || victim->prio <= ctx->prio) {
/*
* This race can happen because we've dropped
- * the active list mutex. No a problem, just
+ * the active list mutex. Not a problem, just
* restart the search.
*/
mutex_unlock(&victim->state_mutex);
+ put_spu_context(victim);
victim = NULL;
goto restart;
}
+
+ spu_context_trace(__spu_deactivate__unload, ctx, spu);
+
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ cbe_spu_info[node].nr_active--;
spu_unbind_context(spu, victim);
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+
+ victim->stats.invol_ctx_switch++;
+ spu->stats.invol_ctx_switch++;
+ if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
+ spu_add_to_rq(victim);
+
mutex_unlock(&victim->state_mutex);
+ put_spu_context(victim);
+
return spu;
}
}
@@ -383,43 +714,175 @@ static struct spu *find_victim(struct spu_context *ctx)
return NULL;
}
+static void __spu_schedule(struct spu *spu, struct spu_context *ctx)
+{
+ int node = spu->node;
+ int success = 0;
+
+ spu_set_timeslice(ctx);
+
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ if (spu->ctx == NULL) {
+ spu_bind_context(spu, ctx);
+ cbe_spu_info[node].nr_active++;
+ spu->alloc_state = SPU_USED;
+ success = 1;
+ }
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+
+ if (success)
+ wake_up_all(&ctx->run_wq);
+ else
+ spu_add_to_rq(ctx);
+}
+
+static void spu_schedule(struct spu *spu, struct spu_context *ctx)
+{
+ /* not a candidate for interruptible because it's called either
+ * from the scheduler thread or from spu_deactivate */
+ mutex_lock(&ctx->state_mutex);
+ if (ctx->state == SPU_STATE_SAVED)
+ __spu_schedule(spu, ctx);
+ spu_release(ctx);
+}
+
+/**
+ * spu_unschedule - remove a context from a spu, and possibly release it.
+ * @spu: The SPU to unschedule from
+ * @ctx: The context currently scheduled on the SPU
+ * @free_spu: Whether to free the SPU for other contexts
+ *
+ * Unbinds the context @ctx from the SPU @spu. If @free_spu is non-zero, the
+ * SPU is made available for other contexts (ie, may be returned by
+ * spu_get_idle). If this is zero, the caller is expected to schedule another
+ * context to this spu.
+ *
+ * Should be called with ctx->state_mutex held.
+ */
+static void spu_unschedule(struct spu *spu, struct spu_context *ctx,
+ int free_spu)
+{
+ int node = spu->node;
+
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ cbe_spu_info[node].nr_active--;
+ if (free_spu)
+ spu->alloc_state = SPU_FREE;
+ spu_unbind_context(spu, ctx);
+ ctx->stats.invol_ctx_switch++;
+ spu->stats.invol_ctx_switch++;
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
+}
+
/**
* spu_activate - find a free spu for a context and execute it
* @ctx: spu context to schedule
* @flags: flags (currently ignored)
*
- * Tries to find a free spu to run @ctx. If no free spu is availble
+ * Tries to find a free spu to run @ctx. If no free spu is available
* add the context to the runqueue so it gets woken up once an spu
* is available.
*/
int spu_activate(struct spu_context *ctx, unsigned long flags)
{
+ struct spu *spu;
+ /*
+ * If there are multiple threads waiting for a single context
+ * only one actually binds the context while the others will
+ * only be able to acquire the state_mutex once the context
+ * already is in runnable state.
+ */
if (ctx->spu)
return 0;
- do {
- struct spu *spu;
+spu_activate_top:
+ if (signal_pending(current))
+ return -ERESTARTSYS;
- spu = spu_get_idle(ctx);
- /*
- * If this is a realtime thread we try to get it running by
- * preempting a lower priority thread.
- */
- if (!spu && ctx->rt_priority)
- spu = find_victim(ctx);
- if (spu) {
- spu_bind_context(spu, ctx);
- return 0;
+ spu = spu_get_idle(ctx);
+ /*
+ * If this is a realtime thread we try to get it running by
+ * preempting a lower priority thread.
+ */
+ if (!spu && rt_prio(ctx->prio))
+ spu = find_victim(ctx);
+ if (spu) {
+ unsigned long runcntl;
+
+ runcntl = ctx->ops->runcntl_read(ctx);
+ __spu_schedule(spu, ctx);
+ if (runcntl & SPU_RUNCNTL_RUNNABLE)
+ spuctx_switch_state(ctx, SPU_UTIL_USER);
+
+ return 0;
+ }
+
+ if (ctx->flags & SPU_CREATE_NOSCHED) {
+ spu_prio_wait(ctx);
+ goto spu_activate_top;
+ }
+
+ spu_add_to_rq(ctx);
+
+ return 0;
+}
+
+/**
+ * grab_runnable_context - try to find a runnable context
+ *
+ * Remove the highest priority context on the runqueue and return it
+ * to the caller. Returns %NULL if no runnable context was found.
+ */
+static struct spu_context *grab_runnable_context(int prio, int node)
+{
+ struct spu_context *ctx;
+ int best;
+
+ spin_lock(&spu_prio->runq_lock);
+ best = find_first_bit(spu_prio->bitmap, prio);
+ while (best < prio) {
+ struct list_head *rq = &spu_prio->runq[best];
+
+ list_for_each_entry(ctx, rq, rq) {
+ /* XXX(hch): check for affinity here as well */
+ if (__node_allowed(ctx, node)) {
+ __spu_del_from_rq(ctx);
+ goto found;
+ }
}
+ best++;
+ }
+ ctx = NULL;
+ found:
+ spin_unlock(&spu_prio->runq_lock);
+ return ctx;
+}
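
The runqueue is indexed by a MAX_PRIO-wide bitmap, so the scan starts at the numerically smallest (i.e. highest-priority) level with waiters and only moves on when no context at that level may run on the target node. A minimal sketch of the bitmap idiom:

    DECLARE_BITMAP(bitmap, MAX_PRIO);
    int best;

    bitmap_zero(bitmap, MAX_PRIO);
    set_bit(120, bitmap);                    /* nice 0 context queued */
    set_bit(101, bitmap);                    /* higher-prio context queued */
    best = find_first_bit(bitmap, MAX_PRIO); /* -> 101, runs first */
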
- spu_add_to_rq(ctx);
- if (!(flags & SPU_ACTIVATE_NOWAKE))
- spu_prio_wait(ctx);
- spu_del_from_rq(ctx);
- } while (!signal_pending(current));
+static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
+{
+ struct spu *spu = ctx->spu;
+ struct spu_context *new = NULL;
+
+ if (spu) {
+ new = grab_runnable_context(max_prio, spu->node);
+ if (new || force) {
+ spu_unschedule(spu, ctx, new == NULL);
+ if (new) {
+ if (new->flags & SPU_CREATE_NOSCHED)
+ wake_up(&new->stop_wq);
+ else {
+ spu_release(ctx);
+ spu_schedule(spu, new);
+ /* this one can't easily be made
+ * interruptible */
+ mutex_lock(&ctx->state_mutex);
+ }
+ }
+ }
+ }
- return -ERESTARTSYS;
+ return new != NULL;
}
/**
@@ -431,16 +894,12 @@ int spu_activate(struct spu_context *ctx, unsigned long flags)
*/
void spu_deactivate(struct spu_context *ctx)
{
- struct spu *spu = ctx->spu;
-
- if (spu) {
- spu_unbind_context(spu, ctx);
- spu_reschedule(spu);
- }
+ spu_context_nospu_trace(spu_deactivate__enter, ctx);
+ __spu_deactivate(ctx, 1, MAX_PRIO);
}
/**
- * spu_yield - yield a physical spu if others are waiting
+ * spu_yield - yield a physical spu if others are waiting
* @ctx: spu context to yield
*
* Check if there is a higher priority context waiting and if yes
@@ -449,67 +908,267 @@ void spu_deactivate(struct spu_context *ctx)
*/
void spu_yield(struct spu_context *ctx)
{
+ spu_context_nospu_trace(spu_yield__enter, ctx);
+ if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
+ mutex_lock(&ctx->state_mutex);
+ __spu_deactivate(ctx, 0, MAX_PRIO);
+ mutex_unlock(&ctx->state_mutex);
+ }
+}
+
+static noinline void spusched_tick(struct spu_context *ctx)
+{
+ struct spu_context *new = NULL;
+ struct spu *spu = NULL;
+
+ if (spu_acquire(ctx))
+ BUG(); /* a kernel thread never has signals pending */
+
+ if (ctx->state != SPU_STATE_RUNNABLE)
+ goto out;
+ if (ctx->flags & SPU_CREATE_NOSCHED)
+ goto out;
+ if (ctx->policy == SCHED_FIFO)
+ goto out;
+
+ if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+ goto out;
+
+ spu = ctx->spu;
+
+ spu_context_trace(spusched_tick__preempt, ctx, spu);
+
+ new = grab_runnable_context(ctx->prio + 1, spu->node);
+ if (new) {
+ spu_unschedule(spu, ctx, 0);
+ if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+ spu_add_to_rq(ctx);
+ } else {
+ spu_context_nospu_trace(spusched_tick__newslice, ctx);
+ if (!ctx->time_slice)
+ ctx->time_slice++;
+ }
+out:
+ spu_release(ctx);
+
+ if (new)
+ spu_schedule(spu, new);
+}
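Note the ctx->prio + 1 argument above: grab_runnable_context() scans only priorities strictly below the bound it is given, so at the end of a time slice a context may be displaced by a waiter of equal or higher priority, which is what makes round-robin within one priority level work. An illustration, assuming the usual convention that lower numeric values mean higher priority:

/* Illustration with assumed numbers: ctx->prio == 120.
 *
 *	grab_runnable_context(120, node)      matches prio < 120 only
 *	                                      (strictly higher priority)
 *	grab_runnable_context(121, node)      matches prio <= 120
 *	                                      (equal or higher priority)
 *	grab_runnable_context(MAX_PRIO, node) matches any waiter, as
 *	                                      used by __spu_deactivate()
 *	                                      via spu_deactivate() and
 *	                                      spu_yield()
 */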
+
+/**
+ * count_active_contexts - count the number of active contexts
+ *
+ * Return the number of contexts currently running or waiting to run.
+ *
+ * Note that we don't take runq_lock / list_mutex here. Reading
+ * a single 32-bit value is atomic on powerpc, and we don't care
+ * about memory ordering issues here.
+ */
+static unsigned long count_active_contexts(void)
+{
+ int nr_active = 0, node;
+
+ for (node = 0; node < MAX_NUMNODES; node++)
+ nr_active += cbe_spu_info[node].nr_active;
+ nr_active += spu_prio->nr_waiting;
+
+ return nr_active;
+}
+
+/**
+ * spu_calc_load - update the avenrun load estimates.
+ *
+ * No locking against reading these values from userspace, as for
+ * the CPU loadavg code.
+ */
+static void spu_calc_load(void)
+{
+ unsigned long active_tasks; /* fixed-point */
+
+ active_tasks = count_active_contexts() * FIXED_1;
+ CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
+}
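For readers unfamiliar with the fixed-point averaging, the following standalone user-space sketch reproduces one CALC_LOAD decay chain. The constants (FSHIFT = 11, FIXED_1 = 1 << 11, EXP_1 = 1884) match the <linux/sched.h> definitions of this era; the program is purely illustrative and not part of the patch.

#include <stdio.h>

#define FSHIFT	11		/* bits of fractional precision */
#define FIXED_1	(1 << FSHIFT)	/* 1.0 in fixed point */
#define EXP_1	1884		/* exp(-5s/1min) in fixed point */

int main(void)
{
	unsigned long avenrun = 0;		/* load starts at 0.00 */
	unsigned long active = 3 * FIXED_1;	/* three runnable contexts */
	int i;

	for (i = 0; i < 24; i++) {		/* two minutes of 5s ticks */
		/* one CALC_LOAD step: decay avenrun toward 'active' */
		avenrun = (avenrun * EXP_1 +
			   active * (FIXED_1 - EXP_1)) >> FSHIFT;
		printf("%lu.%02lu\n", avenrun >> FSHIFT,
		       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;				/* converges toward 3.00 */
}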
+
+static void spusched_wake(unsigned long data)
+{
+ mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
+ wake_up_process(spusched_task);
+}
+
+static void spuloadavg_wake(unsigned long data)
+{
+ mod_timer(&spuloadavg_timer, jiffies + LOAD_FREQ);
+ spu_calc_load();
+}
+
+static int spusched_thread(void *unused)
+{
struct spu *spu;
- int need_yield = 0;
-
- if (mutex_trylock(&ctx->state_mutex)) {
- if ((spu = ctx->spu) != NULL) {
- int best = sched_find_first_bit(spu_prio->bitmap);
- if (best < MAX_PRIO) {
- pr_debug("%s: yielding SPU %d NODE %d\n",
- __FUNCTION__, spu->number, spu->node);
- spu_deactivate(ctx);
- need_yield = 1;
+ int node;
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ for (node = 0; node < MAX_NUMNODES; node++) {
+ struct mutex *mtx = &cbe_spu_info[node].list_mutex;
+
+ mutex_lock(mtx);
+ list_for_each_entry(spu, &cbe_spu_info[node].spus,
+ cbe_list) {
+ struct spu_context *ctx = spu->ctx;
+
+ if (ctx) {
+ get_spu_context(ctx);
+ mutex_unlock(mtx);
+ spusched_tick(ctx);
+ mutex_lock(mtx);
+ put_spu_context(ctx);
+ }
}
+ mutex_unlock(mtx);
}
- mutex_unlock(&ctx->state_mutex);
}
- if (unlikely(need_yield))
- yield();
+
+ return 0;
}
-int __init spu_sched_init(void)
+void spuctx_switch_state(struct spu_context *ctx,
+ enum spu_utilization_state new_state)
{
- int i;
+ unsigned long long curtime;
+ signed long long delta;
+ struct timespec ts;
+ struct spu *spu;
+ enum spu_utilization_state old_state;
+ int node;
- spu_sched_wq = create_singlethread_workqueue("spusched");
- if (!spu_sched_wq)
- return 1;
+ ktime_get_ts(&ts);
+ curtime = timespec_to_ns(&ts);
+ delta = curtime - ctx->stats.tstamp;
- spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
- if (!spu_prio) {
- printk(KERN_WARNING "%s: Unable to allocate priority queue.\n",
- __FUNCTION__);
- destroy_workqueue(spu_sched_wq);
- return 1;
+ WARN_ON(!mutex_is_locked(&ctx->state_mutex));
+ WARN_ON(delta < 0);
+
+ spu = ctx->spu;
+ old_state = ctx->stats.util_state;
+ ctx->stats.util_state = new_state;
+ ctx->stats.tstamp = curtime;
+
+ /*
+ * Update the physical SPU utilization statistics.
+ */
+ if (spu) {
+ ctx->stats.times[old_state] += delta;
+ spu->stats.times[old_state] += delta;
+ spu->stats.util_state = new_state;
+ spu->stats.tstamp = curtime;
+ node = spu->node;
+ if (old_state == SPU_UTIL_USER)
+ atomic_dec(&cbe_spu_info[node].busy_spus);
+ if (new_state == SPU_UTIL_USER)
+ atomic_inc(&cbe_spu_info[node].busy_spus);
}
+}
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int show_spu_loadavg(struct seq_file *s, void *private)
+{
+ int a, b, c;
+
+ a = spu_avenrun[0] + (FIXED_1/200);
+ b = spu_avenrun[1] + (FIXED_1/200);
+ c = spu_avenrun[2] + (FIXED_1/200);
+
+ /*
+ * Note that last_pid doesn't really make much sense for the
+ * SPU loadavg (it even seems very odd on the CPU side...),
+ * but we include it here to have a 100% compatible interface.
+ */
+ seq_printf(s, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+ LOAD_INT(a), LOAD_FRAC(a),
+ LOAD_INT(b), LOAD_FRAC(b),
+ LOAD_INT(c), LOAD_FRAC(c),
+ count_active_contexts(),
+ atomic_read(&nr_spu_contexts),
+ task_active_pid_ns(current)->last_pid);
+ return 0;
+}
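The FIXED_1/200 bias added above rounds each average to the nearest hundredth before LOAD_INT/LOAD_FRAC split it. A worked example, assuming FIXED_1 = 2048:

/* 0.50 in fixed point is 1024; a value just below it, say 1023,
 * would truncate to "0.49" without the bias:
 *
 *	(1023 * 100) >> 11		    == 49
 *
 * Adding FIXED_1/200 == 10 first rounds to the nearest hundredth:
 *
 *	(((1023 + 10) & 2047) * 100) >> 11  == 50	-> "0.50"
 */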
+
+static int spu_loadavg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_spu_loadavg, NULL);
+}
+
+static const struct file_operations spu_loadavg_fops = {
+ .open = spu_loadavg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int __init spu_sched_init(void)
+{
+ struct proc_dir_entry *entry;
+ int err = -ENOMEM, i;
+
+ spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
+ if (!spu_prio)
+ goto out;
+
for (i = 0; i < MAX_PRIO; i++) {
INIT_LIST_HEAD(&spu_prio->runq[i]);
__clear_bit(i, spu_prio->bitmap);
}
- __set_bit(MAX_PRIO, spu_prio->bitmap);
- for (i = 0; i < MAX_NUMNODES; i++) {
- mutex_init(&spu_prio->active_mutex[i]);
- INIT_LIST_HEAD(&spu_prio->active_list[i]);
- }
spin_lock_init(&spu_prio->runq_lock);
+
+ setup_timer(&spusched_timer, spusched_wake, 0);
+ setup_timer(&spuloadavg_timer, spuloadavg_wake, 0);
+
+ spusched_task = kthread_run(spusched_thread, NULL, "spusched");
+ if (IS_ERR(spusched_task)) {
+ err = PTR_ERR(spusched_task);
+ goto out_free_spu_prio;
+ }
+
+ mod_timer(&spuloadavg_timer, 0);
+
+ entry = proc_create("spu_loadavg", 0, NULL, &spu_loadavg_fops);
+ if (!entry)
+ goto out_stop_kthread;
+
+ pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
+ SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
return 0;
+
+ out_stop_kthread:
+ kthread_stop(spusched_task);
+ out_free_spu_prio:
+ kfree(spu_prio);
+ out:
+ return err;
}
-void __exit spu_sched_exit(void)
+void spu_sched_exit(void)
{
- struct spu *spu, *tmp;
+ struct spu *spu;
int node;
+ remove_proc_entry("spu_loadavg", NULL);
+
+ del_timer_sync(&spusched_timer);
+ del_timer_sync(&spuloadavg_timer);
+ kthread_stop(spusched_task);
+
for (node = 0; node < MAX_NUMNODES; node++) {
- mutex_lock(&spu_prio->active_mutex[node]);
- list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node],
- list) {
- list_del_init(&spu->list);
- spu_free(spu);
- }
- mutex_unlock(&spu_prio->active_mutex[node]);
+ mutex_lock(&cbe_spu_info[node].list_mutex);
+ list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
+ if (spu->alloc_state != SPU_FREE)
+ spu->alloc_state = SPU_FREE;
+ mutex_unlock(&cbe_spu_info[node].list_mutex);
}
kfree(spu_prio);
- destroy_workqueue(spu_sched_wq);
}
diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore.c b/arch/powerpc/platforms/cell/spufs/spu_restore.c
index 0bf723dcd67..72c905f1ee7 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore.c
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore.c
@@ -84,13 +84,13 @@ static inline void restore_decr(void)
unsigned int decr_running;
unsigned int decr;
- /* Restore, Step 6:
+ /* Restore, Step 6 (moved):
* If the LSCSA "decrementer running" flag is set
* then write the SPU_WrDec channel with the
* decrementer value from LSCSA.
*/
offset = LSCSA_QW_OFFSET(decr_status);
- decr_running = regs_spill[offset].slot[0];
+ decr_running = regs_spill[offset].slot[0] & SPU_DECR_STATUS_RUNNING;
if (decr_running) {
offset = LSCSA_QW_OFFSET(decr);
decr = regs_spill[offset].slot[0];
@@ -284,7 +284,7 @@ static inline void restore_complete(void)
exit_instrs[3] = BR_INSTR;
break;
default:
- /* SPU_Status[R]=1. No additonal instructions. */
+ /* SPU_Status[R]=1. No additional instructions. */
break;
}
spu_sync();
@@ -296,7 +296,7 @@ static inline void restore_complete(void)
* This code deviates from the documented sequence in the
* following aspects:
*
- * 1. The EA for LSCSA is passed from PPE in the
+ * 1. The EA for LSCSA is passed from PPE in the
* signal notification channels.
* 2. The register spill area is pulled by SPU
* into LS, rather than pushed by PPE.
@@ -318,10 +318,10 @@ int main()
build_dma_list(lscsa_ea); /* Step 3. */
restore_upper_240kb(lscsa_ea); /* Step 4. */
/* Step 5: done by 'exit'. */
- restore_decr(); /* Step 6. */
enqueue_putllc(lscsa_ea); /* Step 7. */
set_tag_update(); /* Step 8. */
read_tag_status(); /* Step 9. */
+ restore_decr(); /* moved Step 6. */
read_llar_status(); /* Step 10. */
write_ppu_mb(); /* Step 11. */
write_ppuint_mb(); /* Step 12. */
diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
index 15183d209b5..f383b027e8b 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
@@ -10,7 +10,7 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = {
0x24fd8081,
0x1cd80081,
0x33001180,
-0x42030003,
+0x42034003,
0x33800284,
0x1c010204,
0x40200000,
@@ -24,22 +24,22 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = {
0x23fffd84,
0x1c100183,
0x217ffa85,
-0x3080a000,
-0x3080a201,
-0x3080a402,
-0x3080a603,
-0x3080a804,
-0x3080aa05,
-0x3080ac06,
-0x3080ae07,
-0x3080b008,
-0x3080b209,
-0x3080b40a,
-0x3080b60b,
-0x3080b80c,
-0x3080ba0d,
-0x3080bc0e,
-0x3080be0f,
+0x3080b000,
+0x3080b201,
+0x3080b402,
+0x3080b603,
+0x3080b804,
+0x3080ba05,
+0x3080bc06,
+0x3080be07,
+0x3080c008,
+0x3080c209,
+0x3080c40a,
+0x3080c60b,
+0x3080c80c,
+0x3080ca0d,
+0x3080cc0e,
+0x3080ce0f,
0x00003ffc,
0x00000000,
0x00000000,
@@ -48,19 +48,18 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = {
0x3ec00083,
0xb0a14103,
0x01a00204,
-0x3ec10082,
-0x4202800e,
-0x04000703,
-0xb0a14202,
-0x21a00803,
-0x3fbf028d,
-0x3f20068d,
-0x3fbe0682,
+0x3ec10083,
+0x4202c002,
+0xb0a14203,
+0x21a00802,
+0x3fbf028a,
+0x3f20050a,
+0x3fbe0502,
0x3fe30102,
0x21a00882,
-0x3f82028f,
-0x3fe3078f,
-0x3fbf0784,
+0x3f82028b,
+0x3fe3058b,
+0x3fbf0584,
0x3f200204,
0x3fbe0204,
0x3fe30204,
@@ -75,252 +74,285 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = {
0x21a00083,
0x40800082,
0x21a00b02,
-0x10002818,
-0x42a00002,
-0x32800007,
-0x4207000c,
-0x18008208,
-0x40a0000b,
-0x4080020a,
-0x40800709,
-0x00200000,
-0x42070002,
-0x3ac30384,
+0x10002612,
+0x42a00003,
+0x42074006,
+0x1800c204,
+0x40a00008,
+0x40800789,
+0x1c010305,
+0x34000302,
0x1cffc489,
-0x00200000,
-0x18008383,
-0x38830382,
-0x4cffc486,
-0x3ac28185,
-0xb0408584,
-0x28830382,
-0x1c020387,
-0x38828182,
-0xb0408405,
-0x1802c408,
-0x28828182,
-0x217ff886,
-0x04000583,
-0x21a00803,
-0x3fbe0682,
-0x3fe30102,
-0x04000106,
-0x21a00886,
-0x04000603,
-0x21a00903,
-0x40803c02,
-0x21a00982,
-0x40800003,
-0x04000184,
-0x21a00a04,
+0x3ec00303,
+0x3ec00287,
+0xb0408403,
+0x24000302,
+0x34000282,
+0x1c020306,
+0xb0408207,
+0x18020204,
+0x24000282,
+0x217ffa09,
+0x04000402,
+0x21a00802,
+0x3fbe0504,
+0x3fe30204,
+0x21a00884,
+0x42074002,
+0x21a00902,
+0x40803c03,
+0x21a00983,
+0x04000485,
+0x21a00a05,
0x40802202,
0x21a00a82,
-0x42028005,
-0x34208702,
-0x21002282,
-0x21a00804,
-0x21a00886,
-0x3fbf0782,
+0x21a00805,
+0x21a00884,
+0x3fbf0582,
0x3f200102,
0x3fbe0102,
0x3fe30102,
0x21a00902,
0x40804003,
0x21a00983,
-0x21a00a04,
+0x21a00a05,
0x40805a02,
0x21a00a82,
0x40800083,
0x21a00b83,
0x01a00c02,
-0x01a00d83,
-0x3420c282,
+0x30809c03,
+0x34000182,
+0x14004102,
+0x21002082,
+0x01a00d82,
+0x3080a003,
+0x34000182,
0x21a00e02,
-0x34210283,
-0x21a00f03,
-0x34200284,
-0x77400200,
-0x3421c282,
+0x3080a203,
+0x34000182,
+0x21a00f02,
+0x3080a403,
+0x34000182,
+0x77400100,
+0x3080a603,
+0x34000182,
0x21a00702,
-0x34218283,
-0x21a00083,
-0x34214282,
+0x3080a803,
+0x34000182,
+0x21a00082,
+0x3080aa03,
+0x34000182,
0x21a00b02,
-0x4200480c,
-0x00200000,
-0x1c010286,
-0x34220284,
-0x34220302,
-0x0f608203,
-0x5c024204,
-0x3b81810b,
-0x42013c02,
-0x00200000,
-0x18008185,
-0x38808183,
-0x3b814182,
-0x21004e84,
+0x4020007f,
+0x3080ae02,
+0x42004805,
+0x3080ac04,
+0x34000103,
+0x34000202,
+0x1cffc183,
+0x3b810106,
+0x0f608184,
+0x42013802,
+0x5c020183,
+0x38810102,
+0x3b810102,
+0x21000e83,
0x4020007f,
0x35000100,
-0x000004e0,
-0x000002a0,
-0x000002e8,
-0x00000428,
+0x00000470,
+0x000002f8,
+0x00000430,
0x00000360,
-0x000002e8,
-0x000004a0,
-0x00000468,
+0x000002f8,
0x000003c8,
+0x000004a8,
+0x00000298,
0x00000360,
+0x00200000,
0x409ffe02,
0x30801203,
-0x40800204,
-0x3ec40085,
-0x10009c09,
-0x3ac10606,
-0xb060c105,
-0x4020007f,
-0x4020007f,
+0x40800208,
+0x3ec40084,
+0x40800407,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
0x20801203,
-0x38810602,
-0xb0408586,
-0x28810602,
-0x32004180,
-0x34204702,
+0x38820282,
+0x41004003,
+0xb0408189,
+0x28820282,
+0x3881c282,
+0xb0408304,
+0x2881c282,
+0x00400000,
+0x40800003,
+0x35000000,
+0x30809e03,
+0x34000182,
0x21a00382,
0x4020007f,
-0x327fdc80,
+0x327fde00,
0x409ffe02,
0x30801203,
-0x40800204,
-0x3ec40087,
-0x40800405,
-0x00200000,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800206,
+0x3ec40084,
+0x40800407,
+0x40800608,
+0x3ac1828a,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
0x20801203,
+0x38818282,
0x41004003,
-0x38810602,
-0x4020007f,
-0xb0408188,
-0x4020007f,
-0x28810602,
-0x41201002,
-0x38814603,
-0x10009c09,
-0xb060c109,
-0x4020007f,
-0x28814603,
+0xb040818a,
+0x10005b0b,
+0x41201003,
+0x28818282,
+0x3881c282,
+0xb0408184,
0x41193f83,
-0x38818602,
0x60ffc003,
-0xb040818a,
-0x28818602,
-0x32003080,
+0x2881c282,
+0x38820282,
+0xb0408189,
+0x28820282,
+0x327fef80,
0x409ffe02,
0x30801203,
-0x40800204,
-0x3ec40087,
-0x41201008,
-0x10009c14,
-0x40800405,
-0x3ac10609,
-0x40800606,
-0x3ac1460a,
-0xb060c107,
-0x3ac1860b,
+0x40800207,
+0x3ec40086,
+0x4120100b,
+0x10005b14,
+0x40800404,
+0x3ac1c289,
+0x40800608,
+0xb060c106,
+0x3ac10286,
+0x3ac2028a,
0x20801203,
-0x38810602,
-0xb0408409,
-0x28810602,
-0x38814603,
-0xb060c40a,
-0x4020007f,
-0x28814603,
+0x3881c282,
0x41193f83,
-0x38818602,
0x60ffc003,
-0xb040818b,
-0x28818602,
-0x32002380,
-0x409ffe02,
-0x30801204,
-0x40800205,
-0x3ec40083,
-0x40800406,
-0x3ac14607,
-0x3ac18608,
-0xb0810103,
-0x41004002,
-0x20801204,
-0x4020007f,
-0x38814603,
-0x10009c0b,
-0xb060c107,
-0x4020007f,
-0x4020007f,
-0x28814603,
-0x38818602,
-0x4020007f,
+0xb0408589,
+0x2881c282,
+0x38810282,
+0xb0408586,
+0x28810282,
+0x38820282,
+0xb040818a,
+0x28820282,
0x4020007f,
-0xb0408588,
-0x28818602,
+0x327fe280,
+0x409ffe02,
+0x30801203,
+0x40800207,
+0x3ec40084,
+0x40800408,
+0x10005b14,
+0x40800609,
+0x3ac1c28a,
+0x3ac2028b,
+0xb060c104,
+0x3ac24284,
+0x20801203,
+0x41201003,
+0x3881c282,
+0xb040830a,
+0x2881c282,
+0x38820282,
+0xb040818b,
+0x41193f83,
+0x60ffc003,
+0x28820282,
+0x38824282,
+0xb0408184,
+0x28824282,
0x4020007f,
-0x32001780,
+0x327fd580,
0x409ffe02,
-0x1000640e,
-0x40800204,
+0x1000658e,
+0x40800206,
0x30801203,
-0x40800405,
-0x3ec40087,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800407,
+0x3ec40084,
+0x40800608,
+0x3ac1828a,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
0x20801203,
0x413d8003,
-0x38810602,
+0x38818282,
0x4020007f,
-0x327fd780,
-0x409ffe02,
-0x10007f0c,
-0x40800205,
-0x30801204,
-0x40800406,
-0x3ec40083,
-0x3ac14607,
-0x3ac18608,
-0xb0810103,
-0x413d8002,
-0x20801204,
-0x38814603,
+0x327fd800,
+0x409ffe03,
+0x30801202,
+0x40800207,
+0x3ec40084,
+0x10005b09,
+0x3ac1c288,
+0xb0408184,
0x4020007f,
-0x327feb80,
+0x4020007f,
+0x20801202,
+0x3881c282,
+0xb0408308,
+0x2881c282,
+0x327fc680,
0x409ffe02,
+0x1000588b,
+0x40800208,
0x30801203,
-0x40800204,
-0x3ec40087,
-0x40800405,
-0x1000650a,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800407,
+0x3ec40084,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
0x20801203,
-0x38810602,
-0xb0408588,
-0x4020007f,
-0x327fc980,
-0x00400000,
-0x40800003,
-0x4020007f,
-0x35000000,
+0x413d8003,
+0x38820282,
+0x327fbd80,
+0x00200000,
+0x00000da0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000d90,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000db0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000dc0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000d80,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000df0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000de0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000dd0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000e04,
+0x00000000,
+0x00000000,
0x00000000,
+0x00000e00,
0x00000000,
0x00000000,
0x00000000,
diff --git a/arch/powerpc/platforms/cell/spufs/spu_save.c b/arch/powerpc/platforms/cell/spufs/spu_save.c
index 196033b8a57..ae95cc1701e 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_save.c
+++ b/arch/powerpc/platforms/cell/spufs/spu_save.c
@@ -44,7 +44,7 @@ static inline void save_event_mask(void)
* Read the SPU_RdEventMsk channel and save to the LSCSA.
*/
offset = LSCSA_QW_OFFSET(event_mask);
- regs_spill[offset].slot[0] = spu_readch(SPU_RdEventStatMask);
+ regs_spill[offset].slot[0] = spu_readch(SPU_RdEventMask);
}
static inline void save_tag_mask(void)
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 0c437891dfd..bcfd6f063ef 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -26,11 +26,18 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
+#include <linux/cpumask.h>
#include <asm/spu.h>
#include <asm/spu_csa.h>
#include <asm/spu_info.h>
+#define SPUFS_PS_MAP_SIZE 0x20000
+#define SPUFS_MFC_MAP_SIZE 0x1000
+#define SPUFS_CNTL_MAP_SIZE 0x1000
+#define SPUFS_SIGNAL_MAP_SIZE PAGE_SIZE
+#define SPUFS_MSS_MAP_SIZE 0x1000
+
/* The magic number for our file system */
enum {
SPUFS_MAGIC = 0x23c9b64e,
@@ -41,7 +48,32 @@ struct spu_gang;
/* ctx->sched_flags */
enum {
- SPU_SCHED_WAKE = 0,
+ SPU_SCHED_NOTIFY_ACTIVE,
+ SPU_SCHED_WAS_ACTIVE, /* was active upon spu_acquire_saved() */
+ SPU_SCHED_SPU_RUN, /* context is within spu_run */
+};
+
+enum {
+ SWITCH_LOG_BUFSIZE = 4096,
+};
+
+enum {
+ SWITCH_LOG_START,
+ SWITCH_LOG_STOP,
+ SWITCH_LOG_EXIT,
+};
+
+struct switch_log {
+ wait_queue_head_t wait;
+ unsigned long head;
+ unsigned long tail;
+ struct switch_log_entry {
+ struct timespec tstamp;
+ s32 spu_id;
+ u32 type;
+ u32 val;
+ u64 timebase;
+ } log[];
};
struct spu_context {
@@ -50,16 +82,17 @@ struct spu_context {
spinlock_t mmio_lock; /* protects mmio access */
struct address_space *local_store; /* local store mapping. */
struct address_space *mfc; /* 'mfc' area mappings. */
- struct address_space *cntl; /* 'control' area mappings. */
- struct address_space *signal1; /* 'signal1' area mappings. */
- struct address_space *signal2; /* 'signal2' area mappings. */
- struct address_space *mss; /* 'mss' area mappings. */
- struct address_space *psmap; /* 'psmap' area mappings. */
+ struct address_space *cntl; /* 'control' area mappings. */
+ struct address_space *signal1; /* 'signal1' area mappings. */
+ struct address_space *signal2; /* 'signal2' area mappings. */
+ struct address_space *mss; /* 'mss' area mappings. */
+ struct address_space *psmap; /* 'psmap' area mappings. */
+ struct mutex mapping_lock;
u64 object_id; /* user space pointer for oprofile */
enum { SPU_STATE_RUNNABLE, SPU_STATE_SAVED } state;
struct mutex state_mutex;
- struct semaphore run_sema;
+ struct mutex run_mutex;
struct mm_struct *owner;
@@ -68,6 +101,7 @@ struct spu_context {
wait_queue_head_t wbox_wq;
wait_queue_head_t stop_wq;
wait_queue_head_t mfc_wq;
+ wait_queue_head_t run_wq;
struct fasync_struct *ibox_fasync;
struct fasync_struct *wbox_fasync;
struct fasync_struct *mfc_fasync;
@@ -79,14 +113,45 @@ struct spu_context {
struct list_head gang_list;
struct spu_gang *gang;
+ struct kref *prof_priv_kref;
+ void (*prof_priv_release) (struct kref *kref);
+
+ /* owner thread */
+ pid_t tid;
/* scheduler fields */
- struct list_head rq;
- struct delayed_work sched_work;
+ struct list_head rq;
+ unsigned int time_slice;
unsigned long sched_flags;
- unsigned long rt_priority;
+ cpumask_t cpus_allowed;
int policy;
int prio;
+ int last_ran;
+
+ /* statistics */
+ struct {
+ /* updates protected by ctx->state_mutex */
+ enum spu_utilization_state util_state;
+ unsigned long long tstamp; /* time of last state switch */
+ unsigned long long times[SPU_UTIL_MAX];
+ unsigned long long vol_ctx_switch;
+ unsigned long long invol_ctx_switch;
+ unsigned long long min_flt;
+ unsigned long long maj_flt;
+ unsigned long long hash_flt;
+ unsigned long long slb_flt;
+ unsigned long long slb_flt_base; /* # at last ctx switch */
+ unsigned long long class2_intr;
+ unsigned long long class2_intr_base; /* # at last ctx switch */
+ unsigned long long libassist;
+ } stats;
+
+ /* context switch log */
+ struct switch_log *switch_log;
+
+ struct list_head aff_list;
+ int aff_head;
+ int aff_offset;
};
struct spu_gang {
@@ -94,8 +159,19 @@ struct spu_gang {
struct mutex mutex;
struct kref kref;
int contexts;
+
+ struct spu_context *aff_ref_ctx;
+ struct list_head aff_list_head;
+ struct mutex aff_mutex;
+ int aff_flags;
+ struct spu *aff_ref_spu;
+ atomic_t aff_sched_count;
};
+/* Flag bits for spu_gang aff_flags */
+#define AFF_OFFSETS_SET 1
+#define AFF_MERGED 2
+
struct mfc_dma_command {
int32_t pad; /* reserved */
uint32_t lsa; /* local storage address */
@@ -127,8 +203,10 @@ struct spu_context_ops {
void (*npc_write) (struct spu_context * ctx, u32 data);
u32(*status_read) (struct spu_context * ctx);
char*(*get_ls) (struct spu_context * ctx);
+ void (*privcntl_write) (struct spu_context *ctx, u64 data);
u32 (*runcntl_read) (struct spu_context * ctx);
void (*runcntl_write) (struct spu_context * ctx, u32 data);
+ void (*runcntl_stop) (struct spu_context * ctx);
void (*master_start) (struct spu_context * ctx);
void (*master_stop) (struct spu_context * ctx);
int (*set_mfc_query)(struct spu_context * ctx, u32 mask, u32 mode);
@@ -140,6 +218,7 @@ struct spu_context_ops {
struct spu_dma_info * info);
void (*proxydma_info_read) (struct spu_context * ctx,
struct spu_proxydma_info * info);
+ void (*restart_dma)(struct spu_context *ctx);
};
extern struct spu_context_ops spu_hw_ops;
@@ -149,18 +228,32 @@ struct spufs_inode_info {
struct spu_context *i_ctx;
struct spu_gang *i_gang;
struct inode vfs_inode;
+ int i_openers;
};
#define SPUFS_I(inode) \
container_of(inode, struct spufs_inode_info, vfs_inode)
-extern struct tree_descr spufs_dir_contents[];
-extern struct tree_descr spufs_dir_nosched_contents[];
+struct spufs_tree_descr {
+ const char *name;
+ const struct file_operations *ops;
+ umode_t mode;
+ size_t size;
+};
+
+extern const struct spufs_tree_descr spufs_dir_contents[];
+extern const struct spufs_tree_descr spufs_dir_nosched_contents[];
+extern const struct spufs_tree_descr spufs_dir_debug_contents[];
/* system call implementation */
-long spufs_run_spu(struct file *file,
- struct spu_context *ctx, u32 *npc, u32 *status);
-long spufs_create(struct nameidata *nd,
- unsigned int flags, mode_t mode);
+extern struct spufs_calls spufs_calls;
+struct coredump_params;
+long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status);
+long spufs_create(struct path *nd, struct dentry *dentry, unsigned int flags,
+ umode_t mode, struct file *filp);
+/* ELF coredump callbacks for writing SPU ELF notes */
+extern int spufs_coredump_extra_notes_size(void);
+extern int spufs_coredump_extra_notes_write(struct coredump_params *cprm);
+
extern const struct file_operations spufs_context_fops;
/* gang management */
@@ -170,10 +263,18 @@ int put_spu_gang(struct spu_gang *gang);
void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx);
void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
+/* fault handling */
+int spufs_handle_class1(struct spu_context *ctx);
+int spufs_handle_class0(struct spu_context *ctx);
+
+/* affinity */
+struct spu *affinity_check(struct spu_context *ctx);
+
/* context management */
-static inline void spu_acquire(struct spu_context *ctx)
+extern atomic_t nr_spu_contexts;
+static inline int __must_check spu_acquire(struct spu_context *ctx)
{
- mutex_lock(&ctx->state_mutex);
+ return mutex_lock_interruptible(&ctx->state_mutex);
}
static inline void spu_release(struct spu_context *ctx)
@@ -188,28 +289,33 @@ int put_spu_context(struct spu_context *ctx);
void spu_unmap_mappings(struct spu_context *ctx);
void spu_forget(struct spu_context *ctx);
-int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags);
-void spu_acquire_saved(struct spu_context *ctx);
-int spu_acquire_exclusive(struct spu_context *ctx);
-enum {
- SPU_ACTIVATE_NOWAKE = 1,
-};
+int __must_check spu_acquire_saved(struct spu_context *ctx);
+void spu_release_saved(struct spu_context *ctx);
+
+int spu_stopped(struct spu_context *ctx, u32 * stat);
+void spu_del_from_rq(struct spu_context *ctx);
int spu_activate(struct spu_context *ctx, unsigned long flags);
void spu_deactivate(struct spu_context *ctx);
void spu_yield(struct spu_context *ctx);
-void spu_start_tick(struct spu_context *ctx);
-void spu_stop_tick(struct spu_context *ctx);
-void spu_sched_tick(struct work_struct *work);
+void spu_switch_notify(struct spu *spu, struct spu_context *ctx);
+void spu_switch_log_notify(struct spu *spu, struct spu_context *ctx,
+ u32 type, u32 val);
+void spu_set_timeslice(struct spu_context *ctx);
+void spu_update_sched_info(struct spu_context *ctx);
+void __spu_update_sched_info(struct spu_context *ctx);
int __init spu_sched_init(void);
-void __exit spu_sched_exit(void);
+void spu_sched_exit(void);
extern char *isolated_loader;
/*
* spufs_wait
- * Same as wait_event_interruptible(), except that here
+ * Same as wait_event_interruptible(), except that here
* we need to call spu_release(ctx) before sleeping, and
* then spu_acquire(ctx) when awoken.
+ *
+ * Returns with state_mutex re-acquired when successful or
+ * with -ERESTARTSYS and the state_mutex dropped when interrupted.
*/
#define spufs_wait(wq, condition) \
@@ -220,14 +326,15 @@ extern char *isolated_loader;
prepare_to_wait(&(wq), &__wait, TASK_INTERRUPTIBLE); \
if (condition) \
break; \
- if (!signal_pending(current)) { \
- spu_release(ctx); \
- schedule(); \
- spu_acquire(ctx); \
- continue; \
+ spu_release(ctx); \
+ if (signal_pending(current)) { \
+ __ret = -ERESTARTSYS; \
+ break; \
} \
- __ret = -ERESTARTSYS; \
- break; \
+ schedule(); \
+ __ret = spu_acquire(ctx); \
+ if (__ret) \
+ break; \
} \
finish_wait(&(wq), &__wait); \
__ret; \
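A hypothetical usage sketch, mirroring how spu_run-style code would invoke the macro (example_wait_for_stop is not part of this patch); spu_stopped() is the declaration from spufs.h above:

/* Hypothetical usage sketch, not part of this patch: */
static int example_wait_for_stop(struct spu_context *ctx, u32 *stat)
{
	/* state_mutex held on entry; held again when this returns 0,
	 * dropped when it returns -ERESTARTSYS. */
	return spufs_wait(ctx->stop_wq, spu_stopped(ctx, stat));
}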
@@ -239,7 +346,7 @@ size_t spu_ibox_read(struct spu_context *ctx, u32 *data);
/* irq callback funcs. */
void spufs_ibox_callback(struct spu *spu);
void spufs_wbox_callback(struct spu *spu);
-void spufs_stop_callback(struct spu *spu);
+void spufs_stop_callback(struct spu *spu, int irq);
void spufs_mfc_callback(struct spu *spu);
void spufs_dma_callback(struct spu *spu, int type);
@@ -248,10 +355,22 @@ struct spufs_coredump_reader {
char *name;
ssize_t (*read)(struct spu_context *ctx,
char __user *buffer, size_t size, loff_t *pos);
- u64 (*get)(void *data);
+ u64 (*get)(struct spu_context *ctx);
size_t size;
};
-extern struct spufs_coredump_reader spufs_coredump_read[];
+extern const struct spufs_coredump_reader spufs_coredump_read[];
extern int spufs_coredump_num_notes;
+extern int spu_init_csa(struct spu_state *csa);
+extern void spu_fini_csa(struct spu_state *csa);
+extern int spu_save(struct spu_state *prev, struct spu *spu);
+extern int spu_restore(struct spu_state *new, struct spu *spu);
+extern int spu_switch(struct spu_state *prev, struct spu_state *new,
+ struct spu *spu);
+extern int spu_alloc_lscsa(struct spu_state *csa);
+extern void spu_free_lscsa(struct spu_state *csa);
+
+extern void spuctx_switch_state(struct spu_context *ctx,
+ enum spu_utilization_state new_state);
+
#endif
diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.h b/arch/powerpc/platforms/cell/spufs/sputrace.h
new file mode 100644
index 00000000000..db2656aa410
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/sputrace.h
@@ -0,0 +1,39 @@
+#if !defined(_TRACE_SPUFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SPUFS_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM spufs
+
+TRACE_EVENT(spufs_context,
+ TP_PROTO(struct spu_context *ctx, struct spu *spu, const char *name),
+ TP_ARGS(ctx, spu, name),
+
+ TP_STRUCT__entry(
+ __field(const char *, name)
+ __field(int, owner_tid)
+ __field(int, number)
+ ),
+
+ TP_fast_assign(
+ __entry->name = name;
+ __entry->owner_tid = ctx->tid;
+ __entry->number = spu ? spu->number : -1;
+ ),
+
+ TP_printk("%s (ctxthread = %d, spu = %d)",
+ __entry->name, __entry->owner_tid, __entry->number)
+);
+
+#define spu_context_trace(name, ctx, spu) \
+ trace_spufs_context(ctx, spu, __stringify(name))
+#define spu_context_nospu_trace(name, ctx) \
+ trace_spufs_context(ctx, NULL, __stringify(name))
+
+#endif /* _TRACE_SPUFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE sputrace
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index c08981ff7fc..dde35551e74 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -32,14 +32,14 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/errno.h>
+#include <linux/hardirq.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
@@ -49,6 +49,8 @@
#include <asm/spu_csa.h>
#include <asm/mmu_context.h>
+#include "spufs.h"
+
#include "spu_save_dump.h"
#include "spu_restore_dump.h"
@@ -71,7 +73,7 @@
}
#endif /* debug */
-#define POLL_WHILE_FALSE(_c) POLL_WHILE_TRUE(!(_c))
+#define POLL_WHILE_FALSE(_c) POLL_WHILE_TRUE(!(_c))
static inline void acquire_spu_lock(struct spu *spu)
{
@@ -116,6 +118,8 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu)
* Write INT_MASK_class1 with value of 0.
* Save INT_Mask_class2 in CSA.
* Write INT_MASK_class2 with value of 0.
+ * Synchronize all three interrupts to be sure
+ * we no longer execute a handler on another CPU.
*/
spin_lock_irq(&spu->register_lock);
if (csa) {
@@ -128,6 +132,17 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu)
spu_int_mask_set(spu, 2, 0ul);
eieio();
spin_unlock_irq(&spu->register_lock);
+
+ /*
+ * This flag needs to be set before calling synchronize_irq so
+ * that the update will be visible to the relevant handlers
+ * via a simple load.
+ */
+ set_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags);
+ clear_bit(SPU_CONTEXT_FAULT_PENDING, &spu->flags);
+ synchronize_irq(spu->irqs[0]);
+ synchronize_irq(spu->irqs[1]);
+ synchronize_irq(spu->irqs[2]);
}
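To illustrate the ordering the comment relies on, here is an illustrative reconstruction of the handler side (the real handlers live in spu_base.c; this sketch is not the patch's code). Because the bit is set before synchronize_irq() is called, a plain test_bit() in the handler suffices once those calls return:

/* Illustrative handler-side sketch, not the patch's code: */
static irqreturn_t example_class1_handler(int irq, void *data)
{
	struct spu *spu = data;

	/* disable_interrupts() sets the bit and then runs
	 * synchronize_irq(); once those calls return, no handler
	 * can still observe the old value. */
	if (test_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags))
		return IRQ_HANDLED;	/* defer to the switch code */

	/* normal class 1 (translation) fault handling would follow */
	return IRQ_HANDLED;
}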
static inline void set_watchdog_timer(struct spu_state *csa, struct spu *spu)
@@ -159,9 +174,8 @@ static inline void set_switch_pending(struct spu_state *csa, struct spu *spu)
/* Save, Step 7:
* Restore, Step 5:
* Set a software context switch pending flag.
+ * Done above in Step 3 - disable_interrupts().
*/
- set_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags);
- mb();
}
static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
@@ -179,22 +193,21 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
MFC_CNTL_SUSPEND_COMPLETE);
/* fall through */
case MFC_CNTL_SUSPEND_COMPLETE:
- if (csa) {
+ if (csa)
csa->priv2.mfc_control_RW =
in_be64(&priv2->mfc_control_RW) |
MFC_CNTL_SUSPEND_DMA_QUEUE;
- }
break;
case MFC_CNTL_NORMAL_DMA_QUEUE_OPERATION:
out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE);
POLL_WHILE_FALSE((in_be64(&priv2->mfc_control_RW) &
MFC_CNTL_SUSPEND_DMA_STATUS_MASK) ==
MFC_CNTL_SUSPEND_COMPLETE);
- if (csa) {
+ if (csa)
csa->priv2.mfc_control_RW =
in_be64(&priv2->mfc_control_RW) &
- ~MFC_CNTL_SUSPEND_DMA_QUEUE;
- }
+ ~MFC_CNTL_SUSPEND_DMA_QUEUE &
+ ~MFC_CNTL_SUSPEND_MASK;
break;
}
}
@@ -244,24 +257,21 @@ static inline void save_spu_status(struct spu_state *csa, struct spu *spu)
}
}
-static inline void save_mfc_decr(struct spu_state *csa, struct spu *spu)
+static inline void save_mfc_stopped_status(struct spu_state *csa,
+ struct spu *spu)
{
struct spu_priv2 __iomem *priv2 = spu->priv2;
+ const u64 mask = MFC_CNTL_DECREMENTER_RUNNING |
+ MFC_CNTL_DMA_QUEUES_EMPTY;
/* Save, Step 12:
* Read MFC_CNTL[Ds]. Update saved copy of
* CSA.MFC_CNTL[Ds].
+ *
+ * update: do the same with MFC_CNTL[Q].
*/
- if (in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING) {
- csa->priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
- csa->suspend_time = get_cycles();
- out_be64(&priv2->spu_chnlcntptr_RW, 7ULL);
- eieio();
- csa->spu_chnldata_RW[7] = in_be64(&priv2->spu_chnldata_RW);
- eieio();
- } else {
- csa->priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
- }
+ csa->priv2.mfc_control_RW &= ~mask;
+ csa->priv2.mfc_control_RW |= in_be64(&priv2->mfc_control_RW) & mask;
}
static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
@@ -272,7 +282,8 @@ static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
* Write MFC_CNTL[Dh] set to a '1' to halt
* the decrementer.
*/
- out_be64(&priv2->mfc_control_RW, MFC_CNTL_DECREMENTER_HALTED);
+ out_be64(&priv2->mfc_control_RW,
+ MFC_CNTL_DECREMENTER_HALTED | MFC_CNTL_SUSPEND_MASK);
eieio();
}
@@ -388,6 +399,19 @@ static inline void save_ppu_querytype(struct spu_state *csa, struct spu *spu)
csa->prob.dma_querytype_RW = in_be32(&prob->dma_querytype_RW);
}
+static inline void save_ppu_tagstatus(struct spu_state *csa, struct spu *spu)
+{
+ struct spu_problem __iomem *prob = spu->problem;
+
+ /* Save the Prxy_TagStatus register in the CSA.
+ *
+ * It is unnecessary to restore dma_tagstatus_R; however,
+ * dma_tagstatus_R in the CSA is accessed via backing_ops, so
+ * we must save it.
+ */
+ csa->prob.dma_tagstatus_R = in_be32(&prob->dma_tagstatus_R);
+}
+
static inline void save_mfc_csr_tsq(struct spu_state *csa, struct spu *spu)
{
struct spu_priv2 __iomem *priv2 = spu->priv2;
@@ -451,7 +475,9 @@ static inline void purge_mfc_queue(struct spu_state *csa, struct spu *spu)
* Restore, Step 14.
* Write MFC_CNTL[Pc]=1 (purge queue).
*/
- out_be64(&priv2->mfc_control_RW, MFC_CNTL_PURGE_DMA_REQUEST);
+ out_be64(&priv2->mfc_control_RW,
+ MFC_CNTL_PURGE_DMA_REQUEST |
+ MFC_CNTL_SUSPEND_MASK);
eieio();
}
@@ -468,26 +494,6 @@ static inline void wait_purge_complete(struct spu_state *csa, struct spu *spu)
MFC_CNTL_PURGE_DMA_COMPLETE);
}
-static inline void save_mfc_slbs(struct spu_state *csa, struct spu *spu)
-{
- struct spu_priv2 __iomem *priv2 = spu->priv2;
- int i;
-
- /* Save, Step 29:
- * If MFC_SR1[R]='1', save SLBs in CSA.
- */
- if (spu_mfc_sr1_get(spu) & MFC_STATE1_RELOCATE_MASK) {
- csa->priv2.slb_index_W = in_be64(&priv2->slb_index_W);
- for (i = 0; i < 8; i++) {
- out_be64(&priv2->slb_index_W, i);
- eieio();
- csa->slb_esid_RW[i] = in_be64(&priv2->slb_esid_RW);
- csa->slb_vsid_RW[i] = in_be64(&priv2->slb_vsid_RW);
- eieio();
- }
- }
-}
-
static inline void setup_mfc_sr1(struct spu_state *csa, struct spu *spu)
{
/* Save, Step 30:
@@ -623,7 +629,7 @@ static inline void save_ppuint_mb(struct spu_state *csa, struct spu *spu)
static inline void save_ch_part1(struct spu_state *csa, struct spu *spu)
{
struct spu_priv2 __iomem *priv2 = spu->priv2;
- u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+ u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
int i;
/* Save, Step 42:
@@ -634,7 +640,7 @@ static inline void save_ch_part1(struct spu_state *csa, struct spu *spu)
csa->spu_chnldata_RW[1] = in_be64(&priv2->spu_chnldata_RW);
/* Save the following CH: [0,3,4,24,25,27] */
- for (i = 0; i < 7; i++) {
+ for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
idx = ch_indices[i];
out_be64(&priv2->spu_chnlcntptr_RW, idx);
eieio();
@@ -708,49 +714,9 @@ static inline void resume_mfc_queue(struct spu_state *csa, struct spu *spu)
out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESUME_DMA_QUEUE);
}
-static inline void invalidate_slbs(struct spu_state *csa, struct spu *spu)
+static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu,
+ unsigned int *code, int code_size)
{
- struct spu_priv2 __iomem *priv2 = spu->priv2;
-
- /* Save, Step 45:
- * Restore, Step 19:
- * If MFC_SR1[R]=1, write 0 to SLB_Invalidate_All.
- */
- if (spu_mfc_sr1_get(spu) & MFC_STATE1_RELOCATE_MASK) {
- out_be64(&priv2->slb_invalidate_all_W, 0UL);
- eieio();
- }
-}
-
-static inline void get_kernel_slb(u64 ea, u64 slb[2])
-{
- u64 llp;
-
- if (REGION_ID(ea) == KERNEL_REGION_ID)
- llp = mmu_psize_defs[mmu_linear_psize].sllp;
- else
- llp = mmu_psize_defs[mmu_virtual_psize].sllp;
- slb[0] = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
- SLB_VSID_KERNEL | llp;
- slb[1] = (ea & ESID_MASK) | SLB_ESID_V;
-}
-
-static inline void load_mfc_slb(struct spu *spu, u64 slb[2], int slbe)
-{
- struct spu_priv2 __iomem *priv2 = spu->priv2;
-
- out_be64(&priv2->slb_index_W, slbe);
- eieio();
- out_be64(&priv2->slb_vsid_RW, slb[0]);
- out_be64(&priv2->slb_esid_RW, slb[1]);
- eieio();
-}
-
-static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu)
-{
- u64 code_slb[2];
- u64 lscsa_slb[2];
-
/* Save, Step 47:
* Restore, Step 30.
* If MFC_SR1[R]=1, write 0 to SLB_Invalidate_All
@@ -765,12 +731,8 @@ static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu)
* MFC_SR1[R]=1 (in other words, assume that
* translation is desired by OS environment).
*/
- invalidate_slbs(csa, spu);
- get_kernel_slb((unsigned long)&spu_save_code[0], code_slb);
- get_kernel_slb((unsigned long)csa->lscsa, lscsa_slb);
- load_mfc_slb(spu, code_slb, 0);
- if ((lscsa_slb[0] != code_slb[0]) || (lscsa_slb[1] != code_slb[1]))
- load_mfc_slb(spu, lscsa_slb, 1);
+ spu_invalidate_slbs(spu);
+ spu_setup_kernel_slbs(spu, csa->lscsa, code, code_size);
}
static inline void set_switch_active(struct spu_state *csa, struct spu *spu)
@@ -778,9 +740,14 @@ static inline void set_switch_active(struct spu_state *csa, struct spu *spu)
/* Save, Step 48:
* Restore, Step 23.
* Change the software context switch pending flag
- * to context switch active.
+ * to context switch active. This implementation does
+ * not use a switch active flag.
+ *
+ * Now that we have saved the mfc in the csa, we can add in the
+ * restart command if an exception occurred.
*/
- set_bit(SPU_CONTEXT_SWITCH_ACTIVE, &spu->flags);
+ if (test_bit(SPU_CONTEXT_FAULT_PENDING, &spu->flags))
+ csa->priv2.mfc_control_RW |= MFC_CNTL_RESTART_DMA_COMMAND;
clear_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags);
mb();
}
@@ -799,9 +766,9 @@ static inline void enable_interrupts(struct spu_state *csa, struct spu *spu)
* (translation) interrupts.
*/
spin_lock_irq(&spu->register_lock);
- spu_int_stat_clear(spu, 0, ~0ul);
- spu_int_stat_clear(spu, 1, ~0ul);
- spu_int_stat_clear(spu, 2, ~0ul);
+ spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+ spu_int_stat_clear(spu, 1, CLASS1_INTR_MASK);
+ spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
spu_int_mask_set(spu, 0, 0ul);
spu_int_mask_set(spu, 1, class1_mask);
spu_int_mask_set(spu, 2, 0ul);
@@ -958,8 +925,8 @@ static inline void wait_tag_complete(struct spu_state *csa, struct spu *spu)
POLL_WHILE_FALSE(in_be32(&prob->dma_tagstatus_R) & mask);
local_irq_save(flags);
- spu_int_stat_clear(spu, 0, ~(0ul));
- spu_int_stat_clear(spu, 2, ~(0ul));
+ spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+ spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
local_irq_restore(flags);
}
@@ -977,8 +944,8 @@ static inline void wait_spu_stopped(struct spu_state *csa, struct spu *spu)
POLL_WHILE_TRUE(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING);
local_irq_save(flags);
- spu_int_stat_clear(spu, 0, ~(0ul));
- spu_int_stat_clear(spu, 2, ~(0ul));
+ spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+ spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
local_irq_restore(flags);
}
@@ -1005,13 +972,13 @@ static inline void terminate_spu_app(struct spu_state *csa, struct spu *spu)
*/
}
-static inline void suspend_mfc(struct spu_state *csa, struct spu *spu)
+static inline void suspend_mfc_and_halt_decr(struct spu_state *csa,
+ struct spu *spu)
{
struct spu_priv2 __iomem *priv2 = spu->priv2;
/* Restore, Step 7:
- * Restore, Step 47.
- * Write MFC_Cntl[Dh,Sc]='1','1' to suspend
+ * Write MFC_Cntl[Dh,Sc,Sm]='1','1','0' to suspend
* the queue and halt the decrementer.
*/
out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE |
@@ -1112,7 +1079,7 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu)
static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu)
{
struct spu_priv2 __iomem *priv2 = spu->priv2;
- u64 ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+ u64 ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
u64 idx;
int i;
@@ -1124,7 +1091,7 @@ static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu)
out_be64(&priv2->spu_chnldata_RW, 0UL);
/* Reset the following CH: [0,3,4,24,25,27] */
- for (i = 0; i < 7; i++) {
+ for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
idx = ch_indices[i];
out_be64(&priv2->spu_chnlcntptr_RW, idx);
eieio();
@@ -1311,7 +1278,15 @@ static inline void setup_decr(struct spu_state *csa, struct spu *spu)
cycles_t resume_time = get_cycles();
cycles_t delta_time = resume_time - csa->suspend_time;
+ csa->lscsa->decr_status.slot[0] = SPU_DECR_STATUS_RUNNING;
+ if (csa->lscsa->decr.slot[0] < delta_time) {
+ csa->lscsa->decr_status.slot[0] |=
+ SPU_DECR_STATUS_WRAPPED;
+ }
+
csa->lscsa->decr.slot[0] -= delta_time;
+ } else {
+ csa->lscsa->decr_status.slot[0] = 0;
}
}
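The wrap test works because the decrementer counts down through zero in 32-bit unsigned arithmetic. A worked example with illustrative numbers:

/* Worked example, illustrative numbers only: the saved decrementer
 * was 1000 cycles from zero and the context stayed saved for
 * delta_time = 2500 cycles.  Then:
 *
 *	decr (1000) < delta_time (2500)	-> SPU_DECR_STATUS_WRAPPED set
 *	decr -= delta_time		-> 1000 - 2500 == 0xfffffa24
 *					   (mod 2^32)
 *
 * which is exactly where a free-running 32-bit decrementer would
 * have counted to; the wrapped flag lets restore_decr_wrapped()
 * raise the timer event the SPU missed while it was saved.
 */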
@@ -1420,6 +1395,18 @@ static inline void restore_ls_16kb(struct spu_state *csa, struct spu *spu)
send_mfc_dma(spu, addr, ls_offset, size, tag, rclass, cmd);
}
+static inline void suspend_mfc(struct spu_state *csa, struct spu *spu)
+{
+ struct spu_priv2 __iomem *priv2 = spu->priv2;
+
+ /* Restore, Step 47.
+ * Write MFC_Cntl[Sc,Sm]='1','0' to suspend
+ * the queue.
+ */
+ out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE);
+ eieio();
+}
+
static inline void clear_interrupts(struct spu_state *csa, struct spu *spu)
{
/* Restore, Step 49:
@@ -1434,9 +1421,9 @@ static inline void clear_interrupts(struct spu_state *csa, struct spu *spu)
spu_int_mask_set(spu, 0, 0ul);
spu_int_mask_set(spu, 1, 0ul);
spu_int_mask_set(spu, 2, 0ul);
- spu_int_stat_clear(spu, 0, ~0ul);
- spu_int_stat_clear(spu, 1, ~0ul);
- spu_int_stat_clear(spu, 2, ~0ul);
+ spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+ spu_int_stat_clear(spu, 1, CLASS1_INTR_MASK);
+ spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
spin_unlock_irq(&spu->register_lock);
}
@@ -1570,32 +1557,27 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu)
* "wrapped" flag is set, OR in a '1' to
* CSA.SPU_Event_Status[Tm].
*/
- if (csa->lscsa->decr_status.slot[0] == 1) {
- csa->spu_chnldata_RW[0] |= 0x20;
- }
- if ((csa->lscsa->decr_status.slot[0] == 1) &&
- (csa->spu_chnlcnt_RW[0] == 0 &&
- ((csa->spu_chnldata_RW[2] & 0x20) == 0x0) &&
- ((csa->spu_chnldata_RW[0] & 0x20) != 0x1))) {
+ if (!(csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED))
+ return;
+
+ if ((csa->spu_chnlcnt_RW[0] == 0) &&
+ (csa->spu_chnldata_RW[1] & 0x20) &&
+ !(csa->spu_chnldata_RW[0] & 0x20))
csa->spu_chnlcnt_RW[0] = 1;
- }
+
+ csa->spu_chnldata_RW[0] |= 0x20;
}
static inline void restore_ch_part1(struct spu_state *csa, struct spu *spu)
{
struct spu_priv2 __iomem *priv2 = spu->priv2;
- u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+ u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
int i;
/* Restore, Step 59:
+ * Restore the following CH: [0,3,4,24,25,27]
*/
-
- /* Restore CH 1 without count */
- out_be64(&priv2->spu_chnlcntptr_RW, 1);
- out_be64(&priv2->spu_chnldata_RW, csa->spu_chnldata_RW[1]);
-
- /* Restore the following CH: [0,3,4,24,25,27] */
- for (i = 0; i < 7; i++) {
+ for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
idx = ch_indices[i];
out_be64(&priv2->spu_chnlcntptr_RW, idx);
eieio();
@@ -1718,27 +1700,6 @@ static inline void check_ppuint_mb_stat(struct spu_state *csa, struct spu *spu)
}
}
-static inline void restore_mfc_slbs(struct spu_state *csa, struct spu *spu)
-{
- struct spu_priv2 __iomem *priv2 = spu->priv2;
- int i;
-
- /* Restore, Step 68:
- * If MFC_SR1[R]='1', restore SLBs from CSA.
- */
- if (csa->priv1.mfc_sr1_RW & MFC_STATE1_RELOCATE_MASK) {
- for (i = 0; i < 8; i++) {
- out_be64(&priv2->slb_index_W, i);
- eieio();
- out_be64(&priv2->slb_esid_RW, csa->slb_esid_RW[i]);
- out_be64(&priv2->slb_vsid_RW, csa->slb_vsid_RW[i]);
- eieio();
- }
- out_be64(&priv2->slb_index_W, csa->priv2.slb_index_W);
- eieio();
- }
-}
-
static inline void restore_mfc_sr1(struct spu_state *csa, struct spu *spu)
{
/* Restore, Step 69:
@@ -1748,6 +1709,13 @@ static inline void restore_mfc_sr1(struct spu_state *csa, struct spu *spu)
eieio();
}
+static inline void set_int_route(struct spu_state *csa, struct spu *spu)
+{
+ struct spu_context *ctx = spu->ctx;
+
+ spu_cpu_affinity_set(spu, ctx->last_ran);
+}
+
static inline void restore_other_spu_access(struct spu_state *csa,
struct spu *spu)
{
@@ -1779,15 +1747,15 @@ static inline void restore_mfc_cntl(struct spu_state *csa, struct spu *spu)
*/
out_be64(&priv2->mfc_control_RW, csa->priv2.mfc_control_RW);
eieio();
+
/*
- * FIXME: this is to restart a DMA that we were processing
- * before the save. better remember the fault information
- * in the csa instead.
+ * The queue is put back into the same state it was in prior to
+ * the context switch. The suspend flag is added to the saved state in
+ * the csa if the operational state was suspending or suspended. In
+ * this case, the code that suspended the mfc is responsible for
+ * continuing it. Note that SPE faults do not change the operational
+ * state of the spu.
*/
- if ((csa->priv2.mfc_control_RW & MFC_CNTL_SUSPEND_DMA_QUEUE_MASK)) {
- out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
- eieio();
- }
}
static inline void enable_user_access(struct spu_state *csa, struct spu *spu)
@@ -1804,9 +1772,8 @@ static inline void reset_switch_active(struct spu_state *csa, struct spu *spu)
{
/* Restore, Step 74:
* Reset the "context switch active" flag.
+ * Not performed by this implementation.
*/
- clear_bit(SPU_CONTEXT_SWITCH_ACTIVE, &spu->flags);
- mb();
}
static inline void reenable_interrupts(struct spu_state *csa, struct spu *spu)
@@ -1847,7 +1814,7 @@ static int quiece_spu(struct spu_state *prev, struct spu *spu)
save_spu_runcntl(prev, spu); /* Step 9. */
save_mfc_sr1(prev, spu); /* Step 10. */
save_spu_status(prev, spu); /* Step 11. */
- save_mfc_decr(prev, spu); /* Step 12. */
+ save_mfc_stopped_status(prev, spu); /* Step 12. */
halt_mfc_decr(prev, spu); /* Step 13. */
save_timebase(prev, spu); /* Step 14. */
remove_other_spu_access(prev, spu); /* Step 15. */
@@ -1868,14 +1835,15 @@ static void save_csa(struct spu_state *prev, struct spu *spu)
save_mfc_queues(prev, spu); /* Step 19. */
save_ppu_querymask(prev, spu); /* Step 20. */
save_ppu_querytype(prev, spu); /* Step 21. */
+ save_ppu_tagstatus(prev, spu); /* NEW. */
save_mfc_csr_tsq(prev, spu); /* Step 22. */
save_mfc_csr_cmd(prev, spu); /* Step 23. */
save_mfc_csr_ato(prev, spu); /* Step 24. */
save_mfc_tclass_id(prev, spu); /* Step 25. */
set_mfc_tclass_id(prev, spu); /* Step 26. */
+ save_mfc_cmd(prev, spu); /* Step 26a - moved from 44. */
purge_mfc_queue(prev, spu); /* Step 27. */
wait_purge_complete(prev, spu); /* Step 28. */
- save_mfc_slbs(prev, spu); /* Step 29. */
setup_mfc_sr1(prev, spu); /* Step 30. */
save_spu_npc(prev, spu); /* Step 31. */
save_spu_privcntl(prev, spu); /* Step 32. */
@@ -1890,7 +1858,6 @@ static void save_csa(struct spu_state *prev, struct spu *spu)
save_ppuint_mb(prev, spu); /* Step 41. */
save_ch_part1(prev, spu); /* Step 42. */
save_spu_mb(prev, spu); /* Step 43. */
- save_mfc_cmd(prev, spu); /* Step 44. */
reset_ch(prev, spu); /* Step 45. */
}
@@ -1903,7 +1870,8 @@ static void save_lscsa(struct spu_state *prev, struct spu *spu)
*/
resume_mfc_queue(prev, spu); /* Step 46. */
- setup_mfc_slbs(prev, spu); /* Step 47. */
+ /* Step 47. */
+ setup_mfc_slbs(prev, spu, spu_save_code, sizeof(spu_save_code));
set_switch_active(prev, spu); /* Step 48. */
enable_interrupts(prev, spu); /* Step 49. */
save_ls_16kb(prev, spu); /* Step 50. */
@@ -1975,7 +1943,7 @@ static void harvest(struct spu_state *prev, struct spu *spu)
set_switch_pending(prev, spu); /* Step 5. */
stop_spu_isolate(spu); /* NEW. */
remove_other_spu_access(prev, spu); /* Step 6. */
- suspend_mfc(prev, spu); /* Step 7. */
+ suspend_mfc_and_halt_decr(prev, spu); /* Step 7. */
wait_suspend_mfc_complete(prev, spu); /* Step 8. */
if (!suspend_spe(prev, spu)) /* Step 9. */
clear_spu_status(prev, spu); /* Step 10. */
@@ -1987,7 +1955,7 @@ static void harvest(struct spu_state *prev, struct spu *spu)
reset_spu_privcntl(prev, spu); /* Step 16. */
reset_spu_lslr(prev, spu); /* Step 17. */
setup_mfc_sr1(prev, spu); /* Step 18. */
- invalidate_slbs(prev, spu); /* Step 19. */
+ spu_invalidate_slbs(spu); /* Step 19. */
reset_ch_part1(prev, spu); /* Step 20. */
reset_ch_part2(prev, spu); /* Step 21. */
enable_interrupts(prev, spu); /* Step 22. */
@@ -2008,7 +1976,8 @@ static void restore_lscsa(struct spu_state *next, struct spu *spu)
setup_spu_status_part1(next, spu); /* Step 27. */
setup_spu_status_part2(next, spu); /* Step 28. */
restore_mfc_rag(next, spu); /* Step 29. */
- setup_mfc_slbs(next, spu); /* Step 30. */
+ /* Step 30. */
+ setup_mfc_slbs(next, spu, spu_restore_code, sizeof(spu_restore_code));
set_spu_npc(next, spu); /* Step 31. */
set_signot1(next, spu); /* Step 32. */
set_signot2(next, spu); /* Step 33. */
@@ -2055,8 +2024,9 @@ static void restore_csa(struct spu_state *next, struct spu *spu)
restore_spu_mb(next, spu); /* Step 65. */
check_ppu_mb_stat(next, spu); /* Step 66. */
check_ppuint_mb_stat(next, spu); /* Step 67. */
- restore_mfc_slbs(next, spu); /* Step 68. */
+ spu_invalidate_slbs(spu); /* Modified Step 68. */
restore_mfc_sr1(next, spu); /* Step 69. */
+ set_int_route(next, spu); /* NEW */
restore_other_spu_access(next, spu); /* Step 70. */
restore_spu_runcntl(next, spu); /* Step 71. */
restore_mfc_cntl(next, spu); /* Step 72. */
@@ -2165,10 +2135,7 @@ int spu_restore(struct spu_state *new, struct spu *spu)
acquire_spu_lock(spu);
harvest(NULL, spu);
- spu->dar = 0;
- spu->dsisr = 0;
spu->slb_replace = 0;
- spu->class_0_pending = 0;
rc = __do_spu_restore(new, spu);
release_spu_lock(spu);
if (rc) {
@@ -2179,19 +2146,6 @@ int spu_restore(struct spu_state *new, struct spu *spu)
}
EXPORT_SYMBOL_GPL(spu_restore);
-/**
- * spu_harvest - SPU harvest (reset) operation
- * @spu: pointer to SPU iomem structure.
- *
- * Perform SPU harvest (reset) operation.
- */
-void spu_harvest(struct spu *spu)
-{
- acquire_spu_lock(spu);
- harvest(NULL, spu);
- release_spu_lock(spu);
-}
-
static void init_prob(struct spu_state *csa)
{
csa->spu_chnlcnt_RW[9] = 1;
@@ -2241,40 +2195,28 @@ static void init_priv2(struct spu_state *csa)
* as it is by far the largest of the context save regions,
* and may need to be pinned or otherwise specially aligned.
*/
-void spu_init_csa(struct spu_state *csa)
+int spu_init_csa(struct spu_state *csa)
{
- struct spu_lscsa *lscsa;
- unsigned char *p;
+ int rc;
if (!csa)
- return;
+ return -EINVAL;
memset(csa, 0, sizeof(struct spu_state));
- lscsa = vmalloc(sizeof(struct spu_lscsa));
- if (!lscsa)
- return;
+ rc = spu_alloc_lscsa(csa);
+ if (rc)
+ return rc;
- memset(lscsa, 0, sizeof(struct spu_lscsa));
- csa->lscsa = lscsa;
spin_lock_init(&csa->register_lock);
- /* Set LS pages reserved to allow for user-space mapping. */
- for (p = lscsa->ls; p < lscsa->ls + LS_SIZE; p += PAGE_SIZE)
- SetPageReserved(vmalloc_to_page(p));
-
init_prob(csa);
init_priv1(csa);
init_priv2(csa);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(spu_init_csa);
void spu_fini_csa(struct spu_state *csa)
{
- /* Clear reserved bit before vfree. */
- unsigned char *p;
- for (p = csa->lscsa->ls; p < csa->lscsa->ls + LS_SIZE; p += PAGE_SIZE)
- ClearPageReserved(vmalloc_to_page(p));
-
- vfree(csa->lscsa);
+ spu_free_lscsa(csa);
}
-EXPORT_SYMBOL_GPL(spu_fini_csa);
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 8e37bdf4dfd..a87200a535f 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -1,8 +1,9 @@
#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
@@ -46,8 +47,8 @@ static long do_spu_run(struct file *filp,
if (filp->f_op != &spufs_context_fops)
goto out;
- i = SPUFS_I(filp->f_path.dentry->d_inode);
- ret = spufs_run_spu(filp, i->i_ctx, &npc, &status);
+ i = SPUFS_I(file_inode(filp));
+ ret = spufs_run_spu(i->i_ctx, &npc, &status);
if (put_user(npc, unpc))
ret = -EFAULT;
@@ -58,49 +59,30 @@ out:
return ret;
}
-#ifndef MODULE
-asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
+static long do_spu_create(const char __user *pathname, unsigned int flags,
+ umode_t mode, struct file *neighbor)
{
- int fput_needed;
- struct file *filp;
- long ret;
-
- ret = -EBADF;
- filp = fget_light(fd, &fput_needed);
- if (filp) {
- ret = do_spu_run(filp, unpc, ustatus);
- fput_light(filp, fput_needed);
- }
-
- return ret;
-}
-#endif
-
-asmlinkage long sys_spu_create(const char __user *pathname,
- unsigned int flags, mode_t mode)
-{
- char *tmp;
+ struct path path;
+ struct dentry *dentry;
int ret;
- tmp = getname(pathname);
- ret = PTR_ERR(tmp);
- if (!IS_ERR(tmp)) {
- struct nameidata nd;
-
- ret = path_lookup(tmp, LOOKUP_PARENT|
- LOOKUP_OPEN|LOOKUP_CREATE, &nd);
- if (!ret) {
- ret = spufs_create(&nd, flags, mode);
- path_release(&nd);
- }
- putname(tmp);
+ dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
+ ret = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ ret = spufs_create(&path, dentry, flags, mode, neighbor);
+ done_path_create(&path, dentry);
}
return ret;
}
struct spufs_calls spufs_calls = {
- .create_thread = sys_spu_create,
+ .create_thread = do_spu_create,
.spu_run = do_spu_run,
+ .notify_spus_active = do_notify_spus_active,
.owner = THIS_MODULE,
+#ifdef CONFIG_COREDUMP
+ .coredump_extra_notes_size = spufs_coredump_extra_notes_size,
+ .coredump_extra_notes_write = spufs_coredump_extra_notes_write,
+#endif
};