Diffstat (limited to 'drivers/misc/sgi-gru')
 drivers/misc/sgi-gru/Makefile           |   2
 drivers/misc/sgi-gru/gru_instructions.h |  22
 drivers/misc/sgi-gru/grufault.c         | 130
 drivers/misc/sgi-gru/grufile.c          |  54
 drivers/misc/sgi-gru/gruhandles.c       | 183
 drivers/misc/sgi-gru/gruhandles.h       | 178
 drivers/misc/sgi-gru/grukservices.c     | 131
 drivers/misc/sgi-gru/grukservices.h     |  33
 drivers/misc/sgi-gru/grumain.c          |  84
 drivers/misc/sgi-gru/gruprocfs.c        |  45
 drivers/misc/sgi-gru/grutables.h        |  41
 drivers/misc/sgi-gru/grutlbpurge.c      |   7
 12 files changed, 574 insertions(+), 336 deletions(-)
diff --git a/drivers/misc/sgi-gru/Makefile b/drivers/misc/sgi-gru/Makefile
index 9e9170b3599..bcd8136d2f9 100644
--- a/drivers/misc/sgi-gru/Makefile
+++ b/drivers/misc/sgi-gru/Makefile
@@ -3,5 +3,5 @@ ifdef CONFIG_SGI_GRU_DEBUG
 endif
 
 obj-$(CONFIG_SGI_GRU) := gru.o
-gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o
+gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o gruhandles.o
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
index 48762e7b98b..3fde33c1e8f 100644
--- a/drivers/misc/sgi-gru/gru_instructions.h
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -19,8 +19,11 @@
 #ifndef __GRU_INSTRUCTIONS_H__
 #define __GRU_INSTRUCTIONS_H__
 
-#define gru_flush_cache_hook(p)
-#define gru_emulator_wait_hook(p, w)
+extern int gru_check_status_proc(void *cb);
+extern int gru_wait_proc(void *cb);
+extern void gru_wait_abort_proc(void *cb);
+
+
 
 /*
  * Architecture dependent functions
@@ -29,16 +32,16 @@
 #if defined(CONFIG_IA64)
 #include <linux/compiler.h>
 #include <asm/intrinsics.h>
-#define __flush_cache(p)        ia64_fc(p)
+#define __flush_cache(p)        ia64_fc((unsigned long)p)
 /* Use volatile on IA64 to ensure ordering via st4.rel */
-#define gru_ordered_store_int(p,v)                                      \
+#define gru_ordered_store_int(p, v)                                     \
                 do {                                                    \
                         barrier();                                      \
                         *((volatile int *)(p)) = v; /* force st.rel */  \
                 } while (0)
 #elif defined(CONFIG_X86_64)
 #define __flush_cache(p)        clflush(p)
-#define gru_ordered_store_int(p,v)                                      \
+#define gru_ordered_store_int(p, v)                                     \
                 do {                                                    \
                         barrier();                                      \
                         *(int *)p = v;                                  \
@@ -558,20 +561,19 @@ extern int gru_get_cb_exception_detail(void *cb,
 
 #define GRU_EXC_STR_SIZE        256
 
-extern int gru_check_status_proc(void *cb);
-extern int gru_wait_proc(void *cb);
-extern void gru_wait_abort_proc(void *cb);
 
 /*
  * Control block definition for checking status
  */
 struct gru_control_block_status {
         unsigned int    icmd            :1;
-        unsigned int    unused1         :31;
+        unsigned int    ima             :3;
+        unsigned int    reserved0       :4;
+        unsigned int    unused1         :24;
         unsigned int    unused2         :24;
         unsigned int    istatus         :2;
         unsigned int    isubstatus      :4;
-        unsigned int    inused3         :2;
+        unsigned int    unused3         :2;
 };
 
 /* Get CB status */
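A note on the gru_control_block_status rework above: the old 31-bit unused1 pad is split so that the instruction's ima bits and a reserved field become addressable, and the inused3 typo becomes unused3. Below is a minimal standalone sketch of polling istatus through this overlay; the CBS_* encodings are assumptions for the demo, not taken from this diff.

/* Hedged sketch: polling istatus through the revised CB status overlay.
 * Field widths are copied from the hunk above; CBS_* values are assumed. */
#include <stdio.h>

struct gru_control_block_status {
        unsigned int    icmd            :1;
        unsigned int    ima             :3;
        unsigned int    reserved0       :4;
        unsigned int    unused1         :24;
        unsigned int    unused2         :24;
        unsigned int    istatus         :2;
        unsigned int    isubstatus      :4;
        unsigned int    unused3         :2;
};

enum { CBS_IDLE, CBS_EXCEPTION, CBS_ACTIVE, CBS_CALL_OS };      /* assumed */

static int get_cb_status(void *cb)
{
        struct gru_control_block_status *cbs = cb;

        return cbs->istatus;    /* 2-bit status field in the second word */
}

int main(void)
{
        struct gru_control_block_status cbs = { .istatus = CBS_ACTIVE };

        /* both words still pack to 32 bits: 1+3+4+24 and 24+2+4+2 */
        printf("sizeof = %zu, istatus = %d\n", sizeof(cbs), get_cb_status(&cbs));
        return 0;
}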
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 3ee698ad859..ab118558552 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -32,6 +32,7 @@
 #include <linux/device.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
+#include <linux/security.h>
 #include <asm/pgtable.h>
 #include "gru.h"
 #include "grutables.h"
@@ -266,6 +267,44 @@ err:
         return 1;
 }
 
+static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
+        int write, int atomic, unsigned long *gpa, int *pageshift)
+{
+        struct mm_struct *mm = gts->ts_mm;
+        struct vm_area_struct *vma;
+        unsigned long paddr;
+        int ret, ps;
+
+        vma = find_vma(mm, vaddr);
+        if (!vma)
+                goto inval;
+
+        /*
+         * Atomic lookup is faster & usually works even if called in non-atomic
+         * context.
+         */
+        rmb();  /* Must/check ms_range_active before loading PTEs */
+        ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
+        if (ret) {
+                if (atomic)
+                        goto upm;
+                if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps))
+                        goto inval;
+        }
+        if (is_gru_paddr(paddr))
+                goto inval;
+        paddr = paddr & ~((1UL << ps) - 1);
+        *gpa = uv_soc_phys_ram_to_gpa(paddr);
+        *pageshift = ps;
+        return 0;
+
+inval:
+        return -1;
+upm:
+        return -2;
+}
+
+
 /*
  * Drop a TLB entry into the GRU. The fault is described by info in an TFH.
  * Input:
@@ -280,10 +319,8 @@ static int gru_try_dropin(struct gru_thread_state *gts,
                           struct gru_tlb_fault_handle *tfh,
                           unsigned long __user *cb)
 {
-        struct mm_struct *mm = gts->ts_mm;
-        struct vm_area_struct *vma;
-        int pageshift, asid, write, ret;
-        unsigned long paddr, gpa, vaddr;
+        int pageshift = 0, asid, write, ret, atomic = !cb;
+        unsigned long gpa = 0, vaddr = 0;
 
         /*
          * NOTE: The GRU contains magic hardware that eliminates races between
@@ -317,28 +354,19 @@ static int gru_try_dropin(struct gru_thread_state *gts,
         if (atomic_read(&gts->ts_gms->ms_range_active))
                 goto failactive;
 
-        vma = find_vma(mm, vaddr);
-        if (!vma)
+        ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
+        if (ret == -1)
                 goto failinval;
+        if (ret == -2)
+                goto failupm;
 
-        /*
-         * Atomic lookup is faster & usually works even if called in non-atomic
-         * context.
-         */
-        rmb();  /* Must/check ms_range_active before loading PTEs */
-        ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pageshift);
-        if (ret) {
-                if (!cb)
+        if (!(gts->ts_sizeavail & GRU_SIZEAVAIL(pageshift))) {
+                gts->ts_sizeavail |= GRU_SIZEAVAIL(pageshift);
+                if (atomic || !gru_update_cch(gts, 0)) {
+                        gts->ts_force_cch_reload = 1;
                         goto failupm;
-                if (non_atomic_pte_lookup(vma, vaddr, write, &paddr,
-                                          &pageshift))
-                        goto failinval;
+                }
         }
-        if (is_gru_paddr(paddr))
-                goto failinval;
-
-        paddr = paddr & ~((1UL << pageshift) - 1);
-        gpa = uv_soc_phys_ram_to_gpa(paddr);
         gru_cb_set_istatus_active(cb);
         tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
                           GRU_PAGESIZE(pageshift));
@@ -368,6 +396,7 @@ failupm:
 
 failfmm:
         /* FMM state on UPM call */
+        gru_flush_cache(tfh);
         STAT(tlb_dropin_fail_fmm);
         gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
         return 0;
@@ -448,6 +477,7 @@ irqreturn_t gru_intr(int irq, void *dev_id)
                         up_read(&gts->ts_mm->mmap_sem);
                 } else {
                         tfh_user_polling_mode(tfh);
+                        STAT(intr_mm_lock_failed);
                 }
         }
         return IRQ_HANDLED;
@@ -497,10 +527,8 @@ int gru_handle_user_call_os(unsigned long cb)
         if (!gts)
                 return -EINVAL;
 
-        if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
-                ret = -EINVAL;
+        if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE)
                 goto exit;
-        }
 
         /*
          * If force_unload is set, the UPM TLB fault is phony. The task
@@ -508,6 +536,20 @@ int gru_handle_user_call_os(unsigned long cb)
          * unload the context. The task will page fault and assign a new
          * context.
          */
+        if (gts->ts_tgid_owner == current->tgid && gts->ts_blade >= 0 &&
+                                gts->ts_blade != uv_numa_blade_id()) {
+                STAT(call_os_offnode_reference);
+                gts->ts_force_unload = 1;
+        }
+
+        /*
+         * CCH may contain stale data if ts_force_cch_reload is set.
+         */
+        if (gts->ts_gru && gts->ts_force_cch_reload) {
+                gru_update_cch(gts, 0);
+                gts->ts_force_cch_reload = 0;
+        }
+
         ret = -EAGAIN;
         cbrnum = thread_cbr_number(gts, ucbnum);
         if (gts->ts_force_unload) {
@@ -541,11 +583,13 @@ int gru_get_exception_detail(unsigned long arg)
         if (!gts)
                 return -EINVAL;
 
-        if (gts->ts_gru) {
-                ucbnum = get_cb_number((void *)excdet.cb);
+        ucbnum = get_cb_number((void *)excdet.cb);
+        if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
+                ret = -EINVAL;
+        } else if (gts->ts_gru) {
                 cbrnum = thread_cbr_number(gts, ucbnum);
                 cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
-                prefetchw(cbe); /* Harmless on hardware, required for emulator */
+                prefetchw(cbe);/* Harmless on hardware, required for emulator */
                 excdet.opc = cbe->opccpy;
                 excdet.exopc = cbe->exopccpy;
                 excdet.ecause = cbe->ecause;
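The hunks above add two event counters, STAT(intr_mm_lock_failed) and STAT(call_os_offnode_reference). STAT() and gru_stats are defined in grutables.h, which this patch also touches but this page does not show, so the userspace sketch below only illustrates the one-counter-per-event pattern; the macro shape and the plain, non-atomic increments are assumptions.

/* Hedged sketch of the per-event counter pattern behind STAT(). */
#include <stdio.h>

struct gru_stats_s {                    /* assumed layout */
        unsigned long intr_mm_lock_failed;
        unsigned long call_os_offnode_reference;
};

static struct gru_stats_s gru_stats;

/* the driver increments atomically; a plain ++ keeps the demo short */
#define STAT(id)        (gru_stats.id++)

int main(void)
{
        STAT(intr_mm_lock_failed);
        STAT(call_os_offnode_reference);
        printf("intr_mm_lock_failed=%lu call_os_offnode_reference=%lu\n",
               gru_stats.intr_mm_lock_failed,
               gru_stats.call_os_offnode_reference);
        return 0;
}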
@@ -567,6 +611,31 @@ int gru_get_exception_detail(unsigned long arg)
 
 /*
  * User request to unload a context. Content is saved for possible reload.
  */
+static int gru_unload_all_contexts(void)
+{
+        struct gru_thread_state *gts;
+        struct gru_state *gru;
+        int gid, ctxnum;
+
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        foreach_gid(gid) {
+                gru = GID_TO_GRU(gid);
+                spin_lock(&gru->gs_lock);
+                for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++) {
+                        gts = gru->gs_gts[ctxnum];
+                        if (gts && mutex_trylock(&gts->ts_ctxlock)) {
+                                spin_unlock(&gru->gs_lock);
+                                gru_unload_context(gts, 1);
+                                gru_unlock_gts(gts);
+                                spin_lock(&gru->gs_lock);
+                        }
+                }
+                spin_unlock(&gru->gs_lock);
+        }
+        return 0;
+}
+
 int gru_user_unload_context(unsigned long arg)
 {
         struct gru_thread_state *gts;
@@ -578,6 +647,9 @@ int gru_user_unload_context(unsigned long arg)
 
         gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);
 
+        if (!req.gseg)
+                return gru_unload_all_contexts();
+
         gts = gru_find_lock_gts(req.gseg);
         if (!gts)
                 return -EINVAL;
@@ -609,7 +681,7 @@ int gru_user_flush_tlb(unsigned long arg)
         if (!gts)
                 return -EINVAL;
 
-        gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.vaddr + req.len);
+        gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.len);
         gru_unlock_gts(gts);
 
         return 0;
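The last grufault.c hunk is a straightforward bug fix: the third argument of gru_flush_tlb_range() is a length, and the old caller passed the end address (req.vaddr + req.len), flushing far more than requested. A standalone demo of the arithmetic, with made-up values:

/* Demo of the gru_user_flush_tlb() fix: passing an end address where a
 * length is expected inflates the flushed range.  Addresses are made up. */
#include <stdio.h>

static void show_flush(const char *tag, unsigned long vaddr, unsigned long len)
{
        printf("%s: flush [0x%lx, 0x%lx)\n", tag, vaddr, vaddr + len);
}

int main(void)
{
        unsigned long vaddr = 0x100000, len = 0x2000;

        show_flush("old (end as len)", vaddr, vaddr + len); /* ends at 0x202000 */
        show_flush("new (len)       ", vaddr, len);         /* ends at 0x102000 */
        return 0;
}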
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 65098380639..3e6e42d2f01 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -36,28 +36,18 @@
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
 #include <linux/uaccess.h>
+#include <asm/uv/uv.h>
 #include "gru.h"
 #include "grulib.h"
 #include "grutables.h"
 
-#if defined CONFIG_X86_64
-#include <asm/genapic.h>
-#include <asm/irq.h>
-#define IS_UV()         is_uv_system()
-#elif defined CONFIG_IA64
-#include <asm/system.h>
-#include <asm/sn/simulator.h>
-/* temp support for running on hardware simulator */
-#define IS_UV()         IS_MEDUSA() || ia64_platform_is("uv")
-#else
-#define IS_UV()         0
-#endif
-
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_mmrs.h>
 
 struct gru_blade_state *gru_base[GRU_MAX_BLADES] __read_mostly;
-unsigned long gru_start_paddr, gru_end_paddr __read_mostly;
+unsigned long gru_start_paddr __read_mostly;
+unsigned long gru_end_paddr __read_mostly;
+unsigned int gru_max_gids __read_mostly;
 struct gru_stats_s gru_stats;
 
 /* Guaranteed user available resources on each node */
@@ -113,7 +103,7 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
                 return -EPERM;
 
         if (vma->vm_start & (GRU_GSEG_PAGESIZE - 1) ||
-                                vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
+                               vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
                 return -EINVAL;
 
         vma->vm_flags |=
@@ -285,8 +275,11 @@ static void gru_init_chiplet(struct gru_state *gru, unsigned long paddr,
         gru->gs_blade_id = bid;
         gru->gs_cbr_map = (GRU_CBR_AU == 64) ? ~0 : (1UL << GRU_CBR_AU) - 1;
         gru->gs_dsr_map = (1UL << GRU_DSR_AU) - 1;
+        gru->gs_asid_limit = MAX_ASID;
         gru_tgh_flush_init(gru);
-        gru_dbg(grudev, "bid %d, nid %d, gru %x, vaddr %p (0x%lx)\n",
+        if (gru->gs_gid >= gru_max_gids)
+                gru_max_gids = gru->gs_gid + 1;
+        gru_dbg(grudev, "bid %d, nid %d, gid %d, vaddr %p (0x%lx)\n",
                 bid, nid, gru->gs_gid, gru->gs_gru_base_vaddr,
                 gru->gs_gru_base_paddr);
         gru_kservices_init(gru);
@@ -307,7 +300,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
         for_each_online_node(nid) {
                 bid = uv_node_to_blade_id(nid);
                 pnode = uv_node_to_pnode(nid);
-                if (gru_base[bid])
+                if (bid < 0 || gru_base[bid])
                         continue;
                 page = alloc_pages_node(nid, GFP_KERNEL, order);
                 if (!page)
@@ -320,11 +313,11 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
                 dsrbytes = 0;
                 cbrs = 0;
                 for (gru = gru_base[bid]->bs_grus, chip = 0;
-                                chip < GRU_CHIPLETS_PER_BLADE;
+                        chip < GRU_CHIPLETS_PER_BLADE;
                                 chip++, gru++) {
                         paddr = gru_chiplet_paddr(gru_base_paddr, pnode, chip);
                         vaddr = gru_chiplet_vaddr(gru_base_vaddr, pnode, chip);
-                        gru_init_chiplet(gru, paddr, vaddr, bid, nid, chip);
+                        gru_init_chiplet(gru, paddr, vaddr, nid, bid, chip);
                         n = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
                         cbrs = max(cbrs, n);
                         n = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
@@ -381,27 +374,27 @@ static int __init gru_init(void)
         char id[10];
         void *gru_start_vaddr;
 
-        if (!IS_UV())
-                return 0;
+        if (!is_uv_system())
+                return -ENODEV;
 
 #if defined CONFIG_IA64
         gru_start_paddr = 0xd000000000UL; /* ZZZZZZZZZZZZZZZZZZZ fixme */
 #else
         gru_start_paddr = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR) &
                                 0x7fffffffffffUL;
-
 #endif
         gru_start_vaddr = __va(gru_start_paddr);
-        gru_end_paddr = gru_start_paddr + MAX_NUMNODES * GRU_SIZE;
+        gru_end_paddr = gru_start_paddr + GRU_MAX_BLADES * GRU_SIZE;
         printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
                gru_start_paddr, gru_end_paddr);
         irq = get_base_irq();
         for (chip = 0; chip < GRU_CHIPLETS_PER_BLADE; chip++) {
                 ret = request_irq(irq + chip, gru_intr, 0, id, NULL);
-                /* TODO: fix irq handling on x86. For now ignore failures because
+                /* TODO: fix irq handling on x86. For now ignore failure because
                  * interrupts are not required & not yet fully supported */
                 if (ret) {
-                        printk("!!!WARNING: GRU ignoring request failure!!!\n");
+                        printk(KERN_WARNING
+                               "!!!WARNING: GRU ignoring request failure!!!\n");
                         ret = 0;
                 }
                 if (ret) {
@@ -447,16 +440,19 @@ exit1:
 
 static void __exit gru_exit(void)
 {
-        int i, bid;
+        int i, bid, gid;
         int order = get_order(sizeof(struct gru_state) *
                               GRU_CHIPLETS_PER_BLADE);
 
-        if (!IS_UV())
+        if (!is_uv_system())
                 return;
 
         for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++)
                 free_irq(IRQ_GRU + i, NULL);
 
+        foreach_gid(gid)
+                gru_kservices_exit(GID_TO_GRU(gid));
+
         for (bid = 0; bid < GRU_MAX_BLADES; bid++)
                 free_pages((unsigned long)gru_base[bid], order);
 
@@ -481,7 +477,11 @@ struct vm_operations_struct gru_vm_ops = {
         .fault          = gru_fault,
 };
 
+#ifndef MODULE
 fs_initcall(gru_init);
+#else
+module_init(gru_init);
+#endif
 module_exit(gru_exit);
 
 module_param(gru_options, ulong, 0644);
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
new file mode 100644
index 00000000000..9b7ccb32869
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -0,0 +1,183 @@
+/*
+ *              GRU KERNEL MCS INSTRUCTIONS
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+
+/* 10 sec */
+#ifdef CONFIG_IA64
+#include <asm/processor.h>
+#define GRU_OPERATION_TIMEOUT   (((cycles_t) local_cpu_data->itc_freq)*10)
+#else
+#include <asm/tsc.h>
+#define GRU_OPERATION_TIMEOUT   ((cycles_t) tsc_khz*10*1000)
+#endif
+
+/* Extract the status field from a kernel handle */
+#define GET_MSEG_HANDLE_STATUS(h)       (((*(unsigned long *)(h)) >> 16) & 3)
+
+struct mcs_op_statistic mcs_op_statistics[mcsop_last];
+
+static void update_mcs_stats(enum mcs_op op, unsigned long clks)
+{
+        atomic_long_inc(&mcs_op_statistics[op].count);
+        atomic_long_add(clks, &mcs_op_statistics[op].total);
+        if (mcs_op_statistics[op].max < clks)
+                mcs_op_statistics[op].max = clks;
+}
+
+static void start_instruction(void *h)
+{
+        unsigned long *w0 = h;
+
+        wmb();          /* setting CMD bit must be last */
+        *w0 = *w0 | 1;
+        gru_flush_cache(h);
+}
+
+static int wait_instruction_complete(void *h, enum mcs_op opc)
+{
+        int status;
+        cycles_t start_time = get_cycles();
+
+        while (1) {
+                cpu_relax();
+                status = GET_MSEG_HANDLE_STATUS(h);
+                if (status != CCHSTATUS_ACTIVE)
+                        break;
+                if (GRU_OPERATION_TIMEOUT < (get_cycles() - start_time))
+                        panic("GRU %p is malfunctioning\n", h);
+        }
+        if (gru_options & OPT_STATS)
+                update_mcs_stats(opc, get_cycles() - start_time);
+        return status;
+}
+
+int cch_allocate(struct gru_context_configuration_handle *cch,
+                int asidval, int sizeavail, unsigned long cbrmap,
+                unsigned long dsrmap)
+{
+        int i;
+
+        for (i = 0; i < 8; i++) {
+                cch->asid[i] = (asidval++);
+                cch->sizeavail[i] = sizeavail;
+        }
+        cch->dsr_allocation_map = dsrmap;
+        cch->cbr_allocation_map = cbrmap;
+        cch->opc = CCHOP_ALLOCATE;
+        start_instruction(cch);
+        return wait_instruction_complete(cch, cchop_allocate);
+}
+
+int cch_start(struct gru_context_configuration_handle *cch)
+{
+        cch->opc = CCHOP_START;
+        start_instruction(cch);
+        return wait_instruction_complete(cch, cchop_start);
+}
+
+int cch_interrupt(struct gru_context_configuration_handle *cch)
+{
+        cch->opc = CCHOP_INTERRUPT;
+        start_instruction(cch);
+        return wait_instruction_complete(cch, cchop_interrupt);
+}
+
+int cch_deallocate(struct gru_context_configuration_handle *cch)
+{
+        cch->opc = CCHOP_DEALLOCATE;
+        start_instruction(cch);
+        return wait_instruction_complete(cch, cchop_deallocate);
+}
+
+int cch_interrupt_sync(struct gru_context_configuration_handle
+                                     *cch)
+{
+        cch->opc = CCHOP_INTERRUPT_SYNC;
+        start_instruction(cch);
+        return wait_instruction_complete(cch, cchop_interrupt_sync);
+}
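wait_instruction_complete() above replaces the previously unbounded spin (see the removed gruhandles.h inline further down) with a bounded one: if a handle stays ACTIVE for longer than GRU_OPERATION_TIMEOUT, roughly ten seconds of ITC or TSC cycles, the driver panics instead of hanging silently, and optionally records the latency. A userspace sketch of the same bounded-poll pattern; the clock source, budget, and status encoding are stand-ins:

/* Hedged sketch of a bounded status poll; cycles(), the timeout budget,
 * and STATUS_ACTIVE stand in for get_cycles(), GRU_OPERATION_TIMEOUT,
 * and CCHSTATUS_ACTIVE. */
#include <stdio.h>
#include <time.h>

typedef unsigned long long cycles_t;

static cycles_t cycles(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (cycles_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

#define OPERATION_TIMEOUT       (10ULL * 1000000000ULL)  /* ~10 sec in ns */
#define STATUS_ACTIVE           3                        /* stand-in value */

static int wait_for_handle(volatile int *status)
{
        cycles_t start = cycles();

        while (*status == STATUS_ACTIVE) {
                if (cycles() - start > OPERATION_TIMEOUT) {
                        fprintf(stderr, "handle is malfunctioning\n");
                        return -1;      /* the driver panics at this point */
                }
        }
        return *status;
}

int main(void)
{
        volatile int status = 0;        /* idle from the start */

        printf("status = %d\n", wait_for_handle(&status));
        return 0;
}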
+
+int tgh_invalidate(struct gru_tlb_global_handle *tgh,
+                unsigned long vaddr, unsigned long vaddrmask,
+                int asid, int pagesize, int global, int n,
+                unsigned short ctxbitmap)
+{
+        tgh->vaddr = vaddr;
+        tgh->asid = asid;
+        tgh->pagesize = pagesize;
+        tgh->n = n;
+        tgh->global = global;
+        tgh->vaddrmask = vaddrmask;
+        tgh->ctxbitmap = ctxbitmap;
+        tgh->opc = TGHOP_TLBINV;
+        start_instruction(tgh);
+        return wait_instruction_complete(tgh, tghop_invalidate);
+}
+
+void tfh_write_only(struct gru_tlb_fault_handle *tfh,
+                unsigned long pfn, unsigned long vaddr,
+                int asid, int dirty, int pagesize)
+{
+        tfh->fillasid = asid;
+        tfh->fillvaddr = vaddr;
+        tfh->pfn = pfn;
+        tfh->dirty = dirty;
+        tfh->pagesize = pagesize;
+        tfh->opc = TFHOP_WRITE_ONLY;
+        start_instruction(tfh);
+}
+
+void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
+                unsigned long paddr, int gaa,
+                unsigned long vaddr, int asid, int dirty,
+                int pagesize)
+{
+        tfh->fillasid = asid;
+        tfh->fillvaddr = vaddr;
+        tfh->pfn = paddr >> GRU_PADDR_SHIFT;
+        tfh->gaa = gaa;
+        tfh->dirty = dirty;
+        tfh->pagesize = pagesize;
+        tfh->opc = TFHOP_WRITE_RESTART;
+        start_instruction(tfh);
+}
+
+void tfh_restart(struct gru_tlb_fault_handle *tfh)
+{
+        tfh->opc = TFHOP_RESTART;
+        start_instruction(tfh);
+}
+
+void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
+{
+        tfh->opc = TFHOP_USER_POLLING_MODE;
+        start_instruction(tfh);
+}
+
+void tfh_exception(struct gru_tlb_fault_handle *tfh)
+{
+        tfh->opc = TFHOP_EXCEPTION;
+        start_instruction(tfh);
+}
+
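The new file also introduces mcs_op_statistics, which update_mcs_stats() above fills in when OPT_STATS is set: a count, cycle total, and maximum per handle opcode, from which a reader (gruprocfs.c is also touched by this patch) can derive a mean latency. A plain-C sketch of that bookkeeping, with ordinary longs standing in for atomic_long_t:

/* Hedged sketch of the count/total/max bookkeeping; fields mirror
 * update_mcs_stats() above, minus the atomics. */
#include <stdio.h>

struct mcs_op_statistic {
        long count;
        long total;
        unsigned long max;
};

static void update_mcs_stats(struct mcs_op_statistic *s, unsigned long clks)
{
        s->count++;
        s->total += clks;
        if (s->max < clks)
                s->max = clks;
}

int main(void)
{
        struct mcs_op_statistic st = { 0, 0, 0 };

        update_mcs_stats(&st, 120);
        update_mcs_stats(&st, 80);
        printf("count=%ld avg=%ld max=%lu\n", st.count, st.total / st.count,
               st.max);
        return 0;
}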
diff --git a/drivers/misc/sgi-gru/gruhandles.h b/drivers/misc/sgi-gru/gruhandles.h
index b63018d60fe..1ed74d7508c 100644
--- a/drivers/misc/sgi-gru/gruhandles.h
+++ b/drivers/misc/sgi-gru/gruhandles.h
@@ -489,170 +489,28 @@ enum gru_cbr_state {
  *      64m             26      8
  *      ...
  */
-#define GRU_PAGESIZE(sh)        ((((sh) > 20 ? (sh) + 2: (sh)) >> 1) - 6)
+#define GRU_PAGESIZE(sh)        ((((sh) > 20 ? (sh) + 2 : (sh)) >> 1) - 6)
 #define GRU_SIZEAVAIL(sh)       (1UL << GRU_PAGESIZE(sh))
 
 /* minimum TLB purge count to ensure a full purge */
 #define GRUMAXINVAL             1024UL
 
-
-/* Extract the status field from a kernel handle */
-#define GET_MSEG_HANDLE_STATUS(h)       (((*(unsigned long *)(h)) >> 16) & 3)
-
-static inline void start_instruction(void *h)
-{
-        unsigned long *w0 = h;
-
-        wmb();          /* setting CMD bit must be last */
-        *w0 = *w0 | 1;
-        gru_flush_cache(h);
-}
-
-static inline int wait_instruction_complete(void *h)
-{
-        int status;
-
-        do {
-                cpu_relax();
-                barrier();
-                status = GET_MSEG_HANDLE_STATUS(h);
-        } while (status == CCHSTATUS_ACTIVE);
-        return status;
-}
-
-#if defined CONFIG_IA64
-static inline void cch_allocate_set_asids(
-                  struct gru_context_configuration_handle *cch, int asidval)
-{
-        int i;
-
-        for (i = 0; i <= RGN_HPAGE; i++) {  /*  assume HPAGE is last region */
-                cch->asid[i] = (asidval++);
-#if 0
-                /* ZZZ hugepages not supported yet */
-                if (i == RGN_HPAGE)
-                        cch->sizeavail[i] = GRU_SIZEAVAIL(hpage_shift);
-                else
-#endif
-                        cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT);
-        }
-}
-#elif defined CONFIG_X86_64
-static inline void cch_allocate_set_asids(
-                  struct gru_context_configuration_handle *cch, int asidval)
-{
-        int i;
-
-        for (i = 0; i < 8; i++) {
-                cch->asid[i] = asidval++;
-                cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT) |
-                        GRU_SIZEAVAIL(21);
-        }
-}
-#endif
-
-static inline int cch_allocate(struct gru_context_configuration_handle *cch,
-                               int asidval, unsigned long cbrmap,
-                               unsigned long dsrmap)
-{
-        cch_allocate_set_asids(cch, asidval);
-        cch->dsr_allocation_map = dsrmap;
-        cch->cbr_allocation_map = cbrmap;
-        cch->opc = CCHOP_ALLOCATE;
-        start_instruction(cch);
-        return wait_instruction_complete(cch);
-}
-
-static inline int cch_start(struct gru_context_configuration_handle *cch)
-{
-        cch->opc = CCHOP_START;
-        start_instruction(cch);
-        return wait_instruction_complete(cch);
-}
-
-static inline int cch_interrupt(struct gru_context_configuration_handle *cch)
-{
-        cch->opc = CCHOP_INTERRUPT;
-        start_instruction(cch);
-        return wait_instruction_complete(cch);
-}
-
-static inline int cch_deallocate(struct gru_context_configuration_handle *cch)
-{
-        cch->opc = CCHOP_DEALLOCATE;
-        start_instruction(cch);
-        return wait_instruction_complete(cch);
-}
-
-static inline int cch_interrupt_sync(struct gru_context_configuration_handle
-                                     *cch)
-{
-        cch->opc = CCHOP_INTERRUPT_SYNC;
-        start_instruction(cch);
-        return wait_instruction_complete(cch);
-}
-
-static inline int tgh_invalidate(struct gru_tlb_global_handle *tgh,
-                                 unsigned long vaddr, unsigned long vaddrmask,
-                                 int asid, int pagesize, int global, int n,
-                                 unsigned short ctxbitmap)
-{
-        tgh->vaddr = vaddr;
-        tgh->asid = asid;
-        tgh->pagesize = pagesize;
-        tgh->n = n;
-        tgh->global = global;
-        tgh->vaddrmask = vaddrmask;
-        tgh->ctxbitmap = ctxbitmap;
-        tgh->opc = TGHOP_TLBINV;
-        start_instruction(tgh);
-        return wait_instruction_complete(tgh);
-}
-
-static inline void tfh_write_only(struct gru_tlb_fault_handle *tfh,
-                                  unsigned long pfn, unsigned long vaddr,
-                                  int asid, int dirty, int pagesize)
-{
-        tfh->fillasid = asid;
-        tfh->fillvaddr = vaddr;
-        tfh->pfn = pfn;
-        tfh->dirty = dirty;
-        tfh->pagesize = pagesize;
-        tfh->opc = TFHOP_WRITE_ONLY;
-        start_instruction(tfh);
-}
-
-static inline void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
-                                     unsigned long paddr, int gaa,
-                                     unsigned long vaddr, int asid, int dirty,
-                                     int pagesize)
-{
-        tfh->fillasid = asid;
-        tfh->fillvaddr = vaddr;
-        tfh->pfn = paddr >> GRU_PADDR_SHIFT;
-        tfh->gaa = gaa;
-        tfh->dirty = dirty;
-        tfh->pagesize = pagesize;
-        tfh->opc = TFHOP_WRITE_RESTART;
-        start_instruction(tfh);
-}
-
-static inline void tfh_restart(struct gru_tlb_fault_handle *tfh)
-{
-        tfh->opc = TFHOP_RESTART;
-        start_instruction(tfh);
-}
-
-static inline void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
-{
-        tfh->opc = TFHOP_USER_POLLING_MODE;
-        start_instruction(tfh);
-}
-
-static inline void tfh_exception(struct gru_tlb_fault_handle *tfh)
-{
-        tfh->opc = TFHOP_EXCEPTION;
-        start_instruction(tfh);
-}
+int cch_allocate(struct gru_context_configuration_handle *cch,
+        int asidval, int sizeavail, unsigned long cbrmap, unsigned long dsrmap);
+
+int cch_start(struct gru_context_configuration_handle *cch);
+int cch_interrupt(struct gru_context_configuration_handle *cch);
+int cch_deallocate(struct gru_context_configuration_handle *cch);
+int cch_interrupt_sync(struct gru_context_configuration_handle *cch);
+int tgh_invalidate(struct gru_tlb_global_handle *tgh, unsigned long vaddr,
+        unsigned long vaddrmask, int asid, int pagesize, int global, int n,
+        unsigned short ctxbitmap);
+void tfh_write_only(struct gru_tlb_fault_handle *tfh, unsigned long pfn,
+        unsigned long vaddr, int asid, int dirty, int pagesize);
+void tfh_write_restart(struct gru_tlb_fault_handle *tfh, unsigned long paddr,
+        int gaa, unsigned long vaddr, int asid, int dirty, int pagesize);
+void tfh_restart(struct gru_tlb_fault_handle *tfh);
+void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh);
+void tfh_exception(struct gru_tlb_fault_handle *tfh);
 
 #endif /* __GRUHANDLES_H__ */
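With the handle operations moved out of line, cch_allocate() now takes an explicit sizeavail mask built with GRU_SIZEAVAIL() rather than hard-coding it per architecture; the removed x86_64 inline above used GRU_SIZEAVAIL(PAGE_SHIFT) | GRU_SIZEAVAIL(21). A quick standalone check of the shift-to-pagesize-code encoding documented in the table above the macro (PAGE_SHIFT of 12 is assumed):

/* Verifies the GRU_PAGESIZE()/GRU_SIZEAVAIL() encoding, e.g. shift 26
 * (64MB) -> code 8 per the header's table.  PAGE_SHIFT=12 is assumed. */
#include <stdio.h>

#define GRU_PAGESIZE(sh)        ((((sh) > 20 ? (sh) + 2 : (sh)) >> 1) - 6)
#define GRU_SIZEAVAIL(sh)       (1UL << GRU_PAGESIZE(sh))
#define PAGE_SHIFT              12

int main(void)
{
        int shifts[] = { 12, 16, 20, 21, 26 };  /* 4k, 64k, 1m, 2m, 64m */
        unsigned int i;

        for (i = 0; i < sizeof(shifts) / sizeof(shifts[0]); i++)
                printf("shift %2d -> code %d\n",
                       shifts[i], GRU_PAGESIZE(shifts[i]));

        /* mask the removed x86_64 inline passed for every ASID */
        printf("sizeavail = 0x%lx\n",
               GRU_SIZEAVAIL(PAGE_SHIFT) | GRU_SIZEAVAIL(21));
        return 0;
}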
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 880c55dfb66..d8bd7d84a7c 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -52,8 +52,10 @@
  */
 
 /* Blade percpu resources PERMANENTLY reserved for kernel use */
-#define GRU_NUM_KERNEL_CBR      1
+#define GRU_NUM_KERNEL_CBR      1
 #define GRU_NUM_KERNEL_DSR_BYTES 256
+#define GRU_NUM_KERNEL_DSR_CL   (GRU_NUM_KERNEL_DSR_BYTES /             \
+                                        GRU_CACHE_LINE_BYTES)
 #define KERNEL_CTXNUM           15
 
 /* GRU instruction attributes for all instructions */
@@ -94,7 +96,6 @@ struct message_header {
         char    fill;
 };
 
-#define QLINES(mq)      ((mq) + offsetof(struct message_queue, qlines))
 #define HSTATUS(mq, h)  ((mq) + offsetof(struct message_queue, hstatus[h]))
 
 static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
@@ -122,7 +123,7 @@ int gru_get_cb_exception_detail(void *cb,
         struct gru_control_block_extended *cbe;
 
         cbe = get_cbe(GRUBASE(cb), get_cb_number(cb));
-        prefetchw(cbe); /* Harmless on hardware, required for emulator */
+        prefetchw(cbe);         /* Harmless on hardware, required for emulator */
         excdet->opc = cbe->opccpy;
         excdet->exopc = cbe->exopccpy;
         excdet->ecause = cbe->ecause;
@@ -250,7 +251,8 @@ static inline void restore_present2(void *p, int val)
  * Create a message queue.
  *      qlines - message queue size in cache lines. Includes 2-line header.
  */
-int gru_create_message_queue(void *p, unsigned int bytes)
+int gru_create_message_queue(struct gru_message_queue_desc *mqd,
+                void *p, unsigned int bytes, int nasid, int vector, int apicid)
 {
         struct message_queue *mq = p;
         unsigned int qlines;
@@ -265,6 +267,12 @@ int gru_create_message_queue(void *p, unsigned int bytes)
         mq->hstatus[0] = 0;
         mq->hstatus[1] = 1;
         mq->head = gru_mesq_head(2, qlines / 2 + 1);
+        mqd->mq = mq;
+        mqd->mq_gpa = uv_gpa(mq);
+        mqd->qlines = qlines;
+        mqd->interrupt_pnode = UV_NASID_TO_PNODE(nasid);
+        mqd->interrupt_vector = vector;
+        mqd->interrupt_apicid = apicid;
         return 0;
 }
 EXPORT_SYMBOL_GPL(gru_create_message_queue);
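gru_create_message_queue() now records everything about the queue in a gru_message_queue_desc (declared in grukservices.h, also updated by this patch), so later sends can read qlines from the descriptor instead of fetching it from the queue header with a GRU load, and error paths know where to deliver an explicit interrupt. The sketch below mirrors only the descriptor bookkeeping; fake_gpa(), fake_nasid_to_pnode(), the cache-line size, and the example values are stand-ins, not the driver's API:

/* Hedged sketch of the descriptor setup done above; fake_gpa() and
 * fake_nasid_to_pnode() stand in for uv_gpa()/UV_NASID_TO_PNODE(). */
#include <stdio.h>
#include <stdlib.h>

#define GRU_CACHE_LINE_BYTES    64      /* assumed line size */

struct gru_message_queue_desc {         /* fields mirrored from the hunk */
        void *mq;
        unsigned long mq_gpa;
        int qlines;
        int interrupt_pnode;
        int interrupt_vector;
        int interrupt_apicid;
};

static unsigned long fake_gpa(void *p) { return (unsigned long)p; }
static int fake_nasid_to_pnode(int nasid) { return nasid >> 1; }

static int create_mq_desc(struct gru_message_queue_desc *mqd, void *p,
                unsigned int bytes, int nasid, int vector, int apicid)
{
        mqd->mq = p;
        mqd->mq_gpa = fake_gpa(p);
        mqd->qlines = bytes / GRU_CACHE_LINE_BYTES;     /* incl. 2-line header */
        mqd->interrupt_pnode = fake_nasid_to_pnode(nasid);
        mqd->interrupt_vector = vector;
        mqd->interrupt_apicid = apicid;
        return 0;
}

int main(void)
{
        struct gru_message_queue_desc mqd;
        void *buf = calloc(64, GRU_CACHE_LINE_BYTES);   /* 64-line queue */

        create_mq_desc(&mqd, buf, 64 * GRU_CACHE_LINE_BYTES, 6, 0xef, 3);
        printf("qlines=%d pnode=%d vector=0x%x apicid=%d\n", mqd.qlines,
               mqd.interrupt_pnode, mqd.interrupt_vector, mqd.interrupt_apicid);
        free(buf);
        return 0;
}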
@@ -277,8 +285,8 @@ EXPORT_SYMBOL_GPL(gru_create_message_queue);
  *              -1 - if mesq sent successfully but queue not full
  *              >0 - unexpected error. MQE_xxx returned
  */
-static int send_noop_message(void *cb,
-                                unsigned long mq, void *mesg)
+static int send_noop_message(void *cb, struct gru_message_queue_desc *mqd,
+                                void *mesg)
 {
         const struct message_header noop_header = {
                                         .present = MQS_NOOP, .lines = 1};
@@ -289,7 +297,7 @@ static int send_noop_message(void *cb,
         STAT(mesq_noop);
         save_mhdr = *mhdr;
         *mhdr = noop_header;
-        gru_mesq(cb, mq, gru_get_tri(mhdr), 1, IMA);
+        gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), 1, IMA);
         ret = gru_wait(cb);
 
         if (ret) {
@@ -313,7 +321,7 @@ static int send_noop_message(void *cb,
                         break;
                 case CBSS_PUT_NACKED:
                         STAT(mesq_noop_put_nacked);
-                        m = mq + (gru_get_amo_value_head(cb) << 6);
+                        m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
                         gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
                                                 IMA);
                         if (gru_wait(cb) == CBS_IDLE)
@@ -333,30 +341,20 @@ static int send_noop_message(void *cb,
 /*
  * Handle a gru_mesq full.
  */
-static int send_message_queue_full(void *cb,
-                           unsigned long mq, void *mesg, int lines)
+static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd,
+                                void *mesg, int lines)
 {
         union gru_mesqhead mqh;
         unsigned int limit, head;
         unsigned long avalue;
-        int half, qlines, save;
+        int half, qlines;
 
         /* Determine if switching to first/second half of q */
         avalue = gru_get_amo_value(cb);
         head = gru_get_amo_value_head(cb);
         limit = gru_get_amo_value_limit(cb);
-
-        /*
-         * Fetch "qlines" from the queue header. Since the queue may be
-         * in memory that can't be accessed using socket addresses, use
-         * the GRU to access the data. Use DSR space from the message.
-         */
-        save = *(int *)mesg;
-        gru_vload(cb, QLINES(mq), gru_get_tri(mesg), XTYPE_W, 1, 1, IMA);
-        if (gru_wait(cb) != CBS_IDLE)
-                goto cberr;
-        qlines = *(int *)mesg;
-        *(int *)mesg = save;
+        qlines = mqd->qlines;
         half = (limit != qlines);
 
         if (half)
@@ -365,7 +363,7 @@ static int send_message_queue_full(void *cb,
                 mqh = gru_mesq_head(2, qlines / 2 + 1);
 
         /* Try to get lock for switching head pointer */
-        gru_gamir(cb, EOP_IR_CLR, HSTATUS(mq, half), XTYPE_DW, IMA);
+        gru_gamir(cb, EOP_IR_CLR, HSTATUS(mqd->mq_gpa, half), XTYPE_DW, IMA);
         if (gru_wait(cb) != CBS_IDLE)
                 goto cberr;
         if (!gru_get_amo_value(cb)) {
@@ -375,8 +373,8 @@ static int send_message_queue_full(void *cb,
 
         /* Got the lock. Send optional NOP if queue not full, */
         if (head != limit) {
-                if (send_noop_message(cb, mq, mesg)) {
-                        gru_gamir(cb, EOP_IR_INC, HSTATUS(mq, half),
+                if (send_noop_message(cb, mqd, mesg)) {
+                        gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half),
                                         XTYPE_DW, IMA);
                         if (gru_wait(cb) != CBS_IDLE)
                                 goto cberr;
@@ -387,14 +385,16 @@ static int send_message_queue_full(void *cb,
         }
 
         /* Then flip queuehead to other half of queue. */
-        gru_gamer(cb, EOP_ERR_CSWAP, mq, XTYPE_DW, mqh.val, avalue, IMA);
+        gru_gamer(cb, EOP_ERR_CSWAP, mqd->mq_gpa, XTYPE_DW, mqh.val, avalue,
+                                IMA);
         if (gru_wait(cb) != CBS_IDLE)
                 goto cberr;
 
         /* If not successfully in swapping queue head, clear the hstatus lock */
         if (gru_get_amo_value(cb) != avalue) {
                 STAT(mesq_qf_switch_head_failed);
-                gru_gamir(cb, EOP_IR_INC, HSTATUS(mq, half), XTYPE_DW, IMA);
+                gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half), XTYPE_DW,
+                                        IMA);
                 if (gru_wait(cb) != CBS_IDLE)
                         goto cberr;
         }
@@ -404,15 +404,25 @@ cberr:
         return MQE_UNEXPECTED_CB_ERR;
 }
 
+/*
+ * Send a cross-partition interrupt to the SSI that contains the target
+ * message queue. Normally, the interrupt is automatically delivered by
+ * hardware but some error conditions require explicit delivery.
+ */
+static void send_message_queue_interrupt(struct gru_message_queue_desc *mqd)
+{
+        if (mqd->interrupt_vector)
+                uv_hub_send_ipi(mqd->interrupt_pnode, mqd->interrupt_apicid,
+                                mqd->interrupt_vector);
+}
+
 /*
  * Handle a gru_mesq failure. Some of these failures are software recoverable
  * or retryable.
  */
-static int send_message_failure(void *cb,
-                        unsigned long mq,
-                        void *mesg,
-                        int lines)
+static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
+                                void *mesg, int lines)
 {
         int substatus, ret = 0;
         unsigned long m;
@@ -429,7 +439,7 @@ static int send_message_failure(void *cb,
                 break;
         case CBSS_QLIMIT_REACHED: