14 files changed, 6462 insertions, 64 deletions
diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile
index 4f381fb2504..4351c4ff984 100644
--- a/arch/ia64/sn/kernel/Makefile
+++ b/arch/ia64/sn/kernel/Makefile
@@ -4,10 +4,15 @@
 # License.  See the file "COPYING" in the main directory of this archive
 # for more details.
 #
-# Copyright (C) 1999,2001-2003 Silicon Graphics, Inc.  All Rights Reserved.
+# Copyright (C) 1999,2001-2005 Silicon Graphics, Inc.  All Rights Reserved.
 #
 
 obj-y				+= setup.o bte.o bte_error.o irq.o mca.o idle.o \
 				   huberror.o io_init.o iomv.o klconflib.o sn2/
 obj-$(CONFIG_IA64_GENERIC)      += machvec.o
 obj-$(CONFIG_SGI_TIOCX)		+= tiocx.o
+obj-$(CONFIG_IA64_SGI_SN_XP)	+= xp.o
+xp-y				:= xp_main.o xp_nofault.o
+obj-$(CONFIG_IA64_SGI_SN_XP)	+= xpc.o
+xpc-y				:= xpc_main.o xpc_channel.o xpc_partition.o
+obj-$(CONFIG_IA64_SGI_SN_XP)	+= xpnet.o
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 18160a06a8c..9e07f5463f2 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -174,6 +174,12 @@ static void sn_fixup_ionodes(void)
 		if (status)
 			continue;
 
+		/* Attach the error interrupt handlers */
+		if (nasid & 1)
+			ice_error_init(hubdev);
+		else
+			hub_error_init(hubdev);
+
 		for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++)
 			hubdev->hdi_xwidget_info[widget].xwi_hubinfo = hubdev;
 
@@ -211,10 +217,6 @@ static void sn_fixup_ionodes(void)
 			    sn_flush_device_list;
 		}
 
-		if (!(i & 1))
-			hub_error_init(hubdev);
-		else
-			ice_error_init(hubdev);
 	}
 
 }
diff --git a/arch/ia64/sn/kernel/mca.c b/arch/ia64/sn/kernel/mca.c
index 857774bb2c9..6546db6abdb 100644
--- a/arch/ia64/sn/kernel/mca.c
+++ b/arch/ia64/sn/kernel/mca.c
@@ -37,6 +37,11 @@ static u64 *sn_oemdata_size, sn_oemdata_bufsize;
  * This function is the callback routine that SAL calls to log error
  * info for platform errors.  buf is appended to sn_oemdata, resizing as
  * required.
+ * Note: this is a SAL to OS callback, running under the same rules as the SAL
+ * code.  SAL calls are run with preempt disabled so this routine must not
+ * sleep.  vmalloc can sleep so print_hook cannot resize the output buffer
+ * itself, instead it must set the required size and return to let the caller
+ * resize the buffer then redrive the SAL call.
  */
 static int print_hook(const char *fmt, ...)
 {
@@ -47,18 +52,8 @@ static int print_hook(const char *fmt, ...)
 	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 	len = strlen(buf);
-	while (*sn_oemdata_size + len + 1 > sn_oemdata_bufsize) {
-		u8 *newbuf = vmalloc(sn_oemdata_bufsize += 1000);
-		if (!newbuf) {
-			printk(KERN_ERR "%s: unable to extend sn_oemdata\n",
-			       __FUNCTION__);
-			return 0;
-		}
-		memcpy(newbuf, *sn_oemdata, *sn_oemdata_size);
-		vfree(*sn_oemdata);
-		*sn_oemdata = newbuf;
-	}
-	memcpy(*sn_oemdata + *sn_oemdata_size, buf, len + 1);
+	if (*sn_oemdata_size + len <= sn_oemdata_bufsize)
+		memcpy(*sn_oemdata + *sn_oemdata_size, buf, len);
 	*sn_oemdata_size += len;
 	return 0;
 }
@@ -98,7 +93,20 @@ sn_platform_plat_specific_err_print(const u8 * sect_header, u8 ** oemdata,
 	sn_oemdata = oemdata;
 	sn_oemdata_size = oemdata_size;
 	sn_oemdata_bufsize = 0;
-	ia64_sn_plat_specific_err_print(print_hook, (char *)sect_header);
+	*sn_oemdata_size = PAGE_SIZE;	/* first guess at how much data will be generated */
+	while (*sn_oemdata_size > sn_oemdata_bufsize) {
+		u8 *newbuf = vmalloc(*sn_oemdata_size);
+		if (!newbuf) {
+			printk(KERN_ERR "%s: unable to extend sn_oemdata\n",
+			       __FUNCTION__);
+			return 1;
+		}
+		vfree(*sn_oemdata);
+		*sn_oemdata = newbuf;
+		sn_oemdata_bufsize = *sn_oemdata_size;
+		*sn_oemdata_size = 0;
+		ia64_sn_plat_specific_err_print(print_hook, (char *)sect_header);
+	}
 	up(&sn_oemdata_mutex);
 	return 0;
 }
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index d35f2a6f9c9..4fb44984afe 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 1999,2001-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 1999,2001-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/config.h>
@@ -73,6 +73,12 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info);
 EXPORT_PER_CPU_SYMBOL(__sn_hub_info);
 
+DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_NUMNODES]);
+EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid);
+
+DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda);
+EXPORT_PER_CPU_SYMBOL(__sn_nodepda);
+
 partid_t sn_partid = -1;
 EXPORT_SYMBOL(sn_partid);
 char sn_system_serial_number_string[128];
@@ -373,11 +379,11 @@ static void __init sn_init_pdas(char **cmdline_p)
 {
 	cnodeid_t cnode;
 
-	memset(pda->cnodeid_to_nasid_table, -1,
-	       sizeof(pda->cnodeid_to_nasid_table));
+	memset(sn_cnodeid_to_nasid, -1,
+			sizeof(__ia64_per_cpu_var(__sn_cnodeid_to_nasid)));
 	for_each_online_node(cnode)
-		pda->cnodeid_to_nasid_table[cnode] =
-		    pxm_to_nasid(nid_to_pxm_map[cnode]);
+		sn_cnodeid_to_nasid[cnode] =
+				pxm_to_nasid(nid_to_pxm_map[cnode]);
 
 	numionodes = num_online_nodes();
 	scan_for_ionodes();
@@ -477,7 +483,8 @@ void __init sn_cpu_init(void)
 
 	cnode = nasid_to_cnodeid(nasid);
 
-	pda->p_nodepda = nodepdaindr[cnode];
+	sn_nodepda = nodepdaindr[cnode];
+
 	pda->led_address =
 	    (typeof(pda->led_address)) (LED0 + (slice << LED_CPU_SHIFT));
 	pda->led_state = LED_ALWAYS_SET;
@@ -486,15 +493,18 @@ void __init sn_cpu_init(void)
 	pda->idle_flag = 0;
 
 	if (cpuid != 0) {
-		memcpy(pda->cnodeid_to_nasid_table,
-		       pdacpu(0)->cnodeid_to_nasid_table,
-		       sizeof(pda->cnodeid_to_nasid_table));
+		/* copy cpu 0's sn_cnodeid_to_nasid table to this cpu's */
+		memcpy(sn_cnodeid_to_nasid,
+		       (&per_cpu(__sn_cnodeid_to_nasid, 0)),
+		       sizeof(__ia64_per_cpu_var(__sn_cnodeid_to_nasid)));
 	}
 
 	/*
 	 * Check for WARs.
 	 * Only needs to be done once, on BSP.
-	 * Has to be done after loop above, because it uses pda.cnodeid_to_nasid_table[i].
+	 * Has to be done after loop above, because it uses this cpu's
+	 * sn_cnodeid_to_nasid table which was just initialized if this
+	 * isn't cpu 0.
 	 * Has to be done before assignment below.
 	 */
 	if (!wars_have_been_checked) {
@@ -580,8 +590,7 @@ static void __init scan_for_ionodes(void)
 		brd = find_lboard_any(brd, KLTYPE_SNIA);
 
 		while (brd) {
-			pda->cnodeid_to_nasid_table[numionodes] =
-			    brd->brd_nasid;
+			sn_cnodeid_to_nasid[numionodes] = brd->brd_nasid;
 			physical_node_map[brd->brd_nasid] = numionodes;
 			root_lboard[numionodes] = brd;
 			numionodes++;
@@ -602,8 +611,7 @@ static void __init scan_for_ionodes(void)
 				      root_lboard[nasid_to_cnodeid(nasid)],
 				      KLTYPE_TIO);
 		while (brd) {
-			pda->cnodeid_to_nasid_table[numionodes] =
-			    brd->brd_nasid;
+			sn_cnodeid_to_nasid[numionodes] = brd->brd_nasid;
 			physical_node_map[brd->brd_nasid] = numionodes;
 			root_lboard[numionodes] = brd;
 			numionodes++;
@@ -614,7 +622,6 @@ static void __init scan_for_ionodes(void)
 			brd = find_lboard_any(brd, KLTYPE_TIO);
 		}
 	}
-
 }
 
 int
@@ -623,7 +630,8 @@ nasid_slice_to_cpuid(int nasid, int slice)
 	long cpu;
 	
 	for (cpu=0; cpu < NR_CPUS; cpu++) 
-		if (nodepda->phys_cpuid[cpu].nasid == nasid && nodepda->phys_cpuid[cpu].slice == slice)
+		if (cpuid_to_nasid(cpu) == nasid &&
+					cpuid_to_slice(cpu) == slice)
 			return cpu;
 
 	return -1;
diff --git a/arch/ia64/sn/kernel/tiocx.c b/arch/ia64/sn/kernel/tiocx.c
index 66190d7e492..ab9b5f35c2a 100644
--- a/arch/ia64/sn/kernel/tiocx.c
+++ b/arch/ia64/sn/kernel/tiocx.c
@@ -21,6 +21,8 @@
 #include <asm/sn/types.h>
 #include <asm/sn/shubio.h>
 #include <asm/sn/tiocx.h>
+#include <asm/sn/l1.h>
+#include <asm/sn/module.h>
 #include "tio.h"
 #include "xtalk/xwidgetdev.h"
 #include "xtalk/hubdev.h"
@@ -308,14 +310,12 @@ void tiocx_irq_free(struct sn_irq_info *sn_irq_info)
 	}
 }
 
-uint64_t
-tiocx_dma_addr(uint64_t addr)
+uint64_t tiocx_dma_addr(uint64_t addr)
 {
 	return PHYS_TO_TIODMA(addr);
 }
 
-uint64_t
-tiocx_swin_base(int nasid)
+uint64_t tiocx_swin_base(int nasid)
 {
 	return TIO_SWIN_BASE(nasid, TIOCX_CORELET);
 }
@@ -330,19 +330,6 @@ EXPORT_SYMBOL(tiocx_bus_type);
 EXPORT_SYMBOL(tiocx_dma_addr);
 EXPORT_SYMBOL(tiocx_swin_base);
 
-static uint64_t tiocx_get_hubdev_info(u64 handle, u64 address)
-{
-
-	struct ia64_sal_retval ret_stuff;
-	ret_stuff.status = 0;
-	ret_stuff.v0 = 0;
-
-	ia64_sal_oemcall_nolock(&ret_stuff,
-				SN_SAL_IOIF_GET_HUBDEV_INFO,
-				handle, address, 0, 0, 0, 0, 0);
-	return ret_stuff.v0;
-}
-
 static void tio_conveyor_set(nasid_t nasid, int enable_flag)
 {
 	uint64_t ice_frz;
@@ -379,7 +366,29 @@ static void tio_corelet_reset(nasid_t nasid, int corelet)
 	udelay(2000);
 }
 
-static int fpga_attached(nasid_t nasid)
+static int tiocx_btchar_get(int nasid)
+{
+	moduleid_t module_id;
+	geoid_t geoid;
+	int cnodeid;
+
+	cnodeid = nasid_to_cnodeid(nasid);
+	geoid = cnodeid_get_geoid(cnodeid);
+	module_id = geo_module(geoid);
+	return MODULE_GET_BTCHAR(module_id);
+}
+
+static int is_fpga_brick(int nasid)
+{
+	switch (tiocx_btchar_get(nasid)) {
+	case L1_BRICKTYPE_SA:
+	case L1_BRICKTYPE_ATHENA:
+		return 1;
+	}
+	return 0;
+}
+
+static int bitstream_loaded(nasid_t nasid)
 {
 	uint64_t cx_credits;
 
@@ -396,7 +405,7 @@ static int tiocx_reload(struct cx_dev *cx_dev)
 	int mfg_num = CX_DEV_NONE;
 	nasid_t nasid = cx_dev->cx_id.nasid;
 
-	if (fpga_attached(nasid)) {
+	if (bitstream_loaded(nasid)) {
 		uint64_t cx_id;
 
 		cx_id =
@@ -427,9 +436,10 @@ static ssize_t show_cxdev_control(struct device *dev, char *buf)
 {
 	struct cx_dev *cx_dev = to_cx_dev(dev);
 
-	return sprintf(buf, "0x%x 0x%x 0x%x\n",
+	return sprintf(buf, "0x%x 0x%x 0x%x %d\n",
 		       cx_dev->cx_id.nasid,
-		       cx_dev->cx_id.part_num, cx_dev->cx_id.mfg_num);
+		       cx_dev->cx_id.part_num, cx_dev->cx_id.mfg_num,
+		       tiocx_btchar_get(cx_dev->cx_id.nasid));
 }
 
 static ssize_t store_cxdev_control(struct device *dev, const char *buf,
@@ -475,20 +485,14 @@ static int __init tiocx_init(void)
 		if ((nasid = cnodeid_to_nasid(cnodeid)) < 0)
 			break;	/* No more nasids .. bail out of loop */
 
-		if (nasid & 0x1) {	/* TIO's are always odd */
+		if ((nasid & 0x1) && is_fpga_brick(nasid)) {
 			struct hubdev_info *hubdev;
-			uint64_t status;
 			struct xwidget_info *widgetp;
 
 			DBG("Found TIO at nasid 0x%x\n", nasid);
 
 			hubdev =
 			    (struct hubdev_info *)(NODEPDA(cnodeid)->pdinfo);
-			status =
-			    tiocx_get_hubdev_info(nasid,
-						  (uint64_t) __pa(hubdev));
-			if (status)
-				continue;
 
 			widgetp = &hubdev->hdi_xwidget_info[TIOCX_CORELET];
 
diff --git a/arch/ia64/sn/kernel/xp_main.c b/arch/ia64/sn/kernel/xp_main.c
new file mode 100644
index 00000000000..3be52a34c80
--- /dev/null
+++ b/arch/ia64/sn/kernel/xp_main.c
@@ -0,0 +1,289 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+
+/*
+ * Cross Partition (XP) base.
+ *
+ *	XP provides a base from which its users can interact
+ *	with XPC, yet not be dependent on XPC.
+ *
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <asm/sn/intr.h>
+#include <asm/sn/sn_sal.h>
+#include <asm/sn/xp.h>
+
+
+/*
+ * Target of nofault PIO read.
+ */
+u64 xp_nofault_PIOR_target;
+
+
+/*
+ * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
+ * users of XPC.
+ */
+struct xpc_registration xpc_registrations[XPC_NCHANNELS];
+
+
+/*
+ * Initialize the XPC interface to indicate that XPC isn't loaded.
+ */
+static enum xpc_retval xpc_notloaded(void) { return xpcNotLoaded; }
+
+struct xpc_interface xpc_interface = {
+	(void (*)(int)) xpc_notloaded,
+	(void (*)(int)) xpc_notloaded,
+	(enum xpc_retval (*)(partid_t, int, u32, void **)) xpc_notloaded,
+	(enum xpc_retval (*)(partid_t, int, void *)) xpc_notloaded,
+	(enum xpc_retval (*)(partid_t, int, void *, xpc_notify_func, void *))
+							xpc_notloaded,
+	(void (*)(partid_t, int, void *)) xpc_notloaded,
+	(enum xpc_retval (*)(partid_t, void *)) xpc_notloaded
+};
+
+
+/*
+ * XPC calls this when it (the XPC module) has been loaded.
+ */
+void
+xpc_set_interface(void (*connect)(int),
+		void (*disconnect)(int),
+		enum xpc_retval (*allocate)(partid_t, int, u32, void **),
+		enum xpc_retval (*send)(partid_t, int, void *),
+		enum xpc_retval (*send_notify)(partid_t, int, void *,
+						xpc_notify_func, void *),
+		void (*received)(partid_t, int, void *),
+		enum xpc_retval (*partid_to_nasids)(partid_t, void *))
+{
+	xpc_interface.connect = connect;
+	xpc_interface.disconnect = disconnect;
+	xpc_interface.allocate = allocate;
+	xpc_interface.send = send;
+	xpc_interface.send_notify = send_notify;
+	xpc_interface.received = received;
+	xpc_interface.partid_to_nasids = partid_to_nasids;
+}
+
+
+/*
+ * XPC calls this when it (the XPC module) is being unloaded.
+ */
+void
+xpc_clear_interface(void)
+{
+	xpc_interface.connect = (void (*)(int)) xpc_notloaded;
+	xpc_interface.disconnect = (void (*)(int)) xpc_notloaded;
+	xpc_interface.allocate = (enum xpc_retval (*)(partid_t, int, u32,
+					void **)) xpc_notloaded;
+	xpc_interface.send = (enum xpc_retval (*)(partid_t, int, void *))
+					xpc_notloaded;
+	xpc_interface.send_notify = (enum xpc_retval (*)(partid_t, int, void *,
+				    xpc_notify_func, void *)) xpc_notloaded;
+	xpc_interface.received = (void (*)(partid_t, int, void *))
+					xpc_notloaded;
+	xpc_interface.partid_to_nasids = (enum xpc_retval (*)(partid_t, void *))
+					xpc_notloaded;
+}
+
+
+/*
+ * Register for automatic establishment of a channel connection whenever
+ * a partition comes up.
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to register for connection.
+ *	func - function to call for asynchronous notification of channel
+ *	       state changes (i.e., connection, disconnection, error) and
+ *	       the arrival of incoming messages.
+ *      key - pointer to optional user-defined value that gets passed back
+ *	      to the user on any callouts made to func.
+ *	payload_size - size in bytes of the XPC message's payload area which
+ *		       contains a user-defined message. The user should make
+ *		       this large enough to hold their largest message.
+ *	nentries - max #of XPC message entries a message queue can contain.
+ *		   The actual number, which is determined when a connection
+ * 		   is established and may be less then requested, will be
+ *		   passed to the user via the xpcConnected callout.
+ *	assigned_limit - max number of kthreads allowed to be processing
+ * 			 messages (per connection) at any given instant.
+ *	idle_limit - max number of kthreads allowed to be idle at any given
+ * 		     instant.
+ */
+enum xpc_retval
+xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
+		u16 nentries, u32 assigned_limit, u32 idle_limit)
+{
+	struct xpc_registration *registration;
+
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
+	DBUG_ON(payload_size == 0 || nentries == 0);
+	DBUG_ON(func == NULL);
+	DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
+
+	registration = &xpc_registrations[ch_number];
+
+	if (down_interruptible(&registration->sema) != 0) {
+		return xpcInterrupted;
+	}
+
+	/* if XPC_CHANNEL_REGISTERED(ch_number) */
+	if (registration->func != NULL) {
+		up(&registration->sema);
+		return xpcAlreadyRegistered;
+	}
+
+	/* register the channel for connection */
+	registration->msg_size = XPC_MSG_SIZE(payload_size);
+	registration->nentries = nentries;
+	registration->assigned_limit = assigned_limit;
+	registration->idle_limit = idle_limit;
+	registration->key = key;
+	registration->func = func;
+
+	up(&registration->sema);
+
+	xpc_interface.connect(ch_number);
+
+	return xpcSuccess;
+}
+
+
+/*
+ * Remove the registration for automatic connection of the specified channel
+ * when a partition comes up.
+ *
+ * Before returning this xpc_disconnect() will wait for all connections on the
+ * specified channel have been closed/torndown. So the caller can be assured
+ * that they will not be receiving any more callouts from XPC to their
+ * function registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to unregister.
+ */
+void
+xpc_disconnect(int ch_number)
+{
+	struct xpc_registration *registration;
+
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_NCHANNELS);
+
+	registration = &xpc_registrations[ch_number];
+
+	/*
+	 * We've decided not to make this a down_interruptible(), since we
+	 * figured XPC's users will just turn around and call xpc_disconnect()
+	 * again anyways, so we might as well wait, if need be.
+	 */
+	down(&registration->sema);
+
+	/* if !XPC_CHANNEL_REGISTERED(ch_number) */
+	if (registration->func == NULL) {
+		up(&registration->sema);
+		return;
+	}
+
+	/* remove the connection registration for the specified channel */
+	registration->func = NULL;
+	registration->key = NULL;
+	registration->nentries = 0;
+	registration->msg_size = 0;
+	registration->assigned_limit = 0;
+	registration->idle_limit = 0;
+
+	xpc_interface.disconnect(ch_number);
+
+	up(&registration->sema);
+
+	return;
+}
+
+
+int __init
+xp_init(void)
+{
+	int ret, ch_number;
+	u64 func_addr = *(u64 *) xp_nofault_PIOR;
+	u64 err_func_addr = *(u64 *) xp_error_PIOR;
+
+
+	if (!ia64_platform_is("sn2")) {
+		return -ENODEV;
+	}
+
+	/*
+	 * Register a nofault code region which performs a cross-partition
+	 * PIO read. If the PIO read times out, the MCA handler will consume
+	 * the error and return to a kernel-provided instruction to indicate
+	 * an error. This PIO read exists because it is guaranteed to timeout
+	 * if the destination is down (AMO operations do not timeout on at
+	 * least some CPUs on Shubs <= v1.2, which unfortunately we have to
+	 * work around).
+	 */
+	if ((ret = sn_register_nofault_code(func_addr, err_func_addr,
+						err_func_addr, 1, 1)) != 0) {
+		printk(KERN_ERR "XP: can't register nofault code, error=%d\n",
+			ret);
+	}
+	/*
+	 * Setup the nofault PIO read target. (There is no special reason why
+	 * SH_IPI_ACCESS was selected.)
+	 */
+	if (is_shub2()) {
+		xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
+	} else {
+		xp_nofault_PIOR_target = SH1_IPI_ACCESS;
+	}
+
+	/* initialize the connection registration semaphores */
+	for (ch_number = 0; ch_number < XPC_NCHANNELS; ch_number++) {
+		sema_init(&xpc_registrations[ch_number].sema, 1);  /* mutex */
+	}
+
+	return 0;
+}
+module_init(xp_init);
+
+
+void __exit
+xp_exit(void)
+{
+	u64 func_addr = *(u64 *) xp_nofault_PIOR;
+	u64 err_func_addr = *(u64 *) xp_error_PIOR;
+
+
+	/* unregister the PIO read nofault code region */
+	(void) sn_register_nofault_code(func_addr, err_func_addr,
+					err_func_addr, 1, 0);
+}
+module_exit(xp_exit);
+
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition (XP) base");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL(xp_nofault_PIOR);
+EXPORT_SYMBOL(xp_nofault_PIOR_target);
+EXPORT_SYMBOL(xpc_registrations);
+EXPORT_SYMBOL(xpc_interface);
+EXPORT_SYMBOL(xpc_clear_interface);
+EXPORT_SYMBOL(xpc_set_interface);
+EXPORT_SYMBOL(xpc_connect);
+EXPORT_SYMBOL(xpc_disconnect);
+
diff --git a/arch/ia64/sn/kernel/xp_nofault.S b/arch/ia64/sn/kernel/xp_nofault.S
new file mode 100644
index 00000000000..b772543053c
--- /dev/null
+++ b/arch/ia64/sn/kernel/xp_nofault.S
@@ -0,0 +1,31 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+
+/*
+ * The xp_nofault_PIOR function takes a pointer to a remote PIO register
+ * and attempts to load and consume a value from it.  This function
+ * will be registered as a nofault code block.  In the event that the
+ * PIO read fails, the MCA handler will force the error to look
+ * corrected and vector to the xp_error_PIOR which will return an error.
+ *
+ *	extern int xp_nofault_PIOR(void *remote_register);
+ */
+
+	.global xp_nofault_PIOR
+xp_nofault_PIOR:
+	mov	r8=r0			// Stage a success return value
+	ld8.acq	r9=[r32];;		// PIO Read the specified register
+	adds	r9=1,r9			// Add to force a consume
+	br.ret.sptk.many b0;;		// Return success
+
+	.global xp_error_PIOR
+xp_error_PIOR:
+	mov	r8=1			// Return value of 1
+	br.ret.sptk.many b0;;		// Return failure
+
diff --git a/arch/ia64/sn/kernel/xpc.h b/arch/ia64/sn/kernel/xpc.h
new file mode 100644
index 00000000000..1a0aed8490d
--- /dev/null
+++ b/arch/ia64/sn/kernel/xpc.h
@@ -0,0 +1,991 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+
+/*
+ * Cross Partition Communication (XPC) structures and macros.
+ */
+
+#ifndef _IA64_SN_KERNEL_XPC_H
+#define _IA64_SN_KERNEL_XPC_H
+
+
+#include <linux/config.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/device.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/sn/bte.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/mspec.h>
+#include <asm/sn/shub_mmr.h>
+#include <asm/sn/xp.h>
+
+
+/*
+ * XPC Version numbers consist of a major and minor number. XPC can always
+ * talk to versions with same major #, and never talk to versions with a
+ * different major #.
+ */
+#define _XPC_VERSION(_maj, _min)	(((_maj) << 4) | ((_min) & 0xf))
+#define XPC_VERSION_MAJOR(_v)		((_v) >> 4)
+#define XPC_VERSION_MINOR(_v)		((_v) & 0xf)
+
+
+/*
+ * The next macros define word or bit representations for given
+ * C-brick nasid in either the SAL provided bit array representing
+ * nasids in the partition/machine or the AMO_t array used for
+ * inter-partition initiation communications.
+ *
+ * For SN2 machines, C-Bricks are alway even numbered NASIDs.  As
+ * such, some space will be saved by insisting that nasid information
+ * passed from SAL always be packed for C-Bricks and the
+ * cross-partition interrupts use the same packing scheme.
+ */
+#define XPC_NASID_W_INDEX(_n)	(((_n) / 64) / 2)
+#define XPC_NASID_B_INDEX(_n)	(((_n) / 2) & (64 - 1))
+#define XPC_NASID_IN_ARRAY(_n, _p) ((_p)[XPC_NASID_W_INDEX(_n)] & \
+				    (1UL << XPC_NASID_B_INDEX(_n)))
+#define XPC_NASID_FROM_W_B(_w, _b) (((_w) * 64 + (_b)) * 2)
+
+#define XPC_HB_DEFAULT_INTERVAL		5	/* incr HB every x secs */
+#define XPC_HB_CHECK_DEFAULT_TIMEOUT	20	/* check HB every x secs */
+
+/* define the process name of HB checker and the CPU it is pinned to */
+#define XPC_HB_CHECK_THREAD_NAME	"xpc_hb"
+#define XPC_HB_CHECK_CPU		0
+
+/* define the process name of the discovery thread */
+#define XPC_DISCOVERY_THREAD_NAME	"xpc_discovery"
+
+
+#define XPC_HB_ALLOWED(_p, _v)	((_v)->heartbeating_to_mask & (1UL << (_p)))
+#define XPC_ALLOW_HB(_p, _v)	(_v)->heartbeating_to_mask |= (1UL << (_p))
+#define XPC_DISALLOW_HB(_p, _v)	(_v)->heartbeating_to_mask &= (~(1UL << (_p)))
+
+
+/*
+ * Reserved Page provided by SAL.
+ *
+ * SAL provides one page per partition of reserved memory.  When SAL
+ * initialization is complete, SAL_signature, SAL_version, partid,
+ * part_nasids, and mach_nasids are set.
+ *
+ * Note: Until vars_pa is set, the partition XPC code has not been initialized.
+ */
+struct xpc_rsvd_page {
+	u64 SAL_signature;	/* SAL unique signature */
+	u64 SAL_version;	/* SAL specified version */
+	u8 partid;		/* partition ID from SAL */
+	u8 version;
+	u8 pad[6];		/* pad to u64 align */
+	u64 vars_pa;
+	u64 part_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned;
+	u64 mach_nasids[XP_NASID_MASK_WORDS] ____cacheline_aligned;
+};
+#define XPC_RP_VERSION _XPC_VERSION(1,0) /* version 1.0 of the reserved page */
+
+#define XPC_RSVD_PAGE_ALIGNED_SIZE \
+			(L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page)))
+
+
+/*
+ * Define the structures by which XPC variables can be exported to other
+ * partitions. (There are two: struct xpc_vars and struct xpc_vars_part)
+ */
+
+/*
+ * The following structure describes the partition generic variables
+ * needed by other partitions in order to properly initialize.
+ *
+ * struct xpc_vars version number also applies to struct xpc_vars_part.
+ * Changes to either structure and/or related functionality should be
+ * reflected by incrementing either the major or minor version numbers
+ * of struct xpc_vars.
+ */
+struct xpc_vars {
+	u8 version;
+	u64 heartbeat;
+	u64 heartbeating_to_mask;
+	u64 kdb_status;		/* 0 = machine running */
+	int act_nasid;
+	int act_phys_cpuid;
+	u64 vars_part_pa;
+	u64 amos_page_pa;	/* paddr of page of AMOs from MSPEC driver */
+	AMO_t *amos_page;	/* vaddr of page of AMOs from MSPEC driver */
+	AMO_t *act_amos;	/* pointer to the first activation AMO */
+};
+#define XPC_V_VERSION _XPC_VERSION(3,0) /* version 3.0 of the cross vars */
+
+#define XPC_VARS_ALIGNED_SIZE  (L1_CACHE_ALIGN(sizeof(struct xpc_vars)))
+
+/*
+ * The following structure describes the per partition specific variables.
+ *
+ * An array of these structures, one per partition, will be defined. As a
+ * partition becomes active XPC will copy the array entry corresponding to
+ * itself from that partition. It is desirable that the size of this
+ * structure evenly divide into a cacheline, such that none of the entries
+ * in this array crosses a cacheline boundary. As it is now, each entry
+ * occupies half a cacheline.
+ */
+struct xpc_vars_part {
+	u64 magic;
+
+	u64 openclose_args_pa;	/* physical address of open and close args */
+	u64 GPs_pa;		/* physical address of Get/Put values */
+
+	u64 IPI_amo_pa;		/* physical address of IPI AMO_t structure */
+	int IPI_nasid;		/* nasid of where to send IPIs */
+	int IPI_phys_cpuid;	/* physical CPU ID of where to send IPIs */
+
+	u8 nchannels;		/* #of defined channels supported */
+
+	u8 reserved[23];	/* pad to a full 64 bytes */
+};
+
+/*
+ * The vars_part MAGIC numbers play a part in the first contact protocol.
+ *
+ * MAGIC1 indicates that the per partition specific variables for a remote
+ * partition have been initialized by this partition.
+ *
+ * MAGIC2 indicates that this partition has pulled the remote partititions
+ * per partition variables that pertain to this partition.
+ */
+#define XPC_VP_MAGIC1	0x0053524156435058L  /* 'XPCVARS\0'L (little endian) */
+#define XPC_VP_MAGIC2	0x0073726176435058L  /* 'XPCvars\0'L (little endian) */
+
+
+
+/*
+ * Functions registered by add_timer() or called by kernel_thread() only
+ * allow for a single 64-bit argument. The following macros can be used to
+ * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
+ * the passed argument.
+ */
+#define XPC_PACK_ARGS(_arg1, _arg2) \
+			((((u64) _arg1) & 0xffffffff) | \
+			((((u64) _arg2) & 0xffffffff) << 32))
+
+#define XPC_UNPACK_ARG1(_args)	(((u64) _args) & 0xffffffff)
+#define XPC_UNPACK_ARG2(_args)	((((u64) _args) >> 32) & 0xffffffff)
+
+
+
+/*
+ * Define a Get/Put value pair (pointers) used with a message queue.
+ */
+struct xpc_gp {
+	s64 get;	/* Get value */
+	s64 put;	/* Put value */
+};
+
+#define XPC_GP_SIZE \
+		L1_CACHE_ALIGN(sizeof(struct xpc_gp) * XPC_NCHANNELS)
+
+
+
+/*
+ * Define a structure that contains arguments associated with opening and
+ * closing a channel.
+ */
+struct xpc_openclose_args {
+	u16 reason;		/* reason why channel is closing */
+	u16 msg_size;		/* sizeof each message entry */
+	u16 remote_nentries;	/* #of message entries in remote msg queue */
+	u16 local_nentries;	/* #of message entries in local msg queue */
+	u64 local_msgqueue_pa;	/* physical address of local message queue */
+};
+
+#define XPC_OPENCLOSE_ARGS_SIZE \
+	      L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * XPC_NCHANNELS)
+
+
+
+/* struct xpc_msg flags */
+
+#define	XPC_M_DONE		0x01	/* msg has been received/consumed */
+#define	XPC_M_READY		0x02	/* msg is ready to be sent */
+#define	XPC_M_INTERRUPT		0x04	/* send interrupt when msg consumed */
+
+
+#define XPC_MSG_ADDRESS(_payload) \
+		((struct xpc_msg *)((u8 *)(_payload) - XPC_MSG_PAYLOAD_OFFSET))
+
+
+
+/*
+ * Defines notify entry.
+ *
+ * This is used to notify a message's sender that their message was received
+ * and consumed by the intended recipient.
+ */
+struct xpc_notify {
+	struct semaphore sema;		/* notify semaphore */
+	u8 type;			/* type of notification */
+
+	/* the following two fields are only used if type == XPC_N_CALL */
+	xpc_notify_func func;		/* user's notify function */
+	void *key;			/* pointer to user's key */
+};
+
+/* struct xpc_notify type of notification */
+
+#define	XPC_N_CALL		0x01	/* notify function provided by user */
+
+
+
+/*
+ * Define the structure that manages all the stuff required by a channel. In
+ * particular, they are used to manage the messages sent across the channel.
+ *
+ * This structure is private to a partition, and is NOT shared across the
+ * partition boundary.
+ *
+ * There is an array of these structures for each remote partition. It is
+ * allocated at the time a partition becomes active. The array contains one
+ * of these structures for each potential channel connection to that partition.
+ *
+ * Each of these structures manages two message queues (circular buffers).
+ * They are allocated at the time a channel connection is made. One of
+ * these message queues (local_msgqueue) holds the locally created messages
+ * that are destined for the remote partition. The other of these message
+ * queues (remote_msgqueue) is a locally cached copy of the remote partition's
+ * own local_msgqueue.
+ *
+ * The following is a description of the Get/Put pointers used to manage these
+ * two message queues. Consider the local_msgqueue to be on one partition
+ * and the remote_msgqueue to be its cached copy on another partition. A
+ * description of what each of the lettered areas contains is included.
+ *
+ *
+ *                     local_msgqueue      remote_msgqueue
+ *
+ *                        |/////////|      |/////////|
+ *    w_remote_GP.get --> +---------+      |/////////|
+ *                        |    F    |      |/////////|
+ *     remote_GP.get  --> +---------+      +---------+ <-- local_GP->get
+ *                        |         |      |         |
+ *                        |         |      |    E    |
+ *                        |         |      |         |
+ *                        |         |      +---------+ <-- w_local_GP.get
+ *                        |    B    |      |/////////|
+ *                        |         |      |////D////|
+ *                        |         |      |/////////|
+ *                        |         |      +---------+ <-- w_remote_GP.put
+ *                        |         |      |////C////|
+ *      local_GP->put --> +---------+      +---------+ <-- remote_GP.put
+ *                        |         |      |/////////|
+ *                        |    A    |      |/////////|
+ *                        |         |      |/////////|
+ *     w_local_GP.put --> +---------+      |/////////|
+ *                        |/////////|      |/////////|
+ *
+ *
+ *	    ( remote_GP.[get|put] are cached copies of the remote
+ *	      partition's local_GP->[get|put], and thus their values can
+ *	      lag behind their counterparts on the